RE: [PATCH] intel_iommu: Use the latest fault reasons defined by spec

2024-05-19 Thread Liu, Yi L
> From: Duan, Zhenzhong 
> Sent: Monday, May 20, 2024 11:41 AM
> 
> 
> 
> >-Original Message-
> >From: Jason Wang 
> >Sent: Monday, May 20, 2024 8:44 AM
> >To: Duan, Zhenzhong 
> >Cc: qemu-devel@nongnu.org; Liu, Yi L ; Peng, Chao P
> >; Yu Zhang ; Michael
> >S. Tsirkin ; Paolo Bonzini ;
> >Richard Henderson ; Eduardo Habkost
> >; Marcel Apfelbaum 
> >Subject: Re: [PATCH] intel_iommu: Use the latest fault reasons defined by
> >spec
> >
> >On Fri, May 17, 2024 at 6:26 PM Zhenzhong Duan
> > wrote:
> >>
> >> From: Yu Zhang 
> >>
> >> Currently we use only VTD_FR_PASID_TABLE_INV as fault reason.
> >> Update with more detailed fault reasons listed in VT-d spec 7.2.3.
> >>
> >> Signed-off-by: Yu Zhang 
> >> Signed-off-by: Zhenzhong Duan 
> >> ---
> >
> >I wonder if this could be noticed by the guest or not. If yes should
> >we consider starting to add thing like version to vtd emulation code?
>
> Kernel only dumps the reason like below:
> 
> DMAR: [DMA Write NO_PASID] Request device [20:00.0] fault addr 0x123460
> [fault reason 0x71] SM: Present bit in first-level paging entry is clear

Yes, the guest kernel would notice it, as the fault would be injected into the VM.

> Maybe bump 1.0 -> 1.1?
> My understanding is that the version number is only informational and far
> from accurate for marking whether a feature is supported. Drivers should
> check the cap/ecap bits instead.
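
What "check cap/ecap bits" looks like in practice, as a minimal sketch; the
bit positions are an assumption here, following the Linux driver's
ecap_smts()/ecap_sc() helpers:

    static bool vtd_ecap_smts(uint64_t ecap)
    {
        return (ecap >> 43) & 0x1; /* Scalable Mode Translation Support */
    }

    static bool vtd_ecap_sc(uint64_t ecap)
    {
        return (ecap >> 7) & 0x1;  /* Snoop Control */
    }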

Should the version ID here be aligned with the VT-d spec? If yes, it should
be 3.0, as scalable mode was introduced in spec 3.0, and the fault
codes were redefined together with the introduction of this translation
mode. Below is a snippet from the change log of the VT-d spec.

June 2018 3.0
• Removed all text related to Extended-Mode.
• Added support for scalable-mode translation for DMA Remapping, which enables
  PASID-granular first-level, second-level, nested and pass-through translation
  functions.
• Widened invalidation queue descriptors and page request queue descriptors
  from 128 bits to 256 bits and redefined page-request and page-response
  descriptors.
• Listed all fault conditions in a unified table and described DMA Remapping
  hardware behavior under each condition. Assigned a new code to each fault
  condition in scalable-mode operation.
• Added support for Accessed/Dirty (A/D) bits in second-level translation.
• Added support for submitting commands and receiving responses from virtual
  DMA Remapping hardware.
• Added a table on snooping behavior and memory type of hardware access to
  various remapping structures as an appendix.
• Moved Page Request Overflow (PRO) fault reporting from the Fault Status
  register (FSTS_REG) to the Page Request Status register (PRS_REG).

Regards,
Yi Liu


RE: [PATCH] intel_iommu: Use the latest fault reasons defined by spec

2024-05-18 Thread Liu, Yi L
> From: CLEMENT MATHIEU--DRIF 
> Sent: Friday, May 17, 2024 9:13 PM
> 
> Hi Zhenzhong
> 
> On 17/05/2024 12:23, Zhenzhong Duan wrote:
> >
> >
> > From: Yu Zhang 
> >
> > Currently we use only VTD_FR_PASID_TABLE_INV as fault reason.
> > Update with more detailed fault reasons listed in VT-d spec 7.2.3.
> >
> > Signed-off-by: Yu Zhang 
> > Signed-off-by: Zhenzhong Duan 
> > ---
> >   hw/i386/intel_iommu_internal.h |  8 +++-
> >   hw/i386/intel_iommu.c  | 25 -
> >   2 files changed, 23 insertions(+), 10 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
> > index f8cf99bddf..666e2cf2ce 100644
> > --- a/hw/i386/intel_iommu_internal.h
> > +++ b/hw/i386/intel_iommu_internal.h
> > @@ -311,7 +311,13 @@ typedef enum VTDFaultReason {
> > * request while disabled */
> >   VTD_FR_IR_SID_ERR = 0x26,   /* Invalid Source-ID */
> >
> > -VTD_FR_PASID_TABLE_INV = 0x58,  /*Invalid PASID table entry */
> > +/* PASID directory entry access failure */
> > +VTD_FR_PASID_DIR_ACCESS_ERR = 0x50,
> > +/* The Present(P) field of pasid directory entry is 0 */
> > +VTD_FR_PASID_DIR_ENTRY_P = 0x51,
> > +VTD_FR_PASID_TABLE_ACCESS_ERR = 0x58, /* PASID table entry access 
> > failure */
> > +VTD_FR_PASID_ENTRY_P = 0x59, /* The Present(P) field of pasidt-entry 
> > is 0 */
> s/pasidt/pasid

Per spec, it is a PASID table entry, so Zhenzhong may need to use the same
wording as the line below, e.g. "PASID table entry".

Regards,
Yi Liu

> > +VTD_FR_PASID_TABLE_ENTRY_INV = 0x5b,  /*Invalid PASID table entry */
> >
> >   /* Output address in the interrupt address range for scalable mode */
> >   VTD_FR_SM_INTERRUPT_ADDR = 0x87,
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index cc8e59674e..0951ebb71d 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -771,7 +771,7 @@ static int vtd_get_pdire_from_pdir_table(dma_addr_t
> pasid_dir_base,
> >   addr = pasid_dir_base + index * entry_size;
> >   if (dma_memory_read(&address_space_memory, addr,
> >   pdire, entry_size, MEMTXATTRS_UNSPECIFIED)) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +return -VTD_FR_PASID_DIR_ACCESS_ERR;
> >   }
> >
> >   pdire->val = le64_to_cpu(pdire->val);
> > @@ -789,6 +789,7 @@ static int 
> > vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState
> *s,
> > dma_addr_t addr,
> > VTDPASIDEntry *pe)
> >   {
> > +uint8_t pgtt;
> >   uint32_t index;
> >   dma_addr_t entry_size;
> >   X86IOMMUState *x86_iommu = X86_IOMMU_DEVICE(s);
> > @@ -798,7 +799,7 @@ static int 
> > vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState
> *s,
> >   addr = addr + index * entry_size;
> >   if (dma_memory_read(&address_space_memory, addr,
> >   pe, entry_size, MEMTXATTRS_UNSPECIFIED)) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +return -VTD_FR_PASID_TABLE_ACCESS_ERR;
> >   }
> >   for (size_t i = 0; i < ARRAY_SIZE(pe->val); i++) {
> >   pe->val[i] = le64_to_cpu(pe->val[i]);
> > @@ -806,11 +807,13 @@ static int 
> > vtd_get_pe_in_pasid_leaf_table(IntelIOMMUState
> *s,
> >
> >   /* Do translation type check */
> >   if (!vtd_pe_type_check(x86_iommu, pe)) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +return -VTD_FR_PASID_TABLE_ENTRY_INV;
> >   }
> >
> > -if (!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +pgtt = VTD_PE_GET_TYPE(pe);
> > +if (pgtt == VTD_SM_PASID_ENTRY_SLT &&
> > +!vtd_is_level_supported(s, VTD_PE_GET_LEVEL(pe))) {
> > +return -VTD_FR_PASID_TABLE_ENTRY_INV;
> >   }
> >
> >   return 0;
> > @@ -851,7 +854,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState 
> > *s,
> >   }
> >
> >   if (!vtd_pdire_present(&pdire)) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +return -VTD_FR_PASID_DIR_ENTRY_P;
> >   }
> >
> >   ret = vtd_get_pe_from_pdire(s, pasid, &pdire, pe);
> > @@ -860,7 +863,7 @@ static int vtd_get_pe_from_pasid_table(IntelIOMMUState 
> > *s,
> >   }
> >
> >   if (!vtd_pe_present(pe)) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +return -VTD_FR_PASID_ENTRY_P;
> >   }
> >
> >   return 0;
> > @@ -913,7 +916,7 @@ static int vtd_ce_get_pasid_fpd(IntelIOMMUState *s,
> >   }
> >
> >   if (!vtd_pdire_present(&pdire)) {
> > -return -VTD_FR_PASID_TABLE_INV;
> > +return -VTD_FR_PASID_DIR_ENTRY_P;
> >   }
> >
> >   /*
> > @@ -1770,7 +1773,11 @@ static const bool vtd_qualified_faults[] = {
> >   [VTD_FR_ROOT_ENTRY_RSVD] = false,
> >   

RE: [PATCH] intel_iommu: Optimize out some unnecessary UNMAP calls

2023-05-26 Thread Liu, Yi L
> From: Jason Wang 
> Sent: Friday, May 26, 2023 2:28 PM
> 
> On Fri, May 26, 2023 at 2:22 PM Duan, Zhenzhong
>  wrote:
> >
> >
> > >-Original Message-
> > >From: Peter Xu 
> > >Sent: Thursday, May 25, 2023 9:54 PM
> > >Subject: Re: [PATCH] intel_iommu: Optimize out some unnecessary UNMAP
> > >calls
> > >
> > >On Thu, May 25, 2023 at 11:29:34AM +, Duan, Zhenzhong wrote:
> > >> Hi Peter,
> > >>
> > >> See inline.
> > >> >-Original Message-
> > >> >From: Peter Xu 
> > >> >Sent: Thursday, May 25, 2023 12:59 AM
> > >> >Subject: Re: [PATCH] intel_iommu: Optimize out some unnecessary UNMAP
> > >> >calls
> > >> >
> > >> >Hi, Zhenzhong,
> > >> >
> > >> >On Tue, May 23, 2023 at 04:07:02PM +0800, Zhenzhong Duan wrote:
> > >> >> Commit 63b88968f1 ("intel-iommu: rework the page walk logic") adds
> > >> >> logic to record mapped IOVA ranges so we only need to send MAP or
> > >> >> UNMAP when necessary. But there are still a few corner cases of
> > >> >unnecessary UNMAP.
> > >> >>
> > >> >> One is address space switch. During switching to iommu address
> > >> >> space, all the original mappings have been dropped by VFIO memory
> > >> >> listener, we don't need to unmap again in replay. The other is
> > >> >> invalidation, we only need to unmap when there are recorded mapped
> > >> >> IOVA ranges, presuming most OSes allocate IOVA ranges
> > >> >> continuously, e.g. on x86, Linux sets up mappings from 0x
> > >downwards.
> > >> >>
> > >> >> Signed-off-by: Zhenzhong Duan 
> > >> >> ---
> > >> >> Tested on x86 with a net card passed or hotpluged to kvm guest,
> > >> >> ping/ssh pass.
> > >> >
> > >> >Since this is a performance related patch, do you have any number to
> > >> >show the effect?
> > >>
> > >> I straced the UNMAP ioctl; each call takes 0.14us, and we have
> > >> 28 ioctl() calls because the two notifiers on x86 are split into
> > >> power-of-2 pieces.
> > >>
> > >> ioctl(48, VFIO_DEVICE_QUERY_GFX_PLANE or VFIO_IOMMU_UNMAP_DMA,
> > >> 0x7d5c42f0) = 0 <0.14>
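
A command along these lines reproduces such a measurement (illustrative;
-T prints the time spent in each syscall):

    strace -T -e trace=ioctl -p <qemu-pid>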
> > >
> > >Could you add some information like this into the commit message when
> > >repost?  E.g. UNMAP was xxx sec before, and this patch reduces it to yyy.
> > Sure, will do.
> >
> > >
> > >>
> > >> >
> > >> >>
> > >> >>  hw/i386/intel_iommu.c | 31 ++-
> > >> >>  1 file changed, 14 insertions(+), 17 deletions(-)
> > >> >>
> > >> >> diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c index
> > >> >> 94d52f4205d2..6afd6428 100644
> > >> >> --- a/hw/i386/intel_iommu.c
> > >> >> +++ b/hw/i386/intel_iommu.c
> > >> >> @@ -3743,6 +3743,7 @@ static void
> > >> >vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
> > >> >>  hwaddr start = n->start;
> > >> >>  hwaddr end = n->end;
> > >> >>  IntelIOMMUState *s = as->iommu_state;
> > >> >> +IOMMUTLBEvent event;
> > >> >>  DMAMap map;
> > >> >>
> > >> >>  /*
> > >> >> @@ -3762,22 +3763,25 @@ static void
> > >> >vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n)
> > >> >>  assert(start <= end);
> > >> >>  size = remain = end - start + 1;
> > >> >>
> > >> >> +event.type = IOMMU_NOTIFIER_UNMAP;
> > >> >> +event.entry.target_as = &address_space_memory;
> > >> >> +event.entry.perm = IOMMU_NONE;
> > >> >> +/* This field is meaningless for unmap */
> > >> >> +event.entry.translated_addr = 0;
> > >> >> +
> > >> >>  while (remain >= VTD_PAGE_SIZE) {
> > >> >> -IOMMUTLBEvent event;
> > >> >>  uint64_t mask = dma_aligned_pow2_mask(start, end, 
> > >> >> s->aw_bits);
> > >> >>  uint64_t size = mask + 1;
> > >> >>
> > >> >>  assert(size);
> > >> >>
> > >> >> -event.type = IOMMU_NOTIFIER_UNMAP;
> > >> >> -event.entry.iova = start;
> > >> >> -event.entry.addr_mask = mask;
> > >> >> -event.entry.target_as = &address_space_memory;
> > >> >> -event.entry.perm = IOMMU_NONE;
> > >> >> -/* This field is meaningless for unmap */
> > >> >> -event.entry.translated_addr = 0;
> > >> >> -
> > >> >> -memory_region_notify_iommu_one(n, &event);
> > >> >> +map.iova = start;
> > >> >> +map.size = size;
> > >> >> +if (iova_tree_find(as->iova_tree, &map)) {
> > >> >> +event.entry.iova = start;
> > >> >> +event.entry.addr_mask = mask;
> > >> >> +memory_region_notify_iommu_one(n, &event);
> > >> >> +}
> > >> >
> > >> >This one looks fine to me, but I'm not sure how much benefit we'll
> > >> >get here either as this path should be rare afaiu.
> > >>
> > >> Yes, I only see such UNMAP call at cold bootup/shutdown, hot plug and
> > >unplug.
> > >>
> > >> In fact, the other purpose of this patch is to eliminate the noisy error
> > >> log when we work with IOMMUFD. It looks like the duplicate UNMAP call
> > >> fails with IOMMUFD while it always succeeds with the legacy container.
> > >> This behavior difference leads to the below error log for IOMMUFD:
> 
> A dumb question, should IOMMUFD 

Re: [PATCH 3/3] intel-iommu: PASID support

2022-01-14 Thread Liu Yi L

On 2022/1/14 15:22, Jason Wang wrote:

On Fri, Jan 14, 2022 at 3:13 PM Peter Xu  wrote:


On Fri, Jan 14, 2022 at 01:58:07PM +0800, Jason Wang wrote:

Right, but I think you meant to do this only when scalable mode is disabled.


Yes, IMHO it will definitely suit the !scalable case, since that's exactly what
we did before.  What I'm also wondering is: even if scalable is enabled but no
"real" pasid is used, so that all the translations go through the default pasid
stored in the device context entry, then maybe we can ignore checking it.
The latter is the "hacky" part mentioned above.


The problem I see is that we can't know what PASID is used as default
without reading the context entry?


Can the default NO_PASID be used in a mixture with !NO_PASID use cases on the
same device?  If that's possible, then I agree..


My understanding is that it is possible.



My previous idea should be based on the fact that if NO_PASID is used on one
device, then all translations will be based on NO_PASID, but now I'm not sure
of it.


Actually, what I meant is:

device 1 using transactions without PASID with RID2PASID 1
device 2 using transactions without PASID with RID2PASID 2


Interesting series, Jason.

haven't read through all your code yet. Just a quick comment. The
RID2PASID1 and RID2PASID2 may be the same one. The VT-d spec has defined an
RPS bit in the ecap register. If it is reported as 0, that means the RID_PASID
(previously called RID2PASID :-)) field of the scalable mode context
entry is not supported, and a PASID value of 0 will be used for transactions
without PASID. So in the code, you may check the RPS bit to see if the
RID_PASID values are the same for all devices.
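
A rough sketch of that check; the RPS bit position is an assumption taken
from the Linux driver's ecap_rps() helper (bit 49):

    /* If RPS is 0, hardware ignores the RID_PASID field and uses PASID 0
     * for all requests-without-PASID, so every device shares one value. */
    static inline bool vtd_ecap_rps(uint64_t ecap)
    {
        return (ecap >> 49) & 0x1;
    }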


Regards,
Yi Liu


Then we can't assume a default pasid here.







The other thing to mention is, if we postpone the iotlb lookup until after the
context entry lookup, then logically we can have a per-device iotlb; that means
we can replace IntelIOMMUState.iotlb with VTDAddressSpace.iotlb in the future,
too, which can also be more efficient.


Right but we still need to limit the total slots and ATS is a better
way to deal with the IOTLB bottleneck actually.


I think it depends on how the iotlb ghash is implemented.  Logically I think if
we can split the cache to per-device it'll be slightly better, because we don't
need to iterate over the iotlbs of other devices when looking up anymore;
meanwhile each iotlb entry takes less space too (no devfn needed anymore).


So we've already used sid in the IOTLB hash, I wonder how much we can
gain from this.

Thanks
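
For context, the sid participation mentioned above looks roughly like the
IOTLB key composition in hw/i386/intel_iommu.c (a sketch, not a verbatim
copy; a per-device cache would drop the sid component):

    static uint64_t vtd_get_iotlb_key(uint64_t gfn, uint16_t source_id,
                                      uint32_t level)
    {
        return gfn | ((uint64_t)(source_id) << VTD_IOTLB_SID_SHIFT) |
               ((uint64_t)(level) << VTD_IOTLB_LVL_SHIFT);
    }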



Thanks,

--
Peter Xu





--
Regards,
Yi Liu



RE: [PATCH] docs: fix section numbering in pcie.txt

2021-11-30 Thread Liu, Yi L
> From: Qemu-devel 
> On Behalf Of Ani Sinha
> Sent: Wednesday, December 1, 2021 2:43 PM
> 
> There is no 5.2 section. Section 5.3 should really be 5.2. Fix it.

Reviewed-by: Liu Yi L 

BTW, is a Fixes tag needed?

Regards,
Yi Liu

> Signed-off-by: Ani Sinha 
> ---
>  docs/pcie.txt | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/docs/pcie.txt b/docs/pcie.txt
> index 89e3502075..90310b0c5e 100644
> --- a/docs/pcie.txt
> +++ b/docs/pcie.txt
> @@ -262,7 +262,7 @@ PCI Express Root Ports (and PCI Express Downstream
> Ports).
>  Port, which may come handy for hot-plugging another device.
> 
> 
> -5.3 Hot-plug example:
> +5.2 Hot-plug example:
>  Using HMP: (add -monitor stdio to QEMU command line)
>device_add <dev>,id=<id>,bus=<PCI Express Root Port Id/PCI Express Downstream Port Id/PCI-PCI Bridge Id/>
> 
> --
> 2.25.1
> 




RE: [PATCH] intel-iommu: ignore SNP bit in scalable mode

2021-11-28 Thread Liu, Yi L
> From: Peter Xu 
> Sent: Monday, November 29, 2021 11:14 AM
> 
> On Mon, Nov 29, 2021 at 10:28:42AM +0800, Jason Wang wrote:
> >
> > And in the future, it could be even more troublesome,e.g there's one
> > day we found another bit that needs not to be checked. Maybe we should
> > even remove all the rsvd bits checks?
> 
> When real hardware sees any of the reserved bits set, it'll bail out and
> raise an error, right?

I think so. The VT-d spec has defined a Non-zero Reserved Field error code for
all the translation structures (root/context/pasid dir/pasid table/page table).
And it makes sense, since any such error indicates a potential
misunderstanding of the spec.
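
The emulation-side check follows the same idea; a sketch in QEMU style:

    /* Reject an entry when any bit in its reserved mask is set, mirroring
     * hardware's "non-zero reserved field" fault condition. */
    static bool vtd_entry_rsvd_bits_ok(uint64_t entry, uint64_t rsvd_mask)
    {
        return (entry & rsvd_mask) == 0;
    }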

> If that's the case, I'm wondering whether we should always follow the
> hardware behavior as an emulator.

I think so. and current virtual Intel IOMMU does a good job to detect the
SNP setting.:)

> Now I'm trying to remember how a spec could normally re-use a bit that
> used to be reserved: should the hardware bump the version in the version reg
> so that software will know what to expect?

Defining a new capability bit is also a way to do it. New software will probe
the capability bit and then program the bit that was reserved but is now
redefined. Old software doesn't know about the new capability bit, so it will
not program the reserved bit.

> So I'm thinking whether the emulator code can identify the version bump by
> "scalable mode enabled", if so we know some resved bits are "ignored" now,
> and IIUC that's mostly the original proposal to add a quirk when scalable mode
> in vtd_init().

do you mean the spec version or?

> But again, I really think it should be the spec owner who should have
> considered all these..

yes, spec owner should consider it.

> e.g. explicitly document "this bit used to be reserved,
> but when scalable mode enabled it's ignored and programmable by the guest
> driver", or something like that.

there is a good example of your above sentence: the root table entry versus
the scalable mode root table entry. In legacy mode, the high 64 bits of the
root table entry are all reserved. In scalable mode, some of the high 64 bits
are used. I think we have defined scalable mode reserved bits macros in the
emulator code.

But with regard to minor changes within a working mode, it may be more common
to define a capability bit when a reserved bit is re-used.
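
A sketch of that mode-dependent mask selection; the macro names and exact
bit layouts here are illustrative, not QEMU's verbatim definitions:

    /* Legacy mode: the whole upper 64 bits of the root entry are reserved.
     * Scalable mode: the upper context-table pointer lives there, so only
     * the remaining bits of the upper half stay reserved. */
    #define VTD_ROOT_ENTRY_RSVD_HI        0xffffffffffffffffULL
    #define VTD_SM_ROOT_ENTRY_RSVD_HI(aw) (0xffeULL | ~VTD_HAW_MASK(aw))

    static uint64_t vtd_root_entry_rsvd_hi(IntelIOMMUState *s)
    {
        return s->root_scalable ? VTD_SM_ROOT_ENTRY_RSVD_HI(s->aw_bits)
                                : VTD_ROOT_ENTRY_RSVD_HI;
    }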

Regards,
Yi Liu


RE: [PATCH] intel-iommu: ignore SNP bit in scalable mode

2021-11-27 Thread Liu, Yi L
> From: Peter Xu 
> Sent: Thursday, November 25, 2021 2:14 PM
> 
> On Thu, Nov 25, 2021 at 05:49:38AM +0000, Liu, Yi L wrote:
> > > From: Peter Xu 
> > > Sent: Thursday, November 25, 2021 12:31 PM
> > >
> > > On Thu, Nov 25, 2021 at 04:03:34AM +, Liu, Yi L wrote:
> > > > > From: Peter Xu 
> > > > > Sent: Wednesday, November 24, 2021 3:57 PM
> > > > >
> > > > > On Wed, Nov 24, 2021 at 02:03:09PM +0800, Jason Wang wrote:
> > > > > > When booting with scalable mode, I hit this error:
> > > > > >
> > > > > > qemu-system-x86_64: vtd_iova_to_slpte: detected splte reserve
> non-
> > > > > zero iova=0xf002, level=0x1slpte=0x102681803)
> > > > > > qemu-system-x86_64: vtd_iommu_translate: detected translation
> > > failure
> > > > > (dev=01:00:00, iova=0xf002)
> > > > > > qemu-system-x86_64: New fault is not recorded due to
> compression
> > > of
> > > > > faults
> > > > > >
> > > > > > This is because the SNP bit is set since Linux kernel commit
> > > > > > 6c00612d0cba1 ("iommu/vt-d: Report right snoop capability when
> > > using
> > > > > > FL for IOVA") where SNP bit is set if scalable mode is on though 
> > > > > > this
> > > > > > seems to be a violation of the spec, which said the SNP bit is
> > > > > > considered to be reserved if SC is not supported.
> > > > >
> > > > > When I was reading that commit, I was actually confused by this
> change:
> > > > >
> > > > > ---8<---
> > > > > diff --git a/drivers/iommu/intel/iommu.c
> > > b/drivers/iommu/intel/iommu.c
> > > > > index 956a02eb40b4..0ee5f1bd8af2 100644
> > > > > --- a/drivers/iommu/intel/iommu.c
> > > > > +++ b/drivers/iommu/intel/iommu.c
> > > > > @@ -658,7 +658,14 @@ static int
> > > domain_update_iommu_snooping(struct
> > > > > intel_iommu *skip)
> > > > > rcu_read_lock();
> > > > > for_each_active_iommu(iommu, drhd) {
> > > > > if (iommu != skip) {
> > > > > -   if (!ecap_sc_support(iommu->ecap)) {
> > > > > +   /*
> > > > > +* If the hardware is operating in the 
> > > > > scalable mode,
> > > > > +* the snooping control is always supported 
> > > > > since we
> > > > > +* always set PASID-table-entry.PGSNP bit if 
> > > > > the domain
> > > > > +* is managed outside (UNMANAGED).
> > > > > +*/
> > > > > +   if (!sm_supported(iommu) &&
> > > > > +   !ecap_sc_support(iommu->ecap)) {
> > > > > ret = 0;
> > > > > break;
> > > > > }
> > > > > ---8<---
> > > > >
> > > > > Does it mean that for some hardwares that has
> sm_supported()==true,
> > > it'll
> > > > > have  SC bit cleared in ecap register?  That sounds odd, and not sure
> why.
> > > Maybe
> > > > > Yi Liu or Yi Sun may know?
> > > >
> > > > scalable mode has no dependency on SC, so it's possible.
> > >
> > > I see; thanks, Yi.
> > >
> > > However then OTOH I don't understand above comment
> > >
> > >   "If the hardware is operating in the scalable mode, the snooping control
> is
> > >always supported since... "
> > >
> > > Because the current qemu vt-d emulation should fall into the case that Yi
> > > mentioned - we support initial scalable mode but no SC yet..
> >
> > chapter 3.9 of 3.2 spec says below.
> >
> > “If the remapping hardware is setup in scalable-mode
> (RTADDR_REG.TTM=01b)
> > and the Page Snoop (PGSNP) field in PASID-table entry is Set, access to
> the
> > final page is snooped.”
> >
> > It means the PGSNP field is available under scalable mode. And spec also
> > says below in chapter 9.6 of the 3.2 spec.
> >
> > "Requests snoop processor caches irrespective of, other attributes in the
>

RE: [PATCH] intel-iommu: ignore SNP bit in scalable mode

2021-11-24 Thread Liu, Yi L
> From: Jason Wang 
> Sent: Wednesday, November 24, 2021 4:29 PM
> 
> On Wed, Nov 24, 2021 at 3:57 PM Peter Xu  wrote:
> >
> > On Wed, Nov 24, 2021 at 02:03:09PM +0800, Jason Wang wrote:
> > > When booting with scalable mode, I hit this error:
> > >
> > > qemu-system-x86_64: vtd_iova_to_slpte: detected splte reserve non-
> zero iova=0xf002, level=0x1slpte=0x102681803)
> > > qemu-system-x86_64: vtd_iommu_translate: detected translation
> failure (dev=01:00:00, iova=0xf002)
> > > qemu-system-x86_64: New fault is not recorded due to compression of
> faults
> > >
> > > This is because the SNP bit is set since Linux kernel commit
> > > 6c00612d0cba1 ("iommu/vt-d: Report right snoop capability when using
> > > FL for IOVA") where SNP bit is set if scalable mode is on though this
> > > seems to be a violation of the spec, which said the SNP bit is
> > > considered to be reserved if SC is not supported.
> >
> > When I was reading that commit, I was actually confused by this change:
> >
> > ---8<---
> > diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> > index 956a02eb40b4..0ee5f1bd8af2 100644
> > --- a/drivers/iommu/intel/iommu.c
> > +++ b/drivers/iommu/intel/iommu.c
> > @@ -658,7 +658,14 @@ static int
> domain_update_iommu_snooping(struct intel_iommu *skip)
> > rcu_read_lock();
> > for_each_active_iommu(iommu, drhd) {
> > if (iommu != skip) {
> > -   if (!ecap_sc_support(iommu->ecap)) {
> > +   /*
> > +* If the hardware is operating in the scalable 
> > mode,
> > +* the snooping control is always supported since we
> > +* always set PASID-table-entry.PGSNP bit if the 
> > domain
> > +* is managed outside (UNMANAGED).
> > +*/
> > +   if (!sm_supported(iommu) &&
> > +   !ecap_sc_support(iommu->ecap)) {
> > ret = 0;
> > break;
> > }
> > ---8<---
> >
> > Does it mean that for some hardwares that has sm_supported()==true,
> it'll have
> > SC bit cleared in ecap register?
> 
> I guess not, so it's probably only the problem of vIOMMU.
> 
> > That sounds odd, and not sure why.  Maybe Yi
> > Liu or Yi Sun may know?
> 
> Another interesting point is that, it looks to me after that commit
> SNP is used for the domain that is not UNMANAGED even if PGSNP is not
> set.

Per spec, if PGSNP is set, the final page access is snooped.
If it's not set, then it's up to other bits to decide. For details, you may
refer to Table 6 of chapter 3.9 in the VT-d 3.2 spec.

Regards,
Yi Liu




RE: [PATCH] intel-iommu: ignore SNP bit in scalable mode

2021-11-24 Thread Liu, Yi L
> From: Jason Wang 
> Sent: Wednesday, November 24, 2021 5:35 PM
> 
> On Wed, Nov 24, 2021 at 5:23 PM Peter Xu  wrote:
> >
> > On Wed, Nov 24, 2021 at 05:01:42PM +0800, Jason Wang wrote:
> > > > > > > -static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t
> level)
> > > > > > > +static bool vtd_slpte_nonzero_rsvd(IntelIOMMUState *s,
> > > > > > > +   uint64_t slpte, uint32_t 
> > > > > > > level)
> > > > > > >  {
> > > > > > >  uint64_t rsvd_mask = vtd_spte_rsvd[level];
> > > > > > >
> > > > > > > @@ -979,6 +980,10 @@ static bool
> vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
> > > > > > >  rsvd_mask = vtd_spte_rsvd_large[level];
> > > > > > >  }
> > > > > > >
> > > > > > > +if (s->scalable_mode) {
> > > > > > > +rsvd_mask &= ~VTD_SPTE_SNP;
> > > > > > > +}
> > > > > >
> > > > > > IMHO what we want to do is only to skip the leaves of pgtables on
> SNP, so maybe
> > > > > > we still want to keep checking the bit 11 reserved for e.g. common
> pgtable dir
> > > > > > entries?
> > >
> > > Maybe, but it's probably a question that can only be answered by
> > > Intel. I can change it for the next version if you stick to it.
> >
> > I'm reading vtd spec v3.1 (June 2019) here, and chap 9.8 told me they're
> > reserved bits for pgdir entries, as no SNP bit defined on pgdir entries.
> 
> Yes, you're right.

yeah. The SNP bit is only available in the leaf paging entry, e.g. for 4KB
pages the SNP bit is in the PTE, but for 2MB pages the SNP bit is in the PDE,
etc.

Regards,
Yi Liu
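
A minimal sketch of the leaf-only variant discussed above, assuming the
existing vtd_spte_rsvd[]/vtd_spte_rsvd_large[] masks and that VTD_SPTE_SNP
is bit 11; only entries that can be leaves drop SNP from their reserved
mask, so non-leaf directory entries keep bit 11 checked:

    #define VTD_SPTE_SNP (1ULL << 11)

    static void vtd_maybe_ignore_snp(IntelIOMMUState *s)
    {
        if (!s->scalable_mode) {
            return;
        }
        vtd_spte_rsvd[1] &= ~VTD_SPTE_SNP;       /* 4KiB PTE leaf */
        vtd_spte_rsvd_large[2] &= ~VTD_SPTE_SNP; /* 2MiB PDE leaf */
        vtd_spte_rsvd_large[3] &= ~VTD_SPTE_SNP; /* 1GiB PDPE leaf */
    }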



RE: [PATCH] intel-iommu: ignore SNP bit in scalable mode

2021-11-24 Thread Liu, Yi L
> From: Peter Xu 
> Sent: Thursday, November 25, 2021 12:31 PM
> 
> On Thu, Nov 25, 2021 at 04:03:34AM +0000, Liu, Yi L wrote:
> > > From: Peter Xu 
> > > Sent: Wednesday, November 24, 2021 3:57 PM
> > >
> > > On Wed, Nov 24, 2021 at 02:03:09PM +0800, Jason Wang wrote:
> > > > When booting with scalable mode, I hit this error:
> > > >
> > > > qemu-system-x86_64: vtd_iova_to_slpte: detected splte reserve non-
> > > zero iova=0xf002, level=0x1slpte=0x102681803)
> > > > qemu-system-x86_64: vtd_iommu_translate: detected translation
> failure
> > > (dev=01:00:00, iova=0xf002)
> > > > qemu-system-x86_64: New fault is not recorded due to compression
> of
> > > faults
> > > >
> > > > This is because the SNP bit is set since Linux kernel commit
> > > > 6c00612d0cba1 ("iommu/vt-d: Report right snoop capability when
> using
> > > > FL for IOVA") where SNP bit is set if scalable mode is on though this
> > > > seems to be a violation of the spec, which said the SNP bit is
> > > > considered to be reserved if SC is not supported.
> > >
> > > When I was reading that commit, I was actually confused by this change:
> > >
> > > ---8<---
> > > diff --git a/drivers/iommu/intel/iommu.c
> b/drivers/iommu/intel/iommu.c
> > > index 956a02eb40b4..0ee5f1bd8af2 100644
> > > --- a/drivers/iommu/intel/iommu.c
> > > +++ b/drivers/iommu/intel/iommu.c
> > > @@ -658,7 +658,14 @@ static int
> domain_update_iommu_snooping(struct
> > > intel_iommu *skip)
> > > rcu_read_lock();
> > > for_each_active_iommu(iommu, drhd) {
> > > if (iommu != skip) {
> > > -   if (!ecap_sc_support(iommu->ecap)) {
> > > +   /*
> > > +* If the hardware is operating in the scalable 
> > > mode,
> > > +* the snooping control is always supported since 
> > > we
> > > +* always set PASID-table-entry.PGSNP bit if the 
> > > domain
> > > +* is managed outside (UNMANAGED).
> > > +*/
> > > +   if (!sm_supported(iommu) &&
> > > +   !ecap_sc_support(iommu->ecap)) {
> > > ret = 0;
> > > break;
> > > }
> > > ---8<---
> > >
> > > Does it mean that for some hardwares that has sm_supported()==true,
> it'll
> > > have  SC bit cleared in ecap register?  That sounds odd, and not sure why.
> Maybe
> > > Yi Liu or Yi Sun may know?
> >
> > scalable mode has no dependency on SC, so it's possible.
> 
> I see; thanks, Yi.
> 
> However then OTOH I don't understand above comment
> 
>   "If the hardware is operating in the scalable mode, the snooping control is
>always supported since... "
> 
> Because the current qemu vt-d emulation should fall into the case that Yi
> mentioned - we support initial scalable mode but no SC yet..

chapter 3.9 of 3.2 spec says below.

“If the remapping hardware is setup in scalable-mode (RTADDR_REG.TTM=01b)
and the Page Snoop (PGSNP) field in PASID-table entry is Set, access to the
final page is snooped.”

It means the PGSNP field is available under scalable mode. And the spec also
says the following in chapter 9.6 of the 3.2 spec.

"Requests snoop processor caches irrespective of, other attributes in the
request or other fields in paging structure entries used to translate the
request."

It means the PGSNP field of the PASID table entry is the first-class control
of the snoop behaviour. It also means scalable mode has snoop control by
default. ^_^ So the comment in the above commit is correct, since the policy
of the Intel IOMMU driver is to always set the PGSNP bit.
But the spec is not so clear. Will reach out to make it clearer in the
spec. Thanks for catching it. :-)

Regards,
Yi Liu
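
In other words, a simplified decode of VT-d 3.2 chapter 3.9 (Table 6),
ignoring memory-type and other attributes:

    /* PGSNP in the PASID-table entry is first-class: when set, the access
     * is snooped; when clear, other fields (e.g. the leaf SNP bit in
     * second-level entries) decide.  A sketch, not hardware-exact. */
    static bool vtd_access_snooped(bool pgsnp, bool leaf_snp)
    {
        return pgsnp || leaf_snp;
    }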



RE: [PATCH] intel-iommu: ignore SNP bit in scalable mode

2021-11-24 Thread Liu, Yi L
> From: Peter Xu 
> Sent: Wednesday, November 24, 2021 3:57 PM
> 
> On Wed, Nov 24, 2021 at 02:03:09PM +0800, Jason Wang wrote:
> > When booting with scalable mode, I hit this error:
> >
> > qemu-system-x86_64: vtd_iova_to_slpte: detected splte reserve non-
> zero iova=0xf002, level=0x1slpte=0x102681803)
> > qemu-system-x86_64: vtd_iommu_translate: detected translation failure
> (dev=01:00:00, iova=0xf002)
> > qemu-system-x86_64: New fault is not recorded due to compression of
> faults
> >
> > This is because the SNP bit is set since Linux kernel commit
> > 6c00612d0cba1 ("iommu/vt-d: Report right snoop capability when using
> > FL for IOVA") where SNP bit is set if scalable mode is on though this
> > seems to be a violation of the spec, which said the SNP bit is
> > considered to be reserved if SC is not supported.
> 
> When I was reading that commit, I was actually confused by this change:
> 
> ---8<---
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 956a02eb40b4..0ee5f1bd8af2 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -658,7 +658,14 @@ static int domain_update_iommu_snooping(struct
> intel_iommu *skip)
> rcu_read_lock();
> for_each_active_iommu(iommu, drhd) {
> if (iommu != skip) {
> -   if (!ecap_sc_support(iommu->ecap)) {
> +   /*
> +* If the hardware is operating in the scalable mode,
> +* the snooping control is always supported since we
> +* always set PASID-table-entry.PGSNP bit if the 
> domain
> +* is managed outside (UNMANAGED).
> +*/
> +   if (!sm_supported(iommu) &&
> +   !ecap_sc_support(iommu->ecap)) {
> ret = 0;
> break;
> }
> ---8<---
> 
> Does it mean that for some hardwares that has sm_supported()==true, it'll
> have  SC bit cleared in ecap register?  That sounds odd, and not sure why.  
> Maybe
> Yi Liu or Yi Sun may know?

scalable mode has no dependency on SC, so it's possible.

> >
> > To unbreak the guest, ignore the SNP bit for scalable mode first. In
> > the future we may consider adding SC support.
> 
> Oh yes, I remembered the last time we discussed this.  Could you remind
> me what's missing for us to support SC?
> 
> IIUC, for common device emulations we can just declare SC==1, right?  As all
> the DMAs (including kernel accels like vhost) will be from host processors so
> there are no coherency issues with guest vcpu threads.
> 
> If that's correct, the only challenge is device assignment in any form (I am
> not familiar with vdpa; so perhaps that includes vfio, vpda and any other
> kind of assigning host devices to guest?), then we'll try to detect 
> IOMMU_CACHE
> capability from the host iommu groups that covers the assigned devices,
> and we only set SC==1 if we have cache coherency on all the devices?

The above looks good to me. The SC bit means the SNP field is available in
leaf paging structures, so we need to check the host side's SC capability for
the assigned devices, then decide whether to set SC or not. Then the guest
iommu driver can set the SNP bit per its policy.
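
A rough sketch of that host-side probe, reusing the existing
VFIO_DMA_CC_IOMMU extension check on the container fd:

    /* Returns true when the host IOMMU backing this container enforces
     * cache coherency for DMA, i.e. it is safe to expose SC==1. */
    static bool vfio_container_coherent(VFIOContainer *container)
    {
        return ioctl(container->fd, VFIO_CHECK_EXTENSION,
                     VFIO_DMA_CC_IOMMU) == 1;
    }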

BTW, there is a discussion on IOMMU_CACHE; it's considered incorrect usage
to link it with no_snoop, so there may be some cleanup later. Anyhow, just
letting you two be aware of it.

https://lore.kernel.org/kvm/20210922234954.gb964...@nvidia.com/

Regards,
Yi Liu

> 
> >
> > Signed-off-by: Jason Wang 
> > ---
> >  hw/i386/intel_iommu.c  | 18 --
> >  hw/i386/intel_iommu_internal.h |  2 ++
> >  2 files changed, 14 insertions(+), 6 deletions(-)
> >
> > diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
> > index 294499ee20..3bcac56c3e 100644
> > --- a/hw/i386/intel_iommu.c
> > +++ b/hw/i386/intel_iommu.c
> > @@ -969,7 +969,8 @@ static dma_addr_t
> vtd_get_iova_pgtbl_base(IntelIOMMUState *s,
> >  static uint64_t vtd_spte_rsvd[5];
> >  static uint64_t vtd_spte_rsvd_large[5];
> >
> > -static bool vtd_slpte_nonzero_rsvd(uint64_t slpte, uint32_t level)
> > +static bool vtd_slpte_nonzero_rsvd(IntelIOMMUState *s,
> > +   uint64_t slpte, uint32_t level)
> >  {
> >  uint64_t rsvd_mask = vtd_spte_rsvd[level];
> >
> > @@ -979,6 +980,10 @@ static bool vtd_slpte_nonzero_rsvd(uint64_t slpte,
> uint32_t level)
> >  rsvd_mask = vtd_spte_rsvd_large[level];
> >  }
> >
> > +if (s->scalable_mode) {
> > +rsvd_mask &= ~VTD_SPTE_SNP;
> > +}
> 
> IMHO what we want to do is only to skip the leaves of pgtables on SNP, so
> maybe
> we still want to keep checking the bit 11 reserved for e.g. common pgtable
> dir
> entries?
> 
> To do so, how about directly modifying the vtd_spte_rsvd* fields in
> vtd_init()?
> I think we only need to modify 

RE: [PATCH] docs: Add '-device intel-iommu' entry

2021-07-07 Thread Liu, Yi L
> From: Peter Xu < pet...@redhat.com >
> Sent: Saturday, June 12, 2021 2:55 AM
> 
> The parameters of the intel-iommu device are non-trivial to understand.  Add
> an entry for it so that people can reference it when using the device.
> 
> There're actually a few more options there, but I hide them explicitly
> because
> they shouldn't be used by normal QEMU users.

yes, it's a good start.

Looks good to me.
Reviewed-by: Liu Yi L 

Regards,
Yi Liu

> 
> Cc: Chao Yang 
> Cc: Lei Yang 
> Cc: Jing Zhao 
> Cc: Jason Wang 
> Cc: Michael S. Tsirkin 
> Cc: Alex Williamson 
> Signed-off-by: Peter Xu 
> ---
>  qemu-options.hx | 32 
>  1 file changed, 32 insertions(+)
> 
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 14258784b3a..4bb04243907 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -926,6 +926,38 @@ SRST
> 
>  ``-device pci-ipmi-bt,bmc=id``
>  Like the KCS interface, but defines a BT interface on the PCI bus.
> +
> +``-device intel-iommu[,option=...]``
> +This is only supported by ``-machine q35``, which will enable Intel VT-d
> +emulation within the guest.  It supports below options:
> +
> +``intremap=on|off`` (default: auto)
> +This enables interrupt remapping feature in the guest.  It's required
> +to enable complete x2apic.  Currently it only supports kvm
> +kernel-irqchip modes ``off`` or ``split``.  Full kernel-irqchip is 
> not
> +yet supported.
> +
> +``caching-mode=on|off`` (default: off)
> +This enables caching mode for the VT-d emulated device.  When
> +caching-mode is enabled, each guest DMA buffer mapping will
> generate an
> +IOTLB invalidation from the guest IOMMU driver to the vIOMMU
> device in
> +a synchronous way.  It is required for ``-device vfio-pci`` to work
> +with the VT-d device, because host assigned devices require the DMA
> +mapping to be set up on the host before guest DMA starts.
> +
> +``device-iotlb=on|off`` (default: off)
> +This enables device-iotlb capability for the emulated VT-d device.  
> So
> +far virtio/vhost should be the only real user for this parameter,
> +paired with ats=on configured for the device.
> +
> +``aw-bits=39|48`` (default: 39)
> +This decides the address width of IOVA address space.  The address
> +space has 39 bits width for 3-level IOMMU page tables, and 48 bits 
> for
> +4-level IOMMU page tables.
> +
> +Please also refer to the wiki page for general scenarios of VT-d
> +emulation in QEMU: https://wiki.qemu.org/Features/VT-d.
> +
>  ERST
> 
>  DEF("name", HAS_ARG, QEMU_OPTION_name,
> --
> 2.31.1
> 
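
For reference, a typical invocation combining these options might look like
the following (illustrative only; adjust the host device address):

    qemu-system-x86_64 -machine q35,kernel-irqchip=split \
        -device intel-iommu,intremap=on,caching-mode=on \
        -device vfio-pci,host=0000:01:00.0 ...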




[RFC v11 23/25] intel_iommu: propagate PASID-based iotlb invalidation to host

2021-03-02 Thread Liu Yi L
This patch propagates PASID-based iotlb invalidation to host.

Intel VT-d 3.0 supports nested translation at PASID granularity.
Guest SVA support could be implemented by configuring nested
translation on a specific PASID. This is also known as dual stage
DMA translation.

Under such a configuration, the guest owns the GVA->GPA translation,
which is configured as the first level page table on the host side
for a specific pasid, and the host owns the GPA->HPA translation. As
the guest owns the first level translation table, piotlb invalidations
should be propagated to the host, since the host IOMMU will cache
first level page table related mappings during DMA address translation.

This patch traps the guest PASID-based iotlb flush and propagates
it to the host.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) removed the valid check to vtd_pasid_as instance as rfcv5 ensures
   all vtd_pasid_as instances in hash table should be valid.
---
 hw/i386/intel_iommu.c  | 113 +
 hw/i386/intel_iommu_internal.h |   7 ++
 2 files changed, 120 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index b709440b15..915db7ad1f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3083,16 +3083,129 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return !pc_info.error_happened ? true : false;
 }
 
+/**
+ * Caller of this function should hold iommu_lock.
+ */
+static void vtd_invalidate_piotlb(IntelIOMMUState *s,
+  VTDBus *vtd_bus,
+  int devfn,
+  struct iommu_cache_invalidate_info *cache)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+goto out;
+}
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+goto out;
+}
+if (host_iommu_ctx_flush_stage1_cache(iommu_ctx, cache)) {
+error_report("Cache flush failed");
+}
+out:
+return;
+}
+
+/**
+ * This function is a per-entry callback for iterating s->vtd_pasid_as,
+ * with VTDPIOTLBInvInfo as the filter. It propagates
+ * the piotlb invalidation to the host. Caller of this function
+ * should hold iommu_lock.
+ */
+static void vtd_flush_pasid_iotlb(gpointer key, gpointer value,
+  gpointer user_data)
+{
+VTDPIOTLBInvInfo *piotlb_info = user_data;
+VTDPASIDAddressSpace *vtd_pasid_as = value;
+VTDPASIDCacheEntry *pc_entry = &vtd_pasid_as->pasid_cache_entry;
+uint16_t did;
+
+did = vtd_pe_get_domain_id(&pc_entry->pasid_entry);
+
+if ((piotlb_info->domain_id == did) &&
+(piotlb_info->pasid == vtd_pasid_as->pasid)) {
+vtd_invalidate_piotlb(vtd_pasid_as->iommu_state,
+  vtd_pasid_as->vtd_bus,
+  vtd_pasid_as->devfn,
+  piotlb_info->cache_info);
+}
+
+/*
+ * TODO: needs to add QEMU piotlb flush when QEMU piotlb
+ * infrastructure is ready. For now, it is enough for passthru
+ * devices.
+ */
+}
+
 static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
 uint16_t domain_id,
 uint32_t pasid)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = IOMMU_CACHE_INV_TYPE_IOTLB;
+cache_info->granularity = IOMMU_INV_GRANU_PASID;
+cache_info->granu.pasid_info.pasid = pasid;
+cache_info->granu.pasid_info.flags = IOMMU_INV_PASID_FLAGS_PASID;
+
+piotlb_info.domain_id = domain_id;
+piotlb_info.pasid = pasid;
+piotlb_info.cache_info = cache_info;
+
+vtd_iommu_lock(s);
+/*
+ * Here loops all the vtd_pasid_as instances in s->vtd_pasid_as
+ * to find out the affected devices since piotlb invalidation
+ * should check pasid cache per architecture point of view.
+ */
+g_hash_table_foreach(s->vtd_pasid_as,
+ vtd_flush_pasid_iotlb, &piotlb_info);
+vtd_iommu_unlock(s);
+g_free(cache_info);
 }
 
 static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
uint32_t pasid, hwaddr addr, uint8_t am,
bool ih)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO

[RFC v11 21/25] vfio: add support for flush iommu stage-1 cache

2021-03-02 Thread Liu Yi L
This patch adds the flush_stage1_cache() definition in HostIOMMUContextClass,
and adds the corresponding implementation in VFIO. This exposes a way for the
vIOMMU to flush the stage-1 cache on the host side, since the guest owns the
stage-1 translation structures in the dual stage DMA translation configuration.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Acked-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/iommu/host_iommu_context.c | 19 +++
 hw/vfio/common.c  | 24 
 include/hw/iommu/host_iommu_context.h |  8 
 3 files changed, 51 insertions(+)

diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
index d7139bcb86..59f5e7af9e 100644
--- a/hw/iommu/host_iommu_context.c
+++ b/hw/iommu/host_iommu_context.c
@@ -69,6 +69,25 @@ int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext 
*iommu_ctx,
 return hicxc->unbind_stage1_pgtbl(iommu_ctx, unbind);
 }
 
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+ struct iommu_cache_invalidate_info *cache)
+{
+HostIOMMUContextClass *hicxc;
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->info->features & IOMMU_NESTING_FEAT_CACHE_INVLD) ||
+!hicxc->flush_stage1_cache) {
+return -EINVAL;
+}
+
+return hicxc->flush_stage1_cache(iommu_ctx, cache);
+}
+
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
  struct iommu_nesting_info *info)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index a12708bcb7..122866fa85 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1623,6 +1623,29 @@ static int 
vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+struct iommu_cache_invalidate_info *cache)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*cache);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_CACHE_INVLD;
+memcpy(&op->data, cache, sizeof(*cache));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: iommu cache flush failed: %m", __func__);
+}
+g_free(op);
+return ret;
+}
+
 /**
 * Get iommu info from host. Caller of this function should free
  * the memory pointed by the returned pointer stored in @info
@@ -2389,6 +2412,7 @@ static void 
vfio_host_iommu_context_class_init(ObjectClass *klass,
 
 hicxc->bind_stage1_pgtbl = vfio_host_iommu_ctx_bind_stage1_pgtbl;
 hicxc->unbind_stage1_pgtbl = vfio_host_iommu_ctx_unbind_stage1_pgtbl;
+hicxc->flush_stage1_cache = vfio_host_iommu_ctx_flush_stage1_cache;
 }
 
 static const TypeInfo vfio_host_iommu_context_info = {
diff --git a/include/hw/iommu/host_iommu_context.h 
b/include/hw/iommu/host_iommu_context.h
index 3498a3e25d..8b1171fbf9 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -55,6 +55,12 @@ typedef struct HostIOMMUContextClass {
 /* Undo a previous bind. @unbind specifies the unbind info. */
 int (*unbind_stage1_pgtbl)(HostIOMMUContext *iommu_ctx,
struct iommu_gpasid_bind_data *unbind);
+/*
+ * Propagate stage-1 cache flush to host IOMMU, cache
+ * info specifid in @cache
+ */
+int (*flush_stage1_cache)(HostIOMMUContext *iommu_ctx,
+  struct iommu_cache_invalidate_info *cache);
 } HostIOMMUContextClass;
 
 /*
@@ -70,6 +76,8 @@ int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext 
*iommu_ctx,
  struct iommu_gpasid_bind_data *bind);
 int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
  struct iommu_gpasid_bind_data *unbind);
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+   struct iommu_cache_invalidate_info *cache);
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
  struct iommu_nesting_info *info);
-- 
2.25.1




[RFC v11 16/25] intel_iommu: process PASID cache invalidation

2021-03-02 Thread Liu Yi L
This patch adds PASID cache invalidation handling. When the guest enables
PASID usage (e.g. SVA), guest software should issue a proper PASID cache
invalidation when caching-mode is exposed. This patch only adds the draft
handling of pasid cache invalidation; detailed handling will be added in
subsequent patches.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) remove vtd_pasid_cache_gsi(), vtd_pasid_cache_psi()
   and vtd_pasid_cache_dsi()
---
 hw/i386/intel_iommu.c  | 40 +-
 hw/i386/intel_iommu_internal.h | 12 ++
 hw/i386/trace-events   |  3 +++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7786f97ed6..c4b0db15cb 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2421,6 +2421,37 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static bool vtd_process_pasid_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+if ((inv_desc->val[0] & VTD_INV_DESC_PASIDC_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PASIDC_RSVD_VAL1) ||
+(inv_desc->val[2] & VTD_INV_DESC_PASIDC_RSVD_VAL2) ||
+(inv_desc->val[3] & VTD_INV_DESC_PASIDC_RSVD_VAL3)) {
+error_report_once("non-zero-field-in-pc_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+switch (inv_desc->val[0] & VTD_INV_DESC_PASIDC_G) {
+case VTD_INV_DESC_PASIDC_DSI:
+break;
+
+case VTD_INV_DESC_PASIDC_PASID_SI:
+break;
+
+case VTD_INV_DESC_PASIDC_GLOBAL:
+break;
+
+default:
+error_report_once("invalid-inv-granu-in-pc_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -2528,12 +2559,11 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
-/*
- * TODO: the entity of below two cases will be implemented in future 
series.
- * To make guest (which integrates scalable mode support patch set in
- * iommu driver) work, just return true is enough so far.
- */
 case VTD_INV_DESC_PC:
+trace_vtd_inv_desc("pasid-cache", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_pasid_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_PIOTLB:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 6abb4836a1..3c8853ab88 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -463,6 +463,18 @@ typedef union VTDInvDesc VTDInvDesc;
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+#define VTD_INV_DESC_PASIDC_G  (3ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID(val) (((val) >> 32) & 0xfULL)
+#define VTD_INV_DESC_PASIDC_DID(val)   (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PASIDC_RSVD_VAL0  0xfff0ffc0ULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL1  0xULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL2  0xULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL3  0xULL
+
+#define VTD_INV_DESC_PASIDC_DSI(0ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
+#define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 71536a7c20..f7cd4e5656 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -22,6 +22,9 @@ vtd_inv_qi_head(uint16_t head) "read head %d"
 vtd_inv_qi_tail(uint16_t head) "write tail %d"
 vtd_inv_qi_fetch(void) ""
 vtd_context_cache_reset(void) ""
+vtd_pasid_cache_gsi(void) ""
+vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 
0x%"PRIx16
+vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC 
invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32
 vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
 vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" 
devfn %"PRIu8" not present"
 vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t 
domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" 
domain 0x%"PRIx16
-- 
2.25.1




[RFC v11 15/25] intel_iommu: add virtual command capability support

2021-03-02 Thread Liu Yi L
This patch adds virtual command support to Intel vIOMMU per
Intel VT-d 3.1 spec. And adds two virtual commands: allocate
pasid and free pasid.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
rfcv10 -> rfcv11:
*) use /dev/ioasid FD for pasid alloc/free, /dev/ioasid open is added in
   a later patch.
---
 hw/i386/intel_iommu.c  | 141 +
 hw/i386/intel_iommu_internal.h |  37 +
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  10 ++-
 4 files changed, 188 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 203c898fa4..7786f97ed6 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -40,6 +40,11 @@
 #include "kvm/kvm_i386.h"
 #include "migration/vmstate.h"
 #include "trace.h"
+#include <sys/ioctl.h>
+#include <linux/ioasid.h>
+
+int ioasid_fd = -1;
+uint32_t ioasid_bits;
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -2678,6 +2683,118 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
 }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t *pasid)
+{
+struct ioasid_alloc_request req;
+int ret;
+
+req.argsz = sizeof(req);
+req.flags = 0;
+req.range.min = VTD_HPASID_MIN;
+req.range.max = VTD_HPASID_MAX;
+
+if (s->ioasid_fd < 0) {
+error_report("%s: No available allocation interface", __func__);
+return -1;
+}
+
+vtd_iommu_lock(s);
+ret = ioctl(s->ioasid_fd, IOASID_REQUEST_ALLOC, &req);
+if (ret < 0) {
+error_report("%s: alloc failed %d", __func__, ret);
+}
+printf("%s, ret: %d\n", __func__, ret);
+vtd_iommu_unlock(s);
+*pasid = ret;
+return (ret < 0) ? ret : 0;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+int ret = -1;
+
+if (s->ioasid_fd < 0) {
+error_report("%s: No available allocation interface", __func__);
+return -1;
+}
+
+vtd_iommu_lock(s);
+ret = ioctl(s->ioasid_fd, IOASID_REQUEST_FREE, &pasid);
+if (ret < 0) {
+error_report("%s: free failed (%m)", __func__);
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+/*
+ * If IP is not set, set it then return.
+ * If IP is already set, return.
+ */
+static void vtd_vcmd_set_ip(IntelIOMMUState *s)
+{
+s->vcrsp = 1;
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
+{
+s->vcrsp &= (~((uint64_t)(0x1)));
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+/* Handle write to Virtual Command Register */
+static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
+{
+uint32_t pasid;
+int ret = -1;
+
+trace_vtd_reg_write_vcmd(s->vcrsp, val);
+
+if (!(s->vccap & VTD_VCCAP_PAS) ||
+ (s->vcrsp & 1)) {
+return -1;
+}
+
+/*
+ * Since the vCPU is blocked when the guest VCMD
+ * write is trapped here, no other vCPUs should
+ * try to access VCMD if the guest software is well written.
+ * However, we still emulate the IP bit here in case of
+ * bad guest software. This also aligns with the spec.
+ */
+vtd_vcmd_set_ip(s);
+
+switch (val & VTD_VCMD_CMD_MASK) {
+case VTD_VCMD_ALLOC_PASID:
+ret = vtd_request_pasid_alloc(s, &pasid);
+if (ret) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+} else {
+s->vcrsp |= VTD_VCRSP_RSLT(pasid);
+}
+break;
+
+case VTD_VCMD_FREE_PASID:
+pasid = VTD_VCMD_PASID_VALUE(val);
+ret = vtd_request_pasid_free(s, pasid);
+if (ret < 0) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+}
+break;
+
+default:
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+error_report_once("Virtual Command: unsupported command!!!");
+break;
+}
+vtd_vcmd_clear_ip(s);
+return 0;
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
 IntelIOMMUState *s = opaque;
@@ -2966,6 +3083,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
 vtd_set_long(s, addr, val);
 break;
 
+case DMAR_VCMD_REG:
+if (!vtd_handle_vcmd_write(s, val)) {
+if (size == 4) {
+vtd_set_long(s, addr, val);
+} else {
+vtd_set_quad(s, addr, val);
+}
+}
+break;
+
+case DMAR_VCMD_REG_HI:
+assert(size == 4);
+if (!vtd_handle_vcmd_write(s, val)) {
+vtd_set_long(s, addr, val);
+}
+break;
+
 defau

[RFC v11 13/25] vfio: init HostIOMMUContext per-container

2021-03-02 Thread Liu Yi L
In this patch, QEMU first gets the iommu info from the kernel to check the
capabilities supported by a VFIO_IOMMU_TYPE1_NESTING iommu, and then inits a
HostIOMMUContext instance.

For vfio-pci devices, it could use pci_device_set/unset_iommu() to
expose the host iommu context to vIOMMU emulators. vIOMMU emulators
could make use of the methods provided by the host iommu context, e.g.
propagating requests to the host iommu.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/common.c  | 135 +-
 hw/vfio/pci.c |  17 +
 include/hw/vfio/vfio-common.h |   1 +
 3 files changed, 118 insertions(+), 35 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 433938c245..a12708bcb7 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1623,41 +1623,11 @@ static int 
vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
-static int vfio_init_container(VFIOContainer *container, int group_fd,
-   bool want_nested, Error **errp)
-{
-int iommu_type, ret;
-
-iommu_type = vfio_get_iommu_type(container, want_nested, errp);
-if (iommu_type < 0) {
-return iommu_type;
-}
-
-ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
-if (ret) {
-error_setg_errno(errp, errno, "Failed to set group container");
-return -errno;
-}
-
-while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
-if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
-/*
- * On sPAPR, despite the IOMMU subdriver always advertises v1 and
- * v2, the running platform may not support v2 and there is no
- * way to guess it until an IOMMU group gets added to the 
container.
- * So in case it fails with v2, try v1 as a fallback.
- */
-iommu_type = VFIO_SPAPR_TCE_IOMMU;
-continue;
-}
-error_setg_errno(errp, errno, "Failed to set iommu for container");
-return -errno;
-}
-
-container->iommu_type = iommu_type;
-return 0;
-}
-
+/**
+ * Get iommu info from host. Caller of this function should free
+ * the memory pointed by the returned pointer stored in @info
+ * after a successful calling when finished its usage.
+ */
 static int vfio_get_iommu_info(VFIOContainer *container,
struct vfio_iommu_type1_info **info)
 {
@@ -1702,6 +1672,101 @@ vfio_get_iommu_info_cap(struct vfio_iommu_type1_info 
*info, uint16_t id)
 return NULL;
 }
 
+static int vfio_get_nesting_iommu_cap(VFIOContainer *container,
+   struct vfio_iommu_type1_info_cap_nesting **cap_nesting)
+{
+struct vfio_iommu_type1_info *info;
+struct vfio_info_cap_header *hdr;
+struct vfio_iommu_type1_info_cap_nesting *cap;
+struct iommu_nesting_info *nest_info;
+int ret;
+uint32_t minsz, cap_size;
+
+ret = vfio_get_iommu_info(container, &info);
+if (ret) {
+return ret;
+}
+
+hdr = vfio_get_iommu_info_cap(info,
+VFIO_IOMMU_TYPE1_INFO_CAP_NESTING);
+if (!hdr) {
+g_free(info);
+return -EINVAL;
+}
+
+cap = container_of(hdr,
+struct vfio_iommu_type1_info_cap_nesting, header);
+
+nest_info = &cap->info;
+minsz = offsetof(struct iommu_nesting_info, vendor);
+if (nest_info->argsz < minsz) {
+g_free(info);
+return -EINVAL;
+}
+
+cap_size = offsetof(struct vfio_iommu_type1_info_cap_nesting, info) +
+   nest_info->argsz;
+*cap_nesting = g_malloc0(cap_size);
+memcpy(*cap_nesting, cap, cap_size);
+
+g_free(info);
+return 0;
+}
+
+static int vfio_init_container(VFIOContainer *container, int group_fd,
+   bool want_nested, Error **errp)
+{
+int iommu_type, ret;
+
+iommu_type = vfio_get_iommu_type(container, want_nested, errp);
+if (iommu_type < 0) {
+return iommu_type;
+}
+
+ret = ioctl(group_fd, VFIO_GROUP_SET_CONTAINER, &container->fd);
+if (ret) {
+error_setg_errno(errp, errno, "Failed to set group container");
+return -errno;
+}
+
+while (ioctl(container->fd, VFIO_SET_IOMMU, iommu_type)) {
+if (iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
+/*
+ * On sPAPR, despite the IOMMU subdriver always advertises v1 and
+ * v2, the running platform may not support v2 and there is no
+ * way to guess it until an IOMMU group gets added to the 
container.
+ * So in case it fails with v2, try v1 as a fallback.
+ */
+iommu_type = VFIO_SPAPR_TCE_IOMMU;
+continue;
+}
+error_setg_errno(errp, errno, "Failed to set iommu for container");
+return -errno;
+ 

[RFC v11 24/25] intel_iommu: process PASID-based Device-TLB invalidation

2021-03-02 Thread Liu Yi L
This patch adds an empty handler for PASID-based Device-TLB
invalidation. For now this is enough: it is not necessary to
propagate the invalidation to the host for passthrough devices,
and no emulated device has a device TLB.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 18 ++
 hw/i386/intel_iommu_internal.h |  1 +
 2 files changed, 19 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 915db7ad1f..932c235f37 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3258,6 +3258,17 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
 return true;
 }
 
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+/*
+ * No need to handle it for passthrough devices; for emulated
+ * devices with a device TLB it may be required, but for now
+ * returning success is enough.
+ */
+return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
@@ -3380,6 +3391,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
+case VTD_INV_DESC_DEV_PIOTLB:
+trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+if (!vtd_process_device_piotlb_desc(s, &inv_desc)) {
+return false;
+}
+break;
+
 case VTD_INV_DESC_DEVICE:
 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
if (!vtd_process_device_iotlb_desc(s, &inv_desc)) {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 7fbdd53b60..be29f3672b 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -405,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_WAIT   0x5 /* Invalidation Wait Descriptor */
 #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
 #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
 #define VTD_INV_DESC_NONE   0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
-- 
2.25.1




[RFC v11 12/25] vfio: add HostIOMMUContext support

2021-03-02 Thread Liu Yi L
This patch adds support for HostIOMMUContext and implements
bind_stage1_pgtbl() and unbind_stage1_pgtbl() for the vIOMMU to set up
dual-stage DMA translation for passthrough devices on hardware.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/common.c  | 70 +++
 include/hw/iommu/host_iommu_context.h |  3 ++
 include/hw/vfio/vfio-common.h |  3 ++
 3 files changed, 76 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 883815d5b0..433938c245 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1575,6 +1575,54 @@ static int vfio_get_iommu_type(VFIOContainer *container,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *bind)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*bind);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_BIND_PGTBL;
+memcpy(&op->data, bind, sizeof(*bind));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: pasid (%llu) bind failed: %m",
+  __func__, bind->hpasid);
+}
+g_free(op);
+return ret;
+}
+
+static int vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *unbind)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*unbind);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL;
+memcpy(&op->data, unbind, sizeof(*unbind));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: pasid (%llu) unbind failed: %m",
+  __func__, unbind->hpasid);
+}
+g_free(op);
+return ret;
+}
+
 static int vfio_init_container(VFIOContainer *container, int group_fd,
bool want_nested, Error **errp)
 {
@@ -2268,3 +2316,25 @@ int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
 }
 return vfio_eeh_container_op(container, op);
 }
+
+static void vfio_host_iommu_context_class_init(ObjectClass *klass,
+   void *data)
+{
+HostIOMMUContextClass *hicxc = HOST_IOMMU_CONTEXT_CLASS(klass);
+
+hicxc->bind_stage1_pgtbl = vfio_host_iommu_ctx_bind_stage1_pgtbl;
+hicxc->unbind_stage1_pgtbl = vfio_host_iommu_ctx_unbind_stage1_pgtbl;
+}
+
+static const TypeInfo vfio_host_iommu_context_info = {
+.parent = TYPE_HOST_IOMMU_CONTEXT,
+.name = TYPE_VFIO_HOST_IOMMU_CONTEXT,
+.class_init = vfio_host_iommu_context_class_init,
+};
+
+static void vfio_register_types(void)
+{
+type_register_static(&vfio_host_iommu_context_info);
+}
+
+type_init(vfio_register_types)
diff --git a/include/hw/iommu/host_iommu_context.h 
b/include/hw/iommu/host_iommu_context.h
index 41c4176c15..3498a3e25d 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -33,6 +33,9 @@
 #define TYPE_HOST_IOMMU_CONTEXT "qemu:host-iommu-context"
 #define HOST_IOMMU_CONTEXT(obj) \
 OBJECT_CHECK(HostIOMMUContext, (obj), TYPE_HOST_IOMMU_CONTEXT)
+#define HOST_IOMMU_CONTEXT_CLASS(klass) \
+OBJECT_CLASS_CHECK(HostIOMMUContextClass, (klass), \
+ TYPE_HOST_IOMMU_CONTEXT)
 #define HOST_IOMMU_CONTEXT_GET_CLASS(obj) \
 OBJECT_GET_CLASS(HostIOMMUContextClass, (obj), \
  TYPE_HOST_IOMMU_CONTEXT)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 293d3785f3..55241ee270 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -26,6 +26,7 @@
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
+#include "hw/iommu/host_iommu_context.h"
 #ifdef CONFIG_LINUX
 #include 
 #endif
@@ -33,6 +34,8 @@
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
+#define TYPE_VFIO_HOST_IOMMU_CONTEXT "qemu:vfio-host-iommu-context"
+
 enum {
 VFIO_DEVICE_TYPE_PCI = 0,
 VFIO_DEVICE_TYPE_PLATFORM = 1,
-- 
2.25.1




[RFC v11 22/25] intel_iommu: process PASID-based iotlb invalidation

2021-03-02 Thread Liu Yi L
This patch adds basic PASID-based iotlb (piotlb) invalidation
support. The piotlb is used when walking the Intel VT-d 1st-level
page table. This patch only adds the basic processing; detailed
handling will be added in the next patch.
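
As a side note on the semantics used below (an assumption mirroring
the VT-d address-mask convention, not code from this patch): for a
page-selective invalidation, an address mask (am) value of N covers
2^N contiguous 4KiB pages, aligned to the covered size. A minimal
helper expressing this:

#include <stdint.h>

static void piotlb_range(uint64_t addr, uint8_t am,
                         uint64_t *start, uint64_t *size)
{
    *size = 1ULL << (12 + am);      /* bytes covered: 2^N 4KiB pages */
    *start = addr & ~(*size - 1);   /* naturally aligned start */
}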

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 53 ++
 hw/i386/intel_iommu_internal.h | 13 +
 2 files changed, 66 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 740dc63090..b709440b15 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3083,6 +3083,55 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return !pc_info.error_happened ? true : false;
 }
 
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+uint16_t domain_id,
+uint32_t pasid)
+{
+}
+
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+   uint32_t pasid, hwaddr addr, uint8_t am,
+   bool ih)
+{
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+VTDInvDesc *inv_desc)
+{
+uint16_t domain_id;
+uint32_t pasid;
+uint8_t am;
+hwaddr addr;
+
+if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
+error_report_once("non-zero-field-in-piotlb_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
+case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+break;
+
+case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am,
+   VTD_INV_DESC_PIOTLB_IH(inv_desc->val[1]));
+break;
+
+default:
+error_report_once("Invalid granularity in P-IOTLB desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -3198,6 +3247,10 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 break;
 
 case VTD_INV_DESC_PIOTLB:
+trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_piotlb_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_WAIT:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index eae57f457c..24b5f934c3 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -476,6 +476,19 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
 #define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
 
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
+
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000ffc0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
+#define VTD_INV_DESC_PIOTLB_PASID(val)    (((val) >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
+ VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val)   ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val)   (((val) >> 6) & 0x1)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
-- 
2.25.1




[RFC v11 25/25] intel_iommu: modify x-scalable-mode to be string option

2021-03-02 Thread Liu Yi L
Intel VT-d 3.0 introduces scalable mode, which comes with a number of
capabilities related to scalable-mode translation, and thus many possible
combinations. This vIOMMU implementation simplifies the choice for the
user by providing typical combinations, configured via the
"x-scalable-mode" option. The usage is as below:

"-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"

 - "legacy": gives support for the SL page table
 - "modern": gives support for the FL page table, pasid, virtual command
 - "off": no scalable mode support
 - if not configured, there is no scalable mode support; if improperly
   configured, an error is thrown

Note: this patch is supposed to be merged only when the whole vSVA patch
series is merged.
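
For instance (hypothetical invocation, assuming a host with
nesting-capable VT-d and this series applied), a guest wanting
first-level page table support could be started with:

  qemu-system-x86_64 -machine q35,accel=kvm \
      -device intel-iommu,caching-mode=on,x-scalable-mode=modern ...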

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
rfcv10 -> rfcv11:
*) this series uses /dev/ioasid for PASID allocation/free. In this patch,
   /dev/ioasid is opened when deciding the configuration.

rfcv5 (v2) -> rfcv6:
*) reports want_nested to VFIO;
*) assert iommu_set/unset_iommu_context() if vIOMMU is not scalable modern.
---
 hw/i386/intel_iommu.c  | 74 --
 hw/i386/intel_iommu_internal.h |  3 ++
 include/hw/i386/intel_iommu.h  |  2 +
 3 files changed, 75 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 932c235f37..c7322357d3 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4085,7 +4085,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
   VTD_HOST_ADDRESS_WIDTH),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
-DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, scalable_mode_str),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -4454,6 +4454,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data)
 {
+IntelIOMMUState *s = opaque;
 int ret = 0;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
@@ -4463,8 +4464,7 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 {
 bool *pdata = data;
 
-/* return false until vSVA is ready */
-*pdata = false;
+*pdata = s->scalable_modern ? true : false;
 break;
 }
 default:
@@ -4558,6 +4558,8 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void 
*opaque,
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only scalable modern supports set_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4592,6 +4594,8 @@ static void vtd_dev_unset_iommu_context(PCIBus *bus, void 
*opaque, int devfn)
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable supports set_ioimmu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4820,8 +4824,13 @@ static void vtd_init(IntelIOMMUState *s)
 }
 
 /* TODO: read cap/ecap from host to decide which cap to be exposed. */
-if (s->scalable_mode) {
+if (s->scalable_mode && !s->scalable_modern) {
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+} else if (s->scalable_mode && s->scalable_modern) {
+s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID |
+   VTD_ECAP_FLTS | VTD_ECAP_PSS(VTD_PASID_SS) |
+   VTD_ECAP_VCS;
+s->vccap |= VTD_VCCAP_PAS;
 }
 
 if (!s->cap_finalized) {
@@ -4962,6 +4971,63 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return false;
 }
 
+if (s->scalable_mode_str &&
+(strcmp(s->scalable_mode_str, "off") &&
+ strcmp(s->scalable_mode_str, "modern") &&
+ strcmp(s->scalable_mode_str, "legacy"))) {
+error_setg(errp, "Invalid x-scalable-mode config, "
+ "please use \"modern\", \"legacy\" or \"off\"");
+return false;
+}
+
+if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "legacy")) {
+s->scalable_mode = true;
+s->scalable_modern = false;
+} else if (s->scalable_mode_

[RFC v11 11/25] intel_iommu: add set/unset_iommu_context callback

2021-03-02 Thread Liu Yi L
This patch adds the set/unset_iommu_context() implementation in the
Intel vIOMMU. PCIe devices (VFIO case) set a HostIOMMUContext in the
vIOMMU as an ack of the vIOMMU's "want_nested" attribute. The vIOMMU
can thus build DMA protection based on the nested paging of the host
IOMMU.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 71 ---
 include/hw/i386/intel_iommu.h | 21 +--
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d89d6d7dd5..8419fd2818 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3380,23 +3380,33 @@ static const MemoryRegionOps vtd_mem_ir_ops = {
 },
 };
 
-VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+/**
+ * Fetch a VTDBus instance for given PCIBus. If no existing instance,
+ * allocate one.
+ */
+static VTDBus *vtd_find_add_bus(IntelIOMMUState *s, PCIBus *bus)
 {
 uintptr_t key = (uintptr_t)bus;
VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
-VTDAddressSpace *vtd_dev_as;
-char name[128];
 
 if (!vtd_bus) {
 uintptr_t *new_key = g_malloc(sizeof(*new_key));
 *new_key = (uintptr_t)bus;
 /* No corresponding free() */
-vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
-PCI_DEVFN_MAX);
+vtd_bus = g_malloc0(sizeof(VTDBus));
 vtd_bus->bus = bus;
 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
 }
+return vtd_bus;
+}
 
+VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+{
+VTDBus *vtd_bus;
+VTDAddressSpace *vtd_dev_as;
+char name[128];
+
+vtd_bus = vtd_find_add_bus(s, bus);
 vtd_dev_as = vtd_bus->dev_as[devfn];
 
 if (!vtd_dev_as) {
@@ -3484,6 +3494,55 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 return ret;
 }
 
+static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
+ int devfn,
+ HostIOMMUContext *iommu_ctx)
+{
+IntelIOMMUState *s = opaque;
+VTDBus *vtd_bus;
+VTDHostIOMMUContext *vtd_dev_icx;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+vtd_bus = vtd_find_add_bus(s, bus);
+
+vtd_iommu_lock(s);
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+
+assert(!vtd_dev_icx);
+
+vtd_bus->dev_icx[devfn] = vtd_dev_icx =
+g_malloc0(sizeof(VTDHostIOMMUContext));
+vtd_dev_icx->vtd_bus = vtd_bus;
+vtd_dev_icx->devfn = (uint8_t)devfn;
+vtd_dev_icx->iommu_state = s;
+vtd_dev_icx->iommu_ctx = iommu_ctx;
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_context(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDBus *vtd_bus;
+VTDHostIOMMUContext *vtd_dev_icx;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+vtd_bus = vtd_find_add_bus(s, bus);
+
+vtd_iommu_lock(s);
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+g_free(vtd_dev_icx);
+vtd_bus->dev_icx[devfn] = NULL;
+
+vtd_iommu_unlock(s);
+}
+
 static uint64_t get_naturally_aligned_size(uint64_t start,
uint64_t size, int gaw)
 {
@@ -3781,6 +3840,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
 .get_iommu_attr = vtd_dev_get_iommu_attr,
+.set_iommu_context = vtd_dev_set_iommu_context,
+.unset_iommu_context = vtd_dev_unset_iommu_context,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 41783ee46d..28396675ef 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -63,6 +63,7 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDPASIDDirEntry VTDPASIDDirEntry;
 typedef struct VTDPASIDEntry VTDPASIDEntry;
+typedef struct VTDHostIOMMUContext VTDHostIOMMUContext;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -111,10 +112,20 @@ struct VTDAddressSpace {
 IOVATree *iova_tree;  /* Traces mapped IOVA ranges */
 };
 
+struct VTDHostIOMMUContext {
+VTDBus *vtd_bus;
+uint8_t devfn;
+HostIOMMUContext *iommu_ctx;
+IntelIOMMUState *iommu_state;
+};
+
 struct VTDBus {
-PCIBus* bus;   /* A reference to the bus to provide 
translation for */
+/* A reference to the bus to provide translation for */
+PCIBus *bus;
 /* A table of VTDAddressSpace objects indexed by devfn */
-VTDAddressSpace *dev_as[];
+VTDAddressSpace *dev_as[PCI_D

[RFC v11 10/25] hw/pci: introduce pci_device_set/unset_iommu_context()

2021-03-02 Thread Liu Yi L
For platforms with nesting-capable IOMMU translation, vIOMMUs running
on such systems can be implemented upon physical IOMMU nested paging
(VFIO case). The vIOMMU advertises such an implementation through the
"want_nested" attribute to PCIe devices (e.g. VFIO PCI). Once
"want_nested" is satisfied, the device (VFIO case) should set a
HostIOMMUContext in the vIOMMU, so that the vIOMMU can manage stage-1
translation. DMAs out of such devices are protected through the
stage-1 page tables owned by the guest together with the stage-2 page
tables owned by the host.

This patch adds pci_device_set/unset_iommu_context() to set/unset the
HostIOMMUContext for a given PCIe device (VFIO case). The caller of
the set operation should fail if the set operation fails.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv5 (v2) -> rfcv6:
*) pci_device_set_iommu_context() returns 0 if callback is not implemented.
---
 hw/pci/pci.c | 28 
 include/hw/pci/pci.h | 10 ++
 2 files changed, 38 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 19365e2799..a2c270a5d6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2749,6 +2749,34 @@ int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr 
attr, void *data)
 return -ENOENT;
 }
 
+int pci_device_set_iommu_context(PCIDevice *dev,
+ HostIOMMUContext *iommu_ctx)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->set_iommu_context) {
+return bus->iommu_ops->set_iommu_context(bus,
+  bus->iommu_opaque, devfn, iommu_ctx);
+}
+return 0;
+}
+
+void pci_device_unset_iommu_context(PCIDevice *dev)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->unset_iommu_context) {
+bus->iommu_ops->unset_iommu_context(bus,
+ bus->iommu_opaque, devfn);
+}
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 bus->iommu_ops = ops;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index b99e05c81e..1eeb177f4f 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -10,6 +10,8 @@
 #include "hw/pci/pcie.h"
 #include "qom/object.h"
 
+#include "hw/iommu/host_iommu_context.h"
+
 extern bool pci_available;
 
 /* PCI bus */
@@ -495,10 +497,18 @@ struct PCIIOMMUOps {
 void *opaque, int32_t devfn);
 int (*get_iommu_attr)(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data);
+int (*set_iommu_context)(PCIBus *bus, void *opaque,
+ int32_t devfn,
+ HostIOMMUContext *iommu_ctx);
+void (*unset_iommu_context)(PCIBus *bus, void *opaque,
+int32_t devfn);
 };
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
 int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data);
+int pci_device_set_iommu_context(PCIDevice *dev,
+ HostIOMMUContext *iommu_ctx);
+void pci_device_unset_iommu_context(PCIDevice *dev);
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *iommu_ops, void *opaque);
 
 static inline void
-- 
2.25.1




[RFC v11 20/25] intel_iommu: do not pass down pasid bind for PASID #0

2021-03-02 Thread Liu Yi L
The RID_PASID field was introduced in the VT-d 3.0 spec; it is used
for DMA requests without PASID in scalable-mode VT-d, i.e. the IOVA
case. The VT-d 3.1 spec defines it as follows:

"Implementations not supporting RID_PASID capability
(ECAP_REG.RPS is 0b), use a PASID value of 0 to perform
address translation for requests without PASID."

This patch adds a check against the PASIDs which are going to be
bound to a device. For PASID #0, it is not necessary to pass down a
pasid bind request, since PASID #0 is used as RID_PASID for DMA
requests without pasid. A further reason is that the current Intel
vIOMMU supports gIOVA by shadowing the guest 2nd-level page table.
However, in the future, if the guest IOMMU driver uses the 1st-level
page table to store IOVA mappings, then guest IOVA support will also
be done via nested translation. When gIOVA is over FLPT, the vIOMMU
should pass down the pasid bind request for PASID #0 to the host, and
the host needs to bind the guest IOVA page table to a proper PASID,
e.g. the PASID value in the RID_PASID field for a PF/VF if
ECAP_REG.RPS is clear, or the default PASID for an ADI (Assignable
Device Interface in the Scalable IOV solution).

IOVA over FLPT support on Intel VT-d:
https://lore.kernel.org/linux-iommu/20191219031634.15168-1-baolu...@linux.intel.com/

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index c99fd3b167..740dc63090 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1915,6 +1915,16 @@ static int vtd_bind_guest_pasid(IntelIOMMUState *s, 
VTDBus *vtd_bus,
 HostIOMMUContext *iommu_ctx;
 int ret = -1;
 
+if (pasid < VTD_HPASID_MIN) {
+/*
+ * If pasid < VTD_HPASID_MIN, this pasid is not allocated
+ * from host. No need to pass down the changes on it to host.
+ * TODO: when IOVA over FLPT is ready, this switch should be
+ * refined.
+ */
+return 0;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 if (!vtd_dev_icx) {
 /* means no need to go further, e.g. for emulated devices */
-- 
2.25.1




[RFC v11 19/25] intel_iommu: replay pasid binds after context cache invalidation

2021-03-02 Thread Liu Yi L
This patch replays guest pasid bindings after a context cache
invalidation. This is done for safety. Strictly speaking, the
programmer should issue a pasid cache invalidation with proper
granularity after issuing a context cache invalidation.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 50 ++
 hw/i386/intel_iommu_internal.h |  1 +
 hw/i386/trace-events   |  1 +
 3 files changed, 52 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 0fdc2c6e82..c99fd3b167 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -73,6 +73,10 @@ static void vtd_address_space_refresh_all(IntelIOMMUState 
*s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
 static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+static void vtd_pasid_cache_sync(IntelIOMMUState *s,
+ VTDPASIDCacheInfo *pc_info);
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  VTDBus *vtd_bus, uint16_t devfn);
 
 static void vtd_panic_require_caching_mode(void)
 {
@@ -1875,7 +1879,10 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
 
 static void vtd_context_global_invalidate(IntelIOMMUState *s)
 {
+VTDPASIDCacheInfo pc_info = { .error_happened = false, };
+
 trace_vtd_inv_desc_cc_global();
+
 /* Protects context cache */
 vtd_iommu_lock(s);
 s->context_cache_gen++;
@@ -1892,6 +1899,9 @@ static void vtd_context_global_invalidate(IntelIOMMUState 
*s)
  * VT-d emulation codes.
  */
 vtd_iommu_replay_all(s);
+
+pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
+vtd_pasid_cache_sync(s, &pc_info);
 }
 
 /**
@@ -2030,6 +2040,21 @@ static void 
vtd_context_device_invalidate(IntelIOMMUState *s,
  * happened.
  */
 vtd_sync_shadow_page_table(vtd_as);
+/*
+ * Per spec, a context flush should also be followed by a PASID
+ * cache and iotlb flush. Regarding a device-selective context
+ * cache invalidation:
+ * if (emulated_device)
+ *invalidate pasid cache and pasid-based iotlb
+ * else if (assigned_device)
+ *check if the device has been bound to any pasid
+ *invoke pasid_unbind for each bound pasid
+ * Here, we have vtd_pasid_cache_devsi() to invalidate pasid
+ * caches, while for the piotlb in QEMU we don't have it yet, so
+ * there is no handling. For assigned devices, the host iommu
+ * driver flushes the piotlb when a pasid unbind is passed down
+ * to it.
+ */
+vtd_pasid_cache_devsi(s, vtd_bus, devfn_it);
 }
 }
 }
@@ -2656,6 +2681,12 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer 
value,
 /* Fall through */
 case VTD_PASID_CACHE_GLOBAL_INV:
 break;
+case VTD_PASID_CACHE_DEVSI:
+if (pc_info->vtd_bus != vtd_bus ||
+pc_info->devfn != devfn) {
+return false;
+}
+break;
 default:
 error_report("invalid pc_info->type");
 abort();
@@ -2863,6 +2894,11 @@ static void 
vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
 case VTD_PASID_CACHE_GLOBAL_INV:
 /* loop all assigned devices */
 break;
+case VTD_PASID_CACHE_DEVSI:
+walk_info.vtd_bus = pc_info->vtd_bus;
+walk_info.devfn = pc_info->devfn;
+vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
+return;
 case VTD_PASID_CACHE_FORCE_RESET:
 /* For force reset, no need to go further replay */
 return;
@@ -2951,6 +2987,20 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
 vtd_iommu_unlock(s);
 }
 
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  VTDBus *vtd_bus, uint16_t devfn)
+{
+VTDPASIDCacheInfo pc_info = { .error_happened = false, };
+
+trace_vtd_pasid_cache_devsi(devfn);
+
+pc_info.type = VTD_PASID_CACHE_DEVSI;
+pc_info.vtd_bus = vtd_bus;
+pc_info.devfn = devfn;
+
+vtd_pasid_cache_sync(s, &pc_info);
+}
+
 /**
  * Caller of this function should hold iommu_lock
  */
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index e4c7b23455..eae57f457c 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -548,6 +548,7 @@ typedef enum VTDPCInvType {
 VTD_PASID_CACHE_FORCE_RESET = 0,
 /* pasid cache invalidation rely on guest PASID entry */
 VTD_PASID_CACHE_GLOBAL_INV,
+VTD_PASID_CACHE_DEVSI,
 VTD_PASID_CACHE_DOMSI,
 VTD_PASID_CACHE_PASIDSI,
 } VTDPCInvType;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 60d20c1335

[RFC v11 07/25] vfio: pass nesting requirement into vfio_get_group()

2021-03-02 Thread Liu Yi L
This patch passes the nesting requirement into vfio_get_group() to
indicate whether VFIO_TYPE1_NESTING_IOMMU is required.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/ap.c  | 2 +-
 hw/vfio/ccw.c | 2 +-
 hw/vfio/common.c  | 3 ++-
 hw/vfio/pci.c | 9 -
 hw/vfio/platform.c| 2 +-
 include/hw/vfio/vfio-common.h | 3 ++-
 6 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index 9571c2f91f..06cefac7a1 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -83,7 +83,7 @@ static VFIOGroup *vfio_ap_get_group(VFIOAPDevice *vapdev, 
Error **errp)
 
 g_free(group_path);
 
-return vfio_get_group(groupid, &address_space_memory, errp);
+return vfio_get_group(groupid, &address_space_memory, false, errp);
 }
 
 static void vfio_ap_realize(DeviceState *dev, Error **errp)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index bc78a0ad76..4a9ca9414a 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -647,7 +647,7 @@ static VFIOGroup *vfio_ccw_get_group(S390CCWDevice *cdev, 
Error **errp)
 return NULL;
 }
 
-return vfio_get_group(groupid, &address_space_memory, errp);
+return vfio_get_group(groupid, &address_space_memory, false, errp);
 }
 
 static void vfio_ccw_realize(DeviceState *dev, Error **errp)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 6ff1daa763..44097875e2 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1923,7 +1923,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
 }
 }
 
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
+  bool want_nested, Error **errp)
 {
 VFIOGroup *group;
 char path[32];
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index f74be78209..437f51338e 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2771,6 +2771,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 int groupid;
 int i, ret;
 bool is_mdev;
+bool want_nested;
 
 if (!vdev->vbasedev.sysfsdev) {
 if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -2817,7 +2818,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 trace_vfio_realize(vdev->vbasedev.name, groupid);
 
-group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), 
errp);
+if (pci_device_get_iommu_attr(pdev,
 IOMMU_WANT_NESTING, &want_nested)) {
+want_nested = false;
+}
+
+group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev),
+   want_nested, errp);
 if (!group) {
 goto error;
 }
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index cc3f66f7e4..42c6ae7689 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -577,7 +577,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
 
 trace_vfio_platform_base_device_init(vbasedev->name, groupid);
 
-group = vfio_get_group(groupid, &address_space_memory, errp);
+group = vfio_get_group(groupid, &address_space_memory, false, errp);
 if (!group) {
 return -ENOENT;
 }
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index 6141162d7a..293d3785f3 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -196,7 +196,8 @@ void vfio_region_unmap(VFIORegion *region);
 void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
+  bool want_nested, Error **errp);
 void vfio_put_group(VFIOGroup *group);
 int vfio_get_device(VFIOGroup *group, const char *name,
 VFIODevice *vbasedev, Error **errp);
-- 
2.25.1




[RFC v11 08/25] vfio: check VFIO_TYPE1_NESTING_IOMMU support

2021-03-02 Thread Liu Yi L
VFIO needs to check VFIO_TYPE1_NESTING_IOMMU support with the kernel
before using it further, e.g. it needs to check IOMMU UAPI support.

Reference patch from Eric Auger: https://patchwork.kernel.org/patch/11040499/

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
Signed-off-by: Eric Auger 
Signed-off-by: Yi Sun 
---
 hw/vfio/common.c | 37 ++---
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 44097875e2..883815d5b0 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1543,30 +1543,44 @@ static void vfio_put_address_space(VFIOAddressSpace 
*space)
 }
 
 /*
- * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
+ * vfio_get_iommu_type - selects the richest iommu_type (NESTING first)
  */
 static int vfio_get_iommu_type(VFIOContainer *container,
+   bool want_nested,
Error **errp)
 {
-int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
+int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU,
+  VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
   VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
-int i;
+int i, ret = -EINVAL;
 
 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
-return iommu_types[i];
+if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU) {
+if (!want_nested) {
+continue;
+}
+}
+ret = iommu_types[i];
+break;
 }
 }
-error_setg(errp, "No available IOMMU models");
-return -EINVAL;
+
+if (ret < 0) {
+error_setg(errp, "No available IOMMU models");
+} else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) {
+error_setg(errp, "Nested mode requested but not supported");
+ret = -EINVAL;
+}
+return ret;
 }
 
 static int vfio_init_container(VFIOContainer *container, int group_fd,
-   Error **errp)
+   bool want_nested, Error **errp)
 {
 int iommu_type, ret;
 
-iommu_type = vfio_get_iommu_type(container, errp);
+iommu_type = vfio_get_iommu_type(container, want_nested, errp);
 if (iommu_type < 0) {
 return iommu_type;
 }
@@ -1666,7 +1680,7 @@ static void vfio_get_iommu_info_migration(VFIOContainer 
*container,
 }
 
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
-  Error **errp)
+  bool want_nested, Error **errp)
 {
 VFIOContainer *container;
 int ret, fd;
@@ -1738,12 +1752,13 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
+QLIST_INIT(&container->giommu_list);
+QLIST_INIT(&container->hostwin_list);
 
-ret = vfio_init_container(container, group->fd, errp);
+ret = vfio_init_container(container, group->fd, want_nested, errp);
 if (ret) {
 goto free_container_exit;
 }
 
 switch (container->iommu_type) {
+case VFIO_TYPE1_NESTING_IOMMU:
 case VFIO_TYPE1v2_IOMMU:
 case VFIO_TYPE1_IOMMU:
 {
@@ -1968,7 +1983,7 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
 group->groupid = groupid;
QLIST_INIT(&group->device_list);
 
-if (vfio_connect_container(group, as, errp)) {
+if (vfio_connect_container(group, as, want_nested, errp)) {
 error_prepend(errp, "failed to setup container for group %d: ",
   groupid);
 goto close_fd_exit;
-- 
2.25.1




[RFC v11 18/25] intel_iommu: bind/unbind guest page table to host

2021-03-02 Thread Liu Yi L
This patch captures guest PASID table entry modifications and
propagates the changes to the host to set up dual-stage DMA
translation. The guest page table is configured as the 1st-level page
table (GVA->GPA), whose translation result further goes through the
host VT-d 2nd-level page table (GPA->HPA) under nested translation
mode. This is the key part of vSVA support, and also a key to
supporting IOVA over the 1st-level page table for Intel VT-d in a
virtualization environment.
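
As a worked example of the address-width computation added below:
vtd_pe_get_fl_aw() returns 48 + 0 * 9 = 48 bits when the first-level
paging mode (FLPM) field is 0 (4-level paging), and 48 + 1 * 9 = 57
bits when FLPM is 1 (5-level paging).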

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
rfcv10 -> rfcv11:
*) Add @error_happened in struct VTDPASIDCacheInfo to track the
   bind/unbind failure, and return to guest.
---
 hw/i386/intel_iommu.c  | 138 +
 hw/i386/intel_iommu_internal.h |  19 +
 2 files changed, 143 insertions(+), 14 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index a8f895807a..0fdc2c6e82 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include "qemu/jhash.h"
+#include 
 
 int ioasid_fd = -1;
 uint32_t ioasid_bits;
@@ -705,6 +706,24 @@ static inline uint32_t 
vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
 return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
 }
 
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
+}
+
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+}
+
+static inline void pasid_cache_info_set_error(VTDPASIDCacheInfo *pc_info)
+{
+if (pc_info->error_happened) {
+return;
+}
+pc_info->error_happened = true;
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -1875,6 +1894,85 @@ static void 
vtd_context_global_invalidate(IntelIOMMUState *s)
 vtd_iommu_replay_all(s);
 }
 
+/**
+ * Caller should hold iommu_lock.
+ */
+static int vtd_bind_guest_pasid(IntelIOMMUState *s, VTDBus *vtd_bus,
+int devfn, int pasid, VTDPASIDEntry *pe,
+VTDPASIDOp op)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+int ret = -1;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+/* means no need to go further, e.g. for emulated devices */
+return 0;
+}
+
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+switch (op) {
+case VTD_PASID_BIND:
+{
+struct iommu_gpasid_bind_data *g_bind_data;
+
+g_bind_data = g_malloc0(sizeof(*g_bind_data));
+
+g_bind_data->argsz = sizeof(*g_bind_data);
+g_bind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_bind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_bind_data->gpgd = vtd_pe_get_flpt_base(pe);
+g_bind_data->addr_width = vtd_pe_get_fl_aw(pe);
+g_bind_data->hpasid = pasid;
+g_bind_data->gpasid = pasid;
+g_bind_data->flags |= IOMMU_SVA_GPASID_VAL;
+g_bind_data->vendor.vtd.flags =
+ (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_SRE : 0)
+   | (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_EAFE : 0)
+   | (VTD_SM_PASID_ENTRY_PCD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PCD : 0)
+   | (VTD_SM_PASID_ENTRY_PWT_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PWT : 0)
+   | (VTD_SM_PASID_ENTRY_EMTE_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_EMTE : 0)
+   | (VTD_SM_PASID_ENTRY_CD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_CD : 0);
+g_bind_data->vendor.vtd.pat = VTD_SM_PASID_ENTRY_PAT(pe->val[1]);
+g_bind_data->vendor.vtd.emt = VTD_SM_PASID_ENTRY_EMT(pe->val[1]);
+ret = host_iommu_ctx_bind_stage1_pgtbl(iommu_ctx, g_bind_data);
+g_free(g_bind_data);
+break;
+}
+case VTD_PASID_UNBIND:
+{
+struct iommu_gpasid_bind_data *g_unbind_data;
+
+g_unbind_data = g_malloc0(sizeof(*g_unbind_data));
+
+g_unbind_data->argsz = sizeof(*g_unbind_data);
+g_unbind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_unbind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_unbind_data->hpasid = pasid;
+ret = host_iommu_ctx_unbind_stage1_pgtbl(iommu_ctx, g_unbind_data);
+g_free(g_unbind_data)

[RFC v11 17/25] intel_iommu: add PASID cache management infrastructure

2021-03-02 Thread Liu Yi L
This patch adds a PASID cache management infrastructure based on
new added structure VTDPASIDAddressSpace, which is used to track
the PASID usage and future PASID tagged DMA address translation
support in vIOMMU.

struct VTDPASIDAddressSpace {
VTDBus *vtd_bus;
uint8_t devfn;
AddressSpace as;
uint32_t pasid;
IntelIOMMUState *iommu_state;
VTDContextCacheEntry context_cache_entry;
QLIST_ENTRY(VTDPASIDAddressSpace) next;
VTDPASIDCacheEntry pasid_cache_entry;
};

Ideally, a VTDPASIDAddressSpace instance is created when a PASID
is bound to a DMA AddressSpace. The Intel VT-d spec requires guest
software to issue a pasid cache invalidation when binding or
unbinding a pasid to/from an address space under caching-mode.
However, as VTDPASIDAddressSpace instances also act as the pasid
cache in this implementation, their creation also happens during
vIOMMU PASID-tagged DMA translation. The creation in that path is
not added in this patch since there are no PASID-capable emulated
devices for now.

The implementation in this patch manages VTDPASIDAddressSpace
instances per PASID+BDF (lookup and insert use PASID and BDF)
since the Intel VT-d spec allows a per-BDF PASID table. When a
guest binds a PASID to an AddressSpace, QEMU captures the guest
pasid-selective pasid cache invalidation, and allocates or removes
a VTDPASIDAddressSpace instance according to the invalidation
reason:

*) a present pasid entry moved to non-present
*) a present pasid entry updated to a different present entry
*) a non-present pasid entry moved to present

The vIOMMU emulator can figure out the reason by fetching the
latest guest pasid entry, as the sketch below illustrates.
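
A sketch of that decision (illustrative names and enum, not from this
patch):

#include <stdbool.h>

enum pasid_cache_update {
    PASID_CACHE_NOP,     /* nothing cached, nothing present */
    PASID_CACHE_FILL,    /* non-present -> present */
    PASID_CACHE_UPDATE,  /* present -> a different present entry */
    PASID_CACHE_DROP,    /* present -> non-present */
};

static enum pasid_cache_update
resolve_update(bool cached, bool guest_present, bool entry_changed)
{
    if (!cached && guest_present) {
        return PASID_CACHE_FILL;
    }
    if (cached && !guest_present) {
        return PASID_CACHE_DROP;
    }
    if (cached && guest_present && entry_changed) {
        return PASID_CACHE_UPDATE;
    }
    return PASID_CACHE_NOP;
}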

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) merged this patch with former replay binding patch, makes
   PSI/DSI/GSI use the unified function to do cache invalidation
   and pasid binding replay.
*) dropped pasid_cache_gen in both iommu_state and vtd_pasid_as
   as it is not necessary so far; we may want it when we one day
   introduce an emulated SVA-capable device.
---
 hw/i386/intel_iommu.c  | 471 -
 hw/i386/intel_iommu_internal.h |  21 ++
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  28 ++
 4 files changed, 520 insertions(+), 1 deletion(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index c4b0db15cb..a8f895807a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -42,6 +42,7 @@
 #include "trace.h"
 #include 
 #include 
+#include "qemu/jhash.h"
 
 int ioasid_fd = -1;
 uint32_t ioasid_bits;
@@ -70,6 +71,8 @@ uint32_t ioasid_bits;
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
+static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+
 static void vtd_panic_require_caching_mode(void)
 {
 error_report("We need to set caching-mode=on for intel-iommu to enable "
@@ -281,6 +284,7 @@ static void vtd_reset_caches(IntelIOMMUState *s)
 vtd_iommu_lock(s);
 vtd_reset_iotlb_locked(s);
 vtd_reset_context_cache_locked(s);
+vtd_pasid_cache_reset(s);
 vtd_iommu_unlock(s);
 }
 
@@ -691,6 +695,16 @@ static inline bool vtd_pe_type_check(X86IOMMUState 
*x86_iommu,
 return true;
 }
 
+static inline uint16_t vtd_pe_get_domain_id(VTDPASIDEntry *pe)
+{
+return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
+}
+
+static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
+{
+return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -2421,9 +2435,443 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static inline void vtd_init_pasid_key(uint32_t pasid,
+ uint16_t sid,
+ struct pasid_key *key)
+{
+key->pasid = pasid;
+key->sid = sid;
+}
+
+static guint vtd_pasid_as_key_hash(gconstpointer v)
+{
+struct pasid_key *key = (struct pasid_key *)v;
+uint32_t a, b, c;
+
+/* Jenkins hash */
+a = b = c = JHASH_INITVAL + sizeof(*key);
+a += key->sid;
+b += extract32(key->pasid, 0, 16);
+c += extract32(key->pasid, 16, 16);
+
+__jhash_mix(a, b, c);
+__jhash_final(a, b, c);
+
+return c;
+}
+
+static gboolean vtd_pasid_as_key_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct pasid_key *k1 = v1;
+const struct pasid_key *k2 = v2;
+
+return (k1->pasid == k2->pasid) && (k1->sid == k2->sid);
+}
+
+static inline int vtd_dev_get_pe_from_pasid(IntelIOMMUState *s,
+uint8_t bus_num,
+   

[RFC v11 04/25] hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps

2021-03-02 Thread Liu Yi L
This patch modifies pci_setup_iommu() to set PCIIOMMUOps
instead of setting PCIIOMMUFunc. PCIIOMMUFunc is used to
get an address space for a PCI device in a vendor-specific
way. PCIIOMMUOps still offers this functionality, but
using PCIIOMMUOps leaves room to add more IOMMU-related
vendor-specific operations.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Reviewed-by: David Gibson 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv9 -> rfcv10:
*) Fix a bug in pci_device_iommu_address_space()
 +   iommu_bus->iommu_ops->get_address_space) &&
 =>
 +   !iommu_bus->iommu_ops->get_address_space) &&
---
 hw/alpha/typhoon.c   |  6 +-
 hw/arm/smmu-common.c |  6 +-
 hw/hppa/dino.c   |  6 +-
 hw/i386/amd_iommu.c  |  6 +-
 hw/i386/intel_iommu.c|  6 +-
 hw/pci-host/designware.c |  6 +-
 hw/pci-host/pnv_phb3.c   |  6 +-
 hw/pci-host/pnv_phb4.c   |  6 +-
 hw/pci-host/ppce500.c|  6 +-
 hw/pci-host/prep.c   |  6 +-
 hw/pci-host/sabre.c  |  6 +-
 hw/pci/pci.c | 18 +-
 hw/ppc/ppc440_pcix.c |  6 +-
 hw/ppc/spapr_pci.c   |  6 +-
 hw/s390x/s390-pci-bus.c  |  8 ++--
 hw/virtio/virtio-iommu.c |  6 +-
 include/hw/pci/pci.h |  8 ++--
 include/hw/pci/pci_bus.h |  2 +-
 18 files changed, 96 insertions(+), 24 deletions(-)

diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c
index a42b319812..47ff561c81 100644
--- a/hw/alpha/typhoon.c
+++ b/hw/alpha/typhoon.c
@@ -740,6 +740,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, 
void *opaque, int devfn)
return &s->pchip.iommu_as;
 }
 
+static const PCIIOMMUOps typhoon_iommu_ops = {
+.get_address_space = typhoon_pci_dma_iommu,
+};
+
 static void typhoon_set_irq(void *opaque, int irq, int level)
 {
 TyphoonState *s = opaque;
@@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, ISABus **isa_bus, 
qemu_irq *p_rtc_irq,
  "iommu-typhoon", UINT64_MAX);
address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu),
"pchip0-pci");
-pci_setup_iommu(b, typhoon_pci_dma_iommu, s);
+pci_setup_iommu(b, &typhoon_iommu_ops, s);
 
 /* Pchip0 PCI special/interrupt acknowledge, 0x801.F800., 64MB.  */
 memory_region_init_io(>pchip.reg_iack, OBJECT(s), _pci_iack_ops,
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 405d5c5325..bbaf6565cf 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -444,6 +444,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void 
*opaque, int devfn)
return &sdev->as;
 }
 
+static const PCIIOMMUOps smmu_ops = {
+.get_address_space = smmu_find_add_as,
+};
+
 IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid)
 {
 uint8_t bus_n, devfn;
@@ -514,7 +518,7 @@ static void smmu_base_realize(DeviceState *dev, Error 
**errp)
 s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
 
 if (s->primary_bus) {
-pci_setup_iommu(s->primary_bus, smmu_find_add_as, s);
+pci_setup_iommu(s->primary_bus, &smmu_ops, s);
 } else {
 error_setg(errp, "SMMU is not attached to any PCI bus!");
 }
diff --git a/hw/hppa/dino.c b/hw/hppa/dino.c
index 5b82c9440d..1b2228891a 100644
--- a/hw/hppa/dino.c
+++ b/hw/hppa/dino.c
@@ -459,6 +459,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, 
void *opaque,
return &s->bm_as;
 }
 
+static const PCIIOMMUOps dino_iommu_ops = {
+.get_address_space = dino_pcihost_set_iommu,
+};
+
 /*
  * Dino interrupts are connected as shown on Page 78, Table 23
  * (Little-endian bit numbers)
@@ -580,7 +584,7 @@ PCIBus *dino_init(MemoryRegion *addr_space,
memory_region_add_subregion(&s->bm, 0xfff00000,
&s->bm_cpu_alias);
address_space_init(&s->bm_as, &s->bm, "pci-bm");
-pci_setup_iommu(b, dino_pcihost_set_iommu, s);
+pci_setup_iommu(b, &dino_iommu_ops, s);
 
 *p_rtc_irq = qemu_allocate_irq(dino_set_timer_irq, s, 0);
 *p_ser_irq = qemu_allocate_irq(dino_set_serial_irq, s, 0);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 74a93a5d93..3676a20c25 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1452,6 +1452,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, 
void *opaque, int devfn)
return &iommu_as[devfn]->as;
 }
 
+static const PCIIOMMUOps amdvi_iommu_ops = {
+.get_address_space = amdvi_host_dma_iommu,
+};
+
 static const MemoryRegionOps mmio_mem_ops = {
 .read = amdvi_mmio_read,
 .write = amdvi_mmio_write,
@@ -1579,7 +1583,7 @@ static void amdvi_realize(DeviceState *dev, Error **errp)
 
sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
-pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+   

[RFC v11 02/25] scripts/update-linux-headers: Import ioasid.h

2021-03-02 Thread Liu Yi L
Update the script to import the new ioasid.h uapi header.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Cc: Cornelia Huck 
Cc: Paolo Bonzini 
Signed-off-by: Liu Yi L 
---
 scripts/update-linux-headers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index f588678837..7fbd8ace69 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -142,7 +142,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h iommu.h \
+for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h iommu.h ioasid.h \
   psci.h psp-sev.h userfaultfd.h mman.h; do
 cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
-- 
2.25.1




[RFC v11 09/25] hw/iommu: introduce HostIOMMUContext

2021-03-02 Thread Liu Yi L
Currently, many platform vendors provide the capability of dual-stage
DMA address translation in hardware. For example, nested translation
on Intel VT-d scalable mode, nested stage translation on ARM SMMUv3,
etc. In dual-stage DMA address translation, there are two stages of
address translation: stage-1 (a.k.a. first-level) and stage-2 (a.k.a.
second-level) translation structures. Stage-1 translation results are
also subject to the stage-2 translation structures. Take vSVA (Virtual
Shared Virtual Addressing) as an example: the guest IOMMU driver owns
the stage-1 translation structures (covering GVA->GPA translation),
and the host IOMMU driver owns the stage-2 translation structures
(covering GPA->HPA translation). The VMM is responsible for binding
the stage-1 translation structures to the host, so that the hardware
can perform GVA->GPA and then GPA->HPA translation. For more
background on SVA, refer to the links below.
 - https://www.youtube.com/watch?v=Kq_nfGK5MwQ
 - 
https://events19.lfasiallc.com/wp-content/uploads/2017/11/Shared-Virtual-Memory-in-KVM_Yi-Liu.pdf

In QEMU, vIOMMU emulators expose IOMMUs to the VM per their own spec
(e.g. the Intel VT-d spec). Devices are passed through to the guest
via device pass-through components like VFIO. VFIO is a userspace
driver framework which exposes host IOMMU programming capability to
userspace in a secure manner, e.g. IOVA MAP/UNMAP requests.
Information different from map/unmap notifications needs to be passed
from the QEMU vIOMMU device to/from the host IOMMU driver through the
VFIO/IOMMU layer:
 1) bind stage-1 translation structures to the host
 2) propagate stage-1 cache invalidations to the host
 3) DMA address translation fault (I/O page fault) servicing, etc.

With the above new interactions in QEMU, an abstraction layer is
required to facilitate these operations and to give vIOMMU emulators
an explicit way to call into VFIO. This patch introduces
HostIOMMUContext to serve that purpose. HostIOMMUContext is an object
which allows managing the stage-1 translation when a vIOMMU is
implemented upon physical IOMMU nested paging (VFIO case). It is an
abstract object which needs to be derived for each vIOMMU
implementation based on physical nested paging. A HostIOMMUContext-
derived object is passed to each VFIO device protected by a vIOMMU
using physical nested paging.

This patch also introduces HostIOMMUContextClass to provide methods
for vIOMMU emulators to propagate dual-stage translation related
requests to the host. As a beginning, bind_stage1_pgtbl() and
unbind_stage1_pgtbl() are defined for configuring the vIOMMU's page
table on the host; a sketch of the expected dispatch is shown below.
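
A sketch of the expected dispatch (hedged: the body of
hw/iommu/host_iommu_context.c is truncated in this archive, so this
mirrors the call sites seen in later patches of the series rather
than quoting the patch itself):

int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
                                     struct iommu_gpasid_bind_data *data)
{
    HostIOMMUContextClass *hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);

    /* Dispatch to the derived class's hook, if one is provided. */
    if (!hicxc || !hicxc->bind_stage1_pgtbl) {
        return -EINVAL;
    }
    return hicxc->bind_stage1_pgtbl(iommu_ctx, data);
}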

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
rfcv10 -> rfcv11:
*) removed pasid_alloc/free in HostIOMMUContextClass, define bind_stage1_pgtbl
   and unbind_stage1_pgtbl() as a start for HostIOMMUContextClass.

rfcv9 -> rfcv10:
*) adjust to meson build
---
 hw/Kconfig|   3 +
 hw/iommu/Kconfig  |   4 +
 hw/iommu/host_iommu_context.c | 106 ++
 hw/iommu/meson.build  |   6 ++
 hw/meson.build|   1 +
 include/hw/iommu/host_iommu_context.h |  75 ++
 6 files changed, 195 insertions(+)
 create mode 100644 hw/iommu/Kconfig
 create mode 100644 hw/iommu/host_iommu_context.c
 create mode 100644 hw/iommu/meson.build
 create mode 100644 include/hw/iommu/host_iommu_context.h

diff --git a/hw/Kconfig b/hw/Kconfig
index 8ea26479c4..fc660790f1 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -66,6 +66,9 @@ source tricore/Kconfig
 source unicore32/Kconfig
 source xtensa/Kconfig
 
+# iommu Kconfig
+source iommu/Kconfig
+
 # Symbols used by multiple targets
 config TEST_DEVICES
 bool
diff --git a/hw/iommu/Kconfig b/hw/iommu/Kconfig
new file mode 100644
index 00..039b9a4caf
--- /dev/null
+++ b/hw/iommu/Kconfig
@@ -0,0 +1,4 @@
+config IOMMU
+bool
+default y
+depends on LINUX
diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
new file mode 100644
index 00..d7139bcb86
--- /dev/null
+++ b/hw/iommu/host_iommu_context.c
@@ -0,0 +1,106 @@
+/*
+ * QEMU abstract of Host IOMMU
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Authors: Liu Yi L 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+

[RFC v11 14/25] intel_iommu: sync IOMMU nesting cap info for assigned devices

2021-03-02 Thread Liu Yi L
For assigned devices, an Intel vIOMMU that wants to build DMA
protection based on physical IOMMU nested paging should check the
IOMMU nesting support on the host side. The host returns the IOMMU
nesting cap info to user-space (e.g. VFIO returns the IOMMU nesting
cap info for a nesting-type IOMMU). The vIOMMU needs to check:
a) the IOMMU model
b) 1st-level page table support
c) the address width
d) pasid support

This patch syncs the IOMMU nesting cap info when a PCIe device (VFIO
case) sets a HostIOMMUContext in the vIOMMU. If the host IOMMU nesting
support is not compatible, the vIOMMU returns failure to the PCIe
device.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 105 +
 hw/i386/intel_iommu_internal.h |  18 ++
 include/hw/i386/intel_iommu.h  |   4 ++
 3 files changed, 127 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8419fd2818..203c898fa4 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3494,6 +3494,82 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 return ret;
 }
 
+
+static bool vtd_check_nesting_info(IntelIOMMUState *s,
+   struct iommu_nesting_info *info,
+   struct iommu_nesting_info_vtd *vtd)
+{
+return !((s->aw_bits != info->addr_width) ||
+ ((s->host_cap ^ vtd->cap_reg) & VTD_CAP_MASK & s->host_cap) ||
+ ((s->host_ecap ^ vtd->ecap_reg) & VTD_ECAP_MASK & s->host_ecap) ||
+ (VTD_GET_PSS(s->host_ecap) != (info->pasid_bits - 1)));
+}
+
+/* Caller should hold iommu lock. */
+static bool vtd_sync_nesting_info(IntelIOMMUState *s,
+  struct iommu_nesting_info *info)
+{
+struct iommu_nesting_info_vtd *vtd;
+uint64_t cap, ecap;
+
+vtd = (struct iommu_nesting_info_vtd *) &info->vendor.vtd;
+
+if (s->cap_finalized) {
+return vtd_check_nesting_info(s, info, vtd);
+}
+
+if (s->aw_bits > info->addr_width) {
+error_report("User aw-bits: %u > host address width: %u",
+  s->aw_bits, info->addr_width);
+return false;
+}
+
+cap = s->host_cap & vtd->cap_reg & VTD_CAP_MASK;
+s->host_cap &= ~VTD_CAP_MASK;
+s->host_cap |= cap;
+
+ecap = s->host_ecap & vtd->ecap_reg & VTD_ECAP_MASK;
+s->host_ecap &= ~VTD_ECAP_MASK;
+s->host_ecap |= ecap;
+
+if ((VTD_ECAP_PASID & s->host_ecap) && info->pasid_bits &&
+(VTD_GET_PSS(s->host_ecap) > (info->pasid_bits - 1))) {
+s->host_ecap &= ~VTD_ECAP_PSS_MASK;
+s->host_ecap |= VTD_ECAP_PSS(info->pasid_bits - 1);
+}
+return true;
+}
+
+/*
+ * A virtual VT-d that wants nesting needs to check the host IOMMU
+ * nesting cap info behind the assigned devices, so that the vIOMMU
+ * can bind guest page tables to the host.
+ */
+static bool vtd_check_iommu_ctx(IntelIOMMUState *s,
+HostIOMMUContext *iommu_ctx)
+{
+struct iommu_nesting_info *info = iommu_ctx->info;
+uint32_t minsz, size;
+
+if (IOMMU_PASID_FORMAT_INTEL_VTD != info->format) {
+error_report("Format is not compatible for nesting!!!");
+return false;
+}
+
+size = sizeof(struct iommu_nesting_info_vtd);
+minsz = endof(struct iommu_nesting_info, flags);
+if (size > (info->argsz - minsz)) {
+/*
+ * QEMU may be using a newer linux-headers/iommu.h than the
+ * kernel supports, hence fail it.
+ */
+error_report("IOMMU nesting cap is not compatible!!!");
+return false;
+}
+
+return vtd_sync_nesting_info(s, info);
+}
+
 static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
  int devfn,
  HostIOMMUContext *iommu_ctx)
@@ -3508,6 +3584,11 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void 
*opaque,
 
 vtd_iommu_lock(s);
 
+if (!vtd_check_iommu_ctx(s, iommu_ctx)) {
+vtd_iommu_unlock(s);
+return -ENOENT;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 
 assert(!vtd_dev_icx);
@@ -3760,6 +3841,14 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
 }
 
+if (!s->cap_finalized) {
+s->host_cap = s->cap;
+s->host_ecap = s->ecap;
+} else {
+s->cap = s->host_cap;
+s->ecap = s->host_ecap;
+}
+
 vtd_reset_caches(s);
 
 /* Define registers with default values and bit semantics */
@@ -3886,6 +3975,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return true;
 }
 
+static void vtd

[RFC v11 01/25] scripts/update-linux-headers: Import iommu.h

2021-03-02 Thread Liu Yi L
From: Eric Auger 

Update the script to import the new iommu.h uapi header.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Cc: Cornelia Huck 
Cc: Paolo Bonzini 
Acked-by: Cornelia Huck 
Signed-off-by: Eric Auger 
Signed-off-by: Liu Yi L 
---
 scripts/update-linux-headers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index fa6f2b6272..f588678837 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -142,7 +142,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h \
+for header in kvm.h vfio.h vfio_ccw.h vfio_zdev.h vhost.h iommu.h \
   psci.h psp-sev.h userfaultfd.h mman.h; do
 cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
-- 
2.25.1




[RFC v11 06/25] intel_iommu: add get_iommu_attr() callback

2021-03-02 Thread Liu Yi L
Return a vIOMMU attribute to the caller, e.g. for a VFIO call via the PCI layer.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index dd11248b6b..d89d6d7dd5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3462,6 +3462,28 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 return vtd_dev_as;
 }
 
+static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
+   IOMMUAttr attr, void *data)
+{
+int ret = 0;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+switch (attr) {
+case IOMMU_WANT_NESTING:
+{
+bool *pdata = data;
+
+/* return false until vSVA is ready */
+*pdata = false;
+break;
+}
+default:
+ret = -ENOENT;
+}
+return ret;
+}
+
 static uint64_t get_naturally_aligned_size(uint64_t start,
uint64_t size, int gaw)
 {
@@ -3758,6 +3780,7 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.get_iommu_attr = vtd_dev_get_iommu_attr,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
-- 
2.25.1




[RFC v11 05/25] hw/pci: introduce pci_device_get_iommu_attr()

2021-03-02 Thread Liu Yi L
This patch adds pci_device_get_iommu_attr() to get vIOMMU attributes,
e.g. whether a nesting IOMMU is wanted.
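
As a usage sketch (the actual caller sits on the VFIO side later in this
series; pdev and the error handling here are illustrative):

    bool want_nested = false;

    /* Ask the vIOMMU, if any, whether nested translation is wanted. */
    if (pci_device_get_iommu_attr(pdev, IOMMU_WANT_NESTING,
                                  &want_nested)) {
        /* Non-zero (e.g. -ENOENT): no vIOMMU or attribute unsupported. */
        want_nested = false;
    }
    /* want_nested is then passed down to vfio_get_group(). */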

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
 hw/pci/pci.c | 35 ++-
 include/hw/pci/pci.h |  7 +++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 61f115d742..19365e2799 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2665,7 +2665,8 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+  PCIBus **pbus, uint8_t *pdevfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
@@ -2716,14 +2717,38 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (iommu_bus && iommu_bus->iommu_ops &&
- iommu_bus->iommu_ops->get_address_space) {
-return iommu_bus->iommu_ops->get_address_space(bus,
- iommu_bus->iommu_opaque, devfn);
+*pbus = iommu_bus;
+*pdevfn = devfn;
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->get_address_space) {
+return bus->iommu_ops->get_address_space(bus,
+bus->iommu_opaque, devfn);
 }
 return &address_space_memory;
 }
 
+int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->get_iommu_attr) {
+return bus->iommu_ops->get_iommu_attr(bus, bus->iommu_opaque,
+   devfn, attr, data);
+}
+return -ENOENT;
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 bus->iommu_ops = ops;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index d6b962f646..b99e05c81e 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -485,13 +485,20 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range);
 
 void pci_device_deassert_intx(PCIDevice *dev);
 
+typedef enum IOMMUAttr {
+IOMMU_WANT_NESTING,
+} IOMMUAttr;
+
 typedef struct PCIIOMMUOps PCIIOMMUOps;
 struct PCIIOMMUOps {
 AddressSpace * (*get_address_space)(PCIBus *bus,
 void *opaque, int32_t devfn);
+int (*get_iommu_attr)(PCIBus *bus, void *opaque, int32_t devfn,
+   IOMMUAttr attr, void *data);
 };
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data);
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *iommu_ops, void *opaque);
 
 static inline void
-- 
2.25.1




[RFC v11 03/25] header file update VFIO/IOMMU vSVA APIs kernel 5.12-rc1

2021-03-02 Thread Liu Yi L
Update the kernel uapi/linux/iommu.h, vfio.h and ioasid.h.

This commit updates kernel headers from the below branch:
https://github.com/jacobpan/linux.git vsva-linux-5.12-rc1-v8

Note: this should be replaced with a full header file update when
the vSVA uAPI is stable.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
 linux-headers/linux/iommu.h | 413 
 linux-headers/linux/vfio.h  |  84 
 2 files changed, 497 insertions(+)
 create mode 100644 linux-headers/linux/iommu.h

diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h
new file mode 100644
index 00..8824029e53
--- /dev/null
+++ b/linux-headers/linux/iommu.h
@@ -0,0 +1,413 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * IOMMU user API definitions
+ */
+
+#ifndef _IOMMU_H
+#define _IOMMU_H
+
+#include <linux/types.h>
+
+#define IOMMU_FAULT_PERM_READ  (1 << 0) /* read */
+#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
+#define IOMMU_FAULT_PERM_EXEC  (1 << 2) /* exec */
+#define IOMMU_FAULT_PERM_PRIV  (1 << 3) /* privileged */
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+   IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */
+   IOMMU_FAULT_PAGE_REQ,   /* page request fault */
+};
+
+enum iommu_fault_reason {
+   IOMMU_FAULT_REASON_UNKNOWN = 0,
+
+   /* Could not access the PASID table (fetch caused external abort) */
+   IOMMU_FAULT_REASON_PASID_FETCH,
+
+   /* PASID entry is invalid or has configuration errors */
+   IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+
+   /*
+* PASID is out of range (e.g. exceeds the maximum PASID
+* supported by the IOMMU) or disabled.
+*/
+   IOMMU_FAULT_REASON_PASID_INVALID,
+
+   /*
+* An external abort occurred fetching (or updating) a translation
+* table descriptor
+*/
+   IOMMU_FAULT_REASON_WALK_EABT,
+
+   /*
+* Could not access the page table entry (Bad address),
+* actual translation fault
+*/
+   IOMMU_FAULT_REASON_PTE_FETCH,
+
+   /* Protection flag check failed */
+   IOMMU_FAULT_REASON_PERMISSION,
+
+   /* access flag check failed */
+   IOMMU_FAULT_REASON_ACCESS,
+
+   /* Output address of a translation stage caused Address Size fault */
+   IOMMU_FAULT_REASON_OOR_ADDRESS,
+};
+
+/**
+ * struct iommu_fault_unrecoverable - Unrecoverable fault data
+ * @reason: reason of the fault, from  iommu_fault_reason
+ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values)
+ * @pasid: Process Address Space ID
+ * @perm: requested permission access using by the incoming transaction
+ *(IOMMU_FAULT_PERM_* values)
+ * @addr: offending page address
+ * @fetch_addr: address that caused a fetch abort, if any
+ */
+struct iommu_fault_unrecoverable {
+   __u32   reason;
+#define IOMMU_FAULT_UNRECOV_PASID_VALID(1 << 0)
+#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1)
+#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID   (1 << 2)
+   __u32   flags;
+   __u32   pasid;
+   __u32   perm;
+   __u64   addr;
+   __u64   fetch_addr;
+};
+
+/**
+ * struct iommu_fault_page_request - Page Request data
+ * @flags: encodes whether the corresponding fields are valid and whether this
+ * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values).
+ * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response
+ * must have the same PASID value as the page request. When it is 
clear,
+ * the page response should not have a PASID.
+ * @pasid: Process Address Space ID
+ * @grpid: Page Request Group Index
+ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values)
+ * @addr: page address
+ * @private_data: device-specific private information
+ */
+struct iommu_fault_page_request {
+#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID   (1 << 0)
+#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1)
+#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2)
+#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID  (1 << 3)
+   __u32   flags;
+   __u32   pasid;
+   __u32   grpid;
+   __u32   perm;
+   __u64   addr;
+   __u64   private_data[2];
+};
+
+/**
+ * struct iommu_fault - Generic fault data
+ * @type: fault type from  iommu_fault_type
+ * @padding: reserved for future use (should be zero)
+ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV
+ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ
+ * @padding2: sets the fault size to allow for future extensions
+ */
+struct iommu_fault {
+   __u32   type;
+   __u32   padding;
+   union {
+   struct iommu_fault_unrecoverable event;
+   struct iommu_fault_page_request prm;
+   __u8 padding2[56];
+

[RFC v11 00/25] intel_iommu: expose Shared Virtual Addressing to VMs

2021-03-02 Thread Liu Yi L
Update per latest kernel UAPI definition.
  e) Add patch 0017 to check iommu nesting cap info in set_iommu().
  RFC v5: https://www.spinics.net/lists/kvm/msg211475.html

- RFC v4 -> RFC v5:
  a) Refactor the vfio HostIOMMUContext init code (patch 0008 - 0009 of 
v1 series)
  b) Refactor the pasid binding handling (patch 0011 - 0016 of v1 
series)
  RFC v4: https://patchwork.ozlabs.org/cover/1259648/

- RFC v3.1 -> RFC v4:
  a) Implement HostIOMMUContext in QOM manner.
  b) Add pci_set/unset_iommu_context() to register HostIOMMUContext to
 vIOMMU, thus the lifecircle of HostIOMMUContext is awared in vIOMMU
 side. In such way, vIOMMU could use the methods provided by the
 HostIOMMUContext safely.
  c) Add back patch "[RFC v3 01/25] hw/pci: modify pci_setup_iommu() to 
set PCIIOMMUOps"
  RFCv3.1: https://patchwork.kernel.org/cover/11397879/

- RFC v3 -> v3.1:
  a) Drop IOMMUContext, and rename DualStageIOMMUObject to 
HostIOMMUContext.
 HostIOMMUContext is per-vfio-container, it is exposed to  vIOMMU 
via PCI
 layer. VFIO registers a PCIHostIOMMUFunc callback to PCI layer, 
vIOMMU
 could get HostIOMMUContext instance via it.
  b) Check IOMMU uAPI version by VFIO_CHECK_EXTENSION
  c) Add a check on VFIO_PASID_REQ availability via VFIO_GET_IOMMU_IHNFO
  d) Reorder the series, put vSVA linux header file update in the 
beginning
 put the x-scalable-mode option mofification in the end of the 
series.
  e) Dropped patch "[RFC v3 01/25] hw/pci: modify pci_setup_iommu() to 
set PCIIOMMUOps"
  RFCv3: https://patchwork.kernel.org/cover/11356033/

- RFC v2 -> v3:
  a) Introduce DualStageIOMMUObject to abstract the host IOMMU 
programming
  capability. e.g. request PASID from host, setup IOMMU nesting 
translation
  on host IOMMU. The pasid_alloc/bind_guest_page_table/iommu_cache_flush
  operations are moved to be DualStageIOMMUOps. Thus, 
DualStageIOMMUObject
  is an abstract layer which provides QEMU vIOMMU emulators with an 
explicit
  method to program host IOMMU.
  b) Compared with RFC v2, the IOMMUContext has also been updated. It is
  modified to provide an abstract for vIOMMU emulators. It provides the
  method for pass-through modules (like VFIO) to communicate with host 
IOMMU.
  e.g. tell vIOMMU emulators about the IOMMU nesting capability on host 
side
  and report the host IOMMU DMA translation faults to vIOMMU emulators.
  RFC v2: https://www.spinics.net/lists/kvm/msg198556.html

- RFC v1 -> v2:
  Introduce IOMMUContext to abstract the connection between VFIO
  and vIOMMU emulators, which is a replacement of the PCIPASIDOps
  in RFC v1. Modify x-scalable-mode to be string option instead of
  adding a new option as RFC v1 did. Refined the pasid cache management
  and addressed the TODOs mentioned in RFC v1. 
  RFC v1: https://patchwork.kernel.org/cover/11033657/

---
Eric Auger (1):
  scripts/update-linux-headers: Import iommu.h

Liu Yi L (24):
  scripts/update-linux-headers: Import ioasid.h
  header file update VFIO/IOMMU vSVA APIs kernel 5.12-rc1
  hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps
  hw/pci: introduce pci_device_get_iommu_attr()
  intel_iommu: add get_iommu_attr() callback
  vfio: pass nesting requirement into vfio_get_group()
  vfio: check VFIO_TYPE1_NESTING_IOMMU support
  hw/iommu: introduce HostIOMMUContext
  hw/pci: introduce pci_device_set/unset_iommu_context()
  intel_iommu: add set/unset_iommu_context callback
  vfio: add HostIOMMUContext support
  vfio: init HostIOMMUContext per-container
  intel_iommu: sync IOMMU nesting cap info for assigned devices
  intel_iommu: add virtual command capability support
  intel_iommu: process PASID cache invalidation
  intel_iommu: add PASID cache management infrastructure
  intel_iommu: bind/unbind guest page table to host
  intel_iommu: replay pasid binds after context cache invalidation
  intel_iommu: do not pass down pasid bind for PASID #0
  vfio: add support for flush iommu stage-1 cache
  intel_iommu: process PASID-based iotlb invalidation
  intel_iommu: propagate PASID-based iotlb invalidation to host
  intel_iommu: process PASID-based Device-TLB invalidation
  intel_iommu: modify x-scalable-mode to be string option

 hw/Kconfig|3 +
 hw/alpha/typhoon.c|6 +-
 hw/arm/smmu-common.c  |6 +-
 hw/hppa/dino.c|6 +-
 hw/i386/amd_iommu.c   |6 +-
 hw/i386/intel_iommu.c | 1279 -
 hw/i386/intel_iommu_internal.h|  132 +++
 hw/i386/trace-events  |  

[RFC v10 25/25] intel_iommu: modify x-scalable-mode to be string option

2020-09-10 Thread Liu Yi L
Intel VT-d 3.0 introduces scalable mode, which has a number of capabilities
related to scalable-mode translation, so there are multiple possible
combinations. This vIOMMU implementation simplifies things for the user by
providing typical combinations, configurable via the "x-scalable-mode"
option. The usage is as below:

"-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"

 - "legacy": gives support for SL page table
 - "modern": gives support for FL page table, pasid, virtual command
 - "off": no scalable mode support
 -  if not configured, there is no scalable mode support; if improperly
configured, an error is thrown
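
For example, a guest wanting first-level page table, pasid and virtual
command support could be started with (an illustrative invocation; other
options elided, and caching-mode=on is an assumption here):

"-device intel-iommu,caching-mode=on,x-scalable-mode=modern"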

Note: this patch is supposed to be merged when the whole vSVA patch series
were merged.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
rfcv5 (v2) -> rfcv6:
*) reports want_nested to VFIO;
*) assert iommu_set/unset_iommu_context() if vIOMMU is not scalable modern.
---
 hw/i386/intel_iommu.c  | 39 +++
 hw/i386/intel_iommu_internal.h |  3 +++
 include/hw/i386/intel_iommu.h  |  2 ++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2010c33..9781a18 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4050,7 +4050,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
   VTD_HOST_ADDRESS_WIDTH),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
-DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, scalable_mode_str),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -4419,6 +4419,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data)
 {
+IntelIOMMUState *s = opaque;
 int ret = 0;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
@@ -4428,8 +4429,7 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 {
 bool *pdata = data;
 
-/* return false until vSVA is ready */
-*pdata = false;
+*pdata = s->scalable_modern ? true : false;
 break;
 }
 default:
@@ -4523,6 +4523,8 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void 
*opaque,
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable mode supports set_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4557,6 +4559,8 @@ static void vtd_dev_unset_iommu_context(PCIBus *bus, void 
*opaque, int devfn)
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable mode supports unset_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4784,8 +4788,13 @@ static void vtd_init(IntelIOMMUState *s)
 }
 
 /* TODO: read cap/ecap from host to decide which cap to be exposed. */
-if (s->scalable_mode) {
+if (s->scalable_mode && !s->scalable_modern) {
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+} else if (s->scalable_mode && s->scalable_modern) {
+s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID |
+   VTD_ECAP_FLTS | VTD_ECAP_PSS(VTD_PASID_SS) |
+   VTD_ECAP_VCS;
+s->vccap |= VTD_VCCAP_PAS;
 }
 
 if (!s->cap_finalized) {
@@ -4926,6 +4935,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return false;
 }
 
+if (s->scalable_mode_str &&
+(strcmp(s->scalable_mode_str, "off") &&
+ strcmp(s->scalable_mode_str, "modern") &&
+ strcmp(s->scalable_mode_str, "legacy"))) {
+error_setg(errp, "Invalid x-scalable-mode config,"
+ "Please use \"modern\", \"legacy\" or \"off\"");
+return false;
+}
+
+if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "legacy")) {
+s->scalable_mode = true;
+s->scalable_modern = false;
+} else if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "modern")) {
+s->scalable_mode = true;
+s->scalable_mod

[RFC v10 16/25] vfio: add bind stage-1 page table support

2020-09-10 Thread Liu Yi L
This patch adds the bind_stage1_pgtbl() definition in HostIOMMUContextClass,
and also adds the corresponding implementation in VFIO. This exposes a way
for the vIOMMU to set up dual-stage DMA translation for passthru devices
on hardware.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/iommu/host_iommu_context.c | 57 +-
 hw/vfio/common.c  | 58 ++-
 include/hw/iommu/host_iommu_context.h | 19 +++-
 3 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
index 5fb2223..c43965c 100644
--- a/hw/iommu/host_iommu_context.c
+++ b/hw/iommu/host_iommu_context.c
@@ -69,23 +69,78 @@ int host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx, 
uint32_t pasid)
 return hicxc->pasid_free(iommu_ctx, pasid);
 }
 
+int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *bind)
+{
+HostIOMMUContextClass *hicxc;
+
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->bind_stage1_pgtbl) {
+return -EINVAL;
+}
+
+return hicxc->bind_stage1_pgtbl(iommu_ctx, bind);
+}
+
+int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *unbind)
+{
+HostIOMMUContextClass *hicxc;
+
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->unbind_stage1_pgtbl) {
+return -EINVAL;
+}
+
+return hicxc->unbind_stage1_pgtbl(iommu_ctx, unbind);
+}
+
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
- uint64_t flags)
+ uint64_t flags,
+ struct iommu_nesting_info *info)
 {
 HostIOMMUContext *iommu_ctx;
 
 object_initialize(_iommu_ctx, instance_size, mrtypename);
 iommu_ctx = HOST_IOMMU_CONTEXT(_iommu_ctx);
 iommu_ctx->flags = flags;
+iommu_ctx->info = g_malloc0(info->argsz);
+memcpy(iommu_ctx->info, info, info->argsz);
 iommu_ctx->initialized = true;
 }
 
+static void host_iommu_ctx_finalize_fn(Object *obj)
+{
+HostIOMMUContext *iommu_ctx = HOST_IOMMU_CONTEXT(obj);
+
+g_free(iommu_ctx->info);
+}
+
 static const TypeInfo host_iommu_context_info = {
 .parent = TYPE_OBJECT,
 .name   = TYPE_HOST_IOMMU_CONTEXT,
 .class_size = sizeof(HostIOMMUContextClass),
 .instance_size  = sizeof(HostIOMMUContext),
+.instance_finalize  = host_iommu_ctx_finalize_fn,
 .abstract   = true,
 };
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index f41deeb..74dbeaf 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1227,6 +1227,54 @@ static int 
vfio_host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *bind)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*bind);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_BIND_PGTBL;
+memcpy(&op->data, bind, sizeof(*bind));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: pasid (%llu) bind failed: %m",
+  __func__, bind->hpasid);
+}
+g_free(op);
+return ret;
+}
+
+static int vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *unbind)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*unbind);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL;
+memcpy(&op->data, unbind, sizeof(*unbind));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: pasid (%llu) unbind failed: %m",
+ 

[RFC v10 13/25] intel_iommu: add virtual command capability support

2020-09-10 Thread Liu Yi L
This patch adds virtual command support to the Intel vIOMMU per the
Intel VT-d 3.1 spec, and adds two virtual commands: allocate pasid
and free pasid.
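
From the guest's point of view, the protocol emulated below is roughly as
follows (a hedged sketch: vtd_writeq()/vtd_readq() and the status/result
extraction helpers are hypothetical guest-side code; the register and
command names follow this patch):

    uint64_t resp;
    uint32_t pasid;

    /* Request a PASID allocation via the virtual command register. */
    vtd_writeq(DMAR_VCMD_REG, VTD_VCMD_ALLOC_PASID);
    do {
        resp = vtd_readq(DMAR_VCRSP_REG);
    } while (resp & 1);               /* wait for IP (In Progress) to clear */
    if (vcrsp_status(resp) == 0) {    /* status code 0 means success */
        pasid = vcrsp_result(resp);   /* allocated PASID in result field */
    }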

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
 hw/i386/intel_iommu.c  | 154 -
 hw/i386/intel_iommu_internal.h |  37 ++
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  10 ++-
 4 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index bf496f7..f6353c7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2656,6 +2656,129 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
 }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t *pasid)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+int ret = -1;
+
+vtd_iommu_lock(s);
+QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) {
+HostIOMMUContext *iommu_ctx = vtd_dev_icx->iommu_ctx;
+
+/*
+ * We'll return the first valid result we got. It's
+ * a bit hackish in that we don't have a good global
+ * interface yet to talk to modules like vfio to deliver
+ * this allocation request, so we're leveraging this
+ * per-device iommu context to do the same thing just
+ * to make sure the allocation happens only once.
+ */
+ret = host_iommu_ctx_pasid_alloc(iommu_ctx, VTD_HPASID_MIN,
+ VTD_HPASID_MAX, pasid);
+if (!ret) {
+break;
+}
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+int ret = -1;
+
+vtd_iommu_lock(s);
+QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) {
+HostIOMMUContext *iommu_ctx = vtd_dev_icx->iommu_ctx;
+
+/*
+ * Similar with pasid allocation. We'll free the pasid
+ * on the first successful free operation. It's a bit
+ * hackish in that we don't have a good global interface
+ * yet to talk to modules like vfio to deliver this pasid
+ * free request, so we're leveraging this per-device iommu
+ * context to do the same thing just to make sure the free
+ * happens only once.
+ */
+ret = host_iommu_ctx_pasid_free(iommu_ctx, pasid);
+if (!ret) {
+break;
+}
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+/*
+ * If IP is not set, set it then return.
+ * If IP is already set, return.
+ */
+static void vtd_vcmd_set_ip(IntelIOMMUState *s)
+{
+s->vcrsp = 1;
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
+{
+s->vcrsp &= (~((uint64_t)(0x1)));
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+/* Handle write to Virtual Command Register */
+static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
+{
+uint32_t pasid;
+int ret = -1;
+
+trace_vtd_reg_write_vcmd(s->vcrsp, val);
+
+if (!(s->vccap & VTD_VCCAP_PAS) ||
+ (s->vcrsp & 1)) {
+return -1;
+}
+
+/*
+ * Since the vCPU is blocked when the guest VCMD
+ * write is trapped here, no other vCPU should be
+ * accessing VCMD if the guest software is well written.
+ * However, we still emulate the IP bit here in case of
+ * bad guest software, and to align with the spec.
+ */
+vtd_vcmd_set_ip(s);
+
+switch (val & VTD_VCMD_CMD_MASK) {
+case VTD_VCMD_ALLOC_PASID:
+ret = vtd_request_pasid_alloc(s, &pasid);
+if (ret) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+} else {
+s->vcrsp |= VTD_VCRSP_RSLT(pasid);
+}
+break;
+
+case VTD_VCMD_FREE_PASID:
+pasid = VTD_VCMD_PASID_VALUE(val);
+ret = vtd_request_pasid_free(s, pasid);
+if (ret < 0) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+}
+break;
+
+default:
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+error_report_once("Virtual Command: unsupported command!!!");
+break;
+}
+vtd_vcmd_clear_ip(s);
+return 0;
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
 IntelIOMMUState *s = opaque;
@@ -2944,6 +3067,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
 vtd_set_long(s, addr, val);
 break;
 
+case DMAR_VCMD_REG:
+if (!vtd_handle_vcmd_write(s, val)) {
+if (size == 4) {
+vtd_set_long(s, addr, val);
+} else {
+  

[RFC v10 12/25] vfio: init HostIOMMUContext per-container

2020-09-10 Thread Liu Yi L
In this patch, QEMU first gets the iommu info from the kernel to check
the capabilities supported by a VFIO_IOMMU_TYPE1_NESTING iommu, and then
inits a HostIOMMUContext instance.

For vfio-pci devices, pci_device_set/unset_iommu_context() could be used
to expose the host iommu context to vIOMMU emulators. vIOMMU emulators
could then make use of the methods provided by the host iommu context,
e.g. propagating requests to the host iommu.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/common.c | 113 +++
 hw/vfio/pci.c|  17 +
 2 files changed, 130 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 41aaf41..f41deeb 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1227,10 +1227,102 @@ static int 
vfio_host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+/**
+ * Get iommu info from the host. The caller of this function should
+ * free the memory pointed to by the returned pointer stored in @info
+ * after a successful call, once finished with it.
+ */
+static int vfio_get_iommu_info(VFIOContainer *container,
+ struct vfio_iommu_type1_info **info)
+{
+
+size_t argsz = sizeof(struct vfio_iommu_type1_info);
+
+*info = g_malloc0(argsz);
+
+retry:
+(*info)->argsz = argsz;
+
+if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
+g_free(*info);
+*info = NULL;
+return -errno;
+}
+
+if (((*info)->argsz > argsz)) {
+argsz = (*info)->argsz;
+*info = g_realloc(*info, argsz);
+goto retry;
+}
+
+return 0;
+}
+
+static struct vfio_info_cap_header *
+vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
+{
+struct vfio_info_cap_header *hdr;
+void *ptr = info;
+
+if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
+return NULL;
+}
+
+for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
+if (hdr->id == id) {
+return hdr;
+}
+}
+
+return NULL;
+}
+
+static int vfio_get_nesting_iommu_cap(VFIOContainer *container,
+   struct vfio_iommu_type1_info_cap_nesting **cap_nesting)
+{
+struct vfio_iommu_type1_info *info;
+struct vfio_info_cap_header *hdr;
+struct vfio_iommu_type1_info_cap_nesting *cap;
+struct iommu_nesting_info *nest_info;
+int ret;
+uint32_t minsz, cap_size;
+
+ret = vfio_get_iommu_info(container, &info);
+if (ret) {
+return ret;
+}
+
+hdr = vfio_get_iommu_info_cap(info,
+VFIO_IOMMU_TYPE1_INFO_CAP_NESTING);
+if (!hdr) {
+g_free(info);
+return -EINVAL;
+}
+
+cap = container_of(hdr,
+struct vfio_iommu_type1_info_cap_nesting, header);
+
+nest_info = &cap->info;
+minsz = offsetof(struct iommu_nesting_info, vendor);
+if (nest_info->argsz < minsz) {
+g_free(info);
+return -EINVAL;
+}
+
+cap_size = offsetof(struct vfio_iommu_type1_info_cap_nesting, info) +
+   nest_info->argsz;
+*cap_nesting = g_malloc0(cap_size);
+memcpy(*cap_nesting, cap, cap_size);
+
+g_free(info);
+return 0;
+}
+
 static int vfio_init_container(VFIOContainer *container, int group_fd,
bool want_nested, Error **errp)
 {
 int iommu_type, ret;
+uint64_t flags = 0;
 
 iommu_type = vfio_get_iommu_type(container, want_nested, errp);
 if (iommu_type < 0) {
@@ -1258,6 +1350,27 @@ static int vfio_init_container(VFIOContainer *container, 
int group_fd,
 return -errno;
 }
 
+if (iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+struct vfio_iommu_type1_info_cap_nesting *nesting = NULL;
+struct iommu_nesting_info *nest_info;
+
+ret = vfio_get_nesting_iommu_cap(container, &nesting);
+if (ret) {
+error_setg_errno(errp, -ret,
+ "Failed to get nesting iommu cap");
+return ret;
+}
+
+nest_info = (struct iommu_nesting_info *) &nesting->info;
+flags |= (nest_info->features & IOMMU_NESTING_FEAT_SYSWIDE_PASID) ?
+ HOST_IOMMU_PASID_REQUEST : 0;
+host_iommu_ctx_init(&container->iommu_ctx,
+sizeof(container->iommu_ctx),
+TYPE_VFIO_HOST_IOMMU_CONTEXT,
+flags);
+g_free(nesting);
+}
+
 container->iommu_type = iommu_type;
 return 0;
 }
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index d33fb89..3907c4f 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2704,6 +2704,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 VFIOPCIDevice *vdev = PCI_VFIO(pdev);
 VFIODevice *vbasedev_iter;
 VFIOGroup *group;
+VFIOContainer *container;
 char *tmp, *subsys, group_path[PATH_MAX], *group_name;
 

[RFC v10 23/25] intel_iommu: propagate PASID-based iotlb invalidation to host

2020-09-10 Thread Liu Yi L
This patch propagates PASID-based iotlb invalidation to host.

Intel VT-d 3.0 supports nested translation at PASID granularity.
Guest SVA support could be implemented by configuring nested
translation on a specific PASID. This is also known as dual-stage
DMA translation.

Under such a configuration, the guest owns the GVA->GPA translation,
which is configured as the first-level page table on the host side
for a specific pasid, and the host owns the GPA->HPA translation. As
the guest owns the first-level translation table, piotlb invalidation
should be propagated to the host, since the host IOMMU will cache
first-level page-table related mappings during DMA address
translation.

This patch traps the guest PASID-based iotlb flush and propagates
it to the host.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) removed the valid check to vtd_pasid_as instance as rfcv5 ensures
   all vtd_pasid_as instances in hash table should be valid.
---
 hw/i386/intel_iommu.c  | 113 +
 hw/i386/intel_iommu_internal.h |   7 +++
 2 files changed, 120 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 516d7ff..32b0029 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3038,16 +3038,129 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return true;
 }
 
+/**
+ * Caller of this function should hold iommu_lock.
+ */
+static void vtd_invalidate_piotlb(IntelIOMMUState *s,
+  VTDBus *vtd_bus,
+  int devfn,
+  struct iommu_cache_invalidate_info *cache)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+goto out;
+}
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+goto out;
+}
+if (host_iommu_ctx_flush_stage1_cache(iommu_ctx, cache)) {
+error_report("Cache flush failed");
+}
+out:
+return;
+}
+
+/**
+ * This function iterates over the s->vtd_pasid_as hash table with
+ * VTDPIOTLBInvInfo as an execution filter, and propagates the piotlb
+ * invalidation to the host. The caller of this function should hold
+ * iommu_lock.
+ */
+static void vtd_flush_pasid_iotlb(gpointer key, gpointer value,
+  gpointer user_data)
+{
+VTDPIOTLBInvInfo *piotlb_info = user_data;
+VTDPASIDAddressSpace *vtd_pasid_as = value;
+VTDPASIDCacheEntry *pc_entry = &vtd_pasid_as->pasid_cache_entry;
+uint16_t did;
+
+did = vtd_pe_get_domain_id(&pc_entry->pasid_entry);
+
+if ((piotlb_info->domain_id == did) &&
+(piotlb_info->pasid == vtd_pasid_as->pasid)) {
+vtd_invalidate_piotlb(vtd_pasid_as->iommu_state,
+  vtd_pasid_as->vtd_bus,
+  vtd_pasid_as->devfn,
+  piotlb_info->cache_info);
+}
+
+/*
+ * TODO: needs to add QEMU piotlb flush when QEMU piotlb
+ * infrastructure is ready. For now, it is enough for passthru
+ * devices.
+ */
+}
+
 static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
 uint16_t domain_id,
 uint32_t pasid)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = IOMMU_CACHE_INV_TYPE_IOTLB;
+cache_info->granularity = IOMMU_INV_GRANU_PASID;
+cache_info->granu.pasid_info.pasid = pasid;
+cache_info->granu.pasid_info.flags = IOMMU_INV_PASID_FLAGS_PASID;
+
+piotlb_info.domain_id = domain_id;
+piotlb_info.pasid = pasid;
+piotlb_info.cache_info = cache_info;
+
+vtd_iommu_lock(s);
+/*
+ * Loop over all the vtd_pasid_as instances in s->vtd_pasid_as to
+ * find the affected devices, since piotlb invalidation should check
+ * the pasid cache from an architectural point of view.
+ */
+g_hash_table_foreach(s->vtd_pasid_as,
+ vtd_flush_pasid_iotlb, &piotlb_info);
+vtd_iommu_unlock(s);
+g_free(cache_info);
 }
 
 static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
uint32_t pasid, hwaddr addr, uint8_t am,
bool ih)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = 

[RFC v10 18/25] intel_iommu: bind/unbind guest page table to host

2020-09-10 Thread Liu Yi L
This patch captures guest PASID table entry modifications and
propagates the changes to the host to set up dual-stage DMA translation.
The guest page table is configured as the 1st-level page table (GVA->GPA),
whose translation result further goes through the host VT-d 2nd-level
page table (GPA->HPA) under nested translation mode. This is the
key part of vSVA support, and also key to supporting IOVA over the
1st-level page table for Intel VT-d in a virtualization environment.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 101 +++--
 hw/i386/intel_iommu_internal.h |  18 
 2 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index af17b36..4f6b80f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -41,6 +41,7 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 #include "qemu/jhash.h"
+#include <linux/iommu.h>
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -700,6 +701,16 @@ static inline uint32_t 
vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
 return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
 }
 
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
+}
+
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -1861,6 +1872,85 @@ static void 
vtd_context_global_invalidate(IntelIOMMUState *s)
 vtd_iommu_replay_all(s);
 }
 
+/**
+ * Caller should hold iommu_lock.
+ */
+static int vtd_bind_guest_pasid(IntelIOMMUState *s, VTDBus *vtd_bus,
+int devfn, int pasid, VTDPASIDEntry *pe,
+VTDPASIDOp op)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+int ret = -1;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+/* means no need to go further, e.g. for emulated devices */
+return 0;
+}
+
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+switch (op) {
+case VTD_PASID_BIND:
+{
+struct iommu_gpasid_bind_data *g_bind_data;
+
+g_bind_data = g_malloc0(sizeof(*g_bind_data));
+
+g_bind_data->argsz = sizeof(*g_bind_data);
+g_bind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_bind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_bind_data->gpgd = vtd_pe_get_flpt_base(pe);
+g_bind_data->addr_width = vtd_pe_get_fl_aw(pe);
+g_bind_data->hpasid = pasid;
+g_bind_data->gpasid = pasid;
+g_bind_data->flags |= IOMMU_SVA_GPASID_VAL;
+g_bind_data->vendor.vtd.flags =
+ (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_SRE : 0)
+   | (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_EAFE : 0)
+   | (VTD_SM_PASID_ENTRY_PCD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PCD : 0)
+   | (VTD_SM_PASID_ENTRY_PWT_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PWT : 0)
+   | (VTD_SM_PASID_ENTRY_EMTE_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_EMTE : 0)
+   | (VTD_SM_PASID_ENTRY_CD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_CD : 0);
+g_bind_data->vendor.vtd.pat = VTD_SM_PASID_ENTRY_PAT(pe->val[1]);
+g_bind_data->vendor.vtd.emt = VTD_SM_PASID_ENTRY_EMT(pe->val[1]);
+ret = host_iommu_ctx_bind_stage1_pgtbl(iommu_ctx, g_bind_data);
+g_free(g_bind_data);
+break;
+}
+case VTD_PASID_UNBIND:
+{
+struct iommu_gpasid_bind_data *g_unbind_data;
+
+g_unbind_data = g_malloc0(sizeof(*g_unbind_data));
+
+g_unbind_data->argsz = sizeof(*g_unbind_data);
+g_unbind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_unbind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_unbind_data->hpasid = pasid;
+ret = host_iommu_ctx_unbind_stage1_pgtbl(iommu_ctx, g_unbind_data);
+g_free(g_unbind_data);
+break;
+}
+default:
+error_report_once("Unknown VTDPASIDOp!!!\n");
+break;
+}
+
+
+return ret;
+}
+
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field aft

[RFC v10 24/25] intel_iommu: process PASID-based Device-TLB invalidation

2020-09-10 Thread Liu Yi L
This patch adds empty handling for PASID-based Device-TLB
invalidation. For now this is enough, as it is not necessary to
propagate it to the host for passthru devices, and no emulated
device has a device TLB yet.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 18 ++
 hw/i386/intel_iommu_internal.h |  1 +
 2 files changed, 19 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 32b0029..2010c33 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3213,6 +3213,17 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
 return true;
 }
 
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+/*
+ * No need to handle it for passthru devices; for emulated
+ * devices with a device TLB it may be required, but for now
+ * returning is enough.
+ */
+return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
@@ -3334,6 +3345,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
+case VTD_INV_DESC_DEV_PIOTLB:
+trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+if (!vtd_process_device_piotlb_desc(s, &inv_desc)) {
+return false;
+}
+break;
+
 case VTD_INV_DESC_DEVICE:
 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
 if (!vtd_process_device_iotlb_desc(s, _desc)) {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 08ff58e..9b4fc67 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -405,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_WAIT   0x5 /* Invalidation Wait Descriptor */
 #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
 #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
 #define VTD_INV_DESC_NONE   0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
-- 
2.7.4




[RFC v10 09/25] hw/pci: introduce pci_device_set/unset_iommu_context()

2020-09-10 Thread Liu Yi L
On platforms capable of nesting IOMMU translation, vIOMMUs running on
such systems could be implemented upon physical IOMMU nested paging
(VFIO case). A vIOMMU advertises such an implementation through the
"want_nested" attribute to PCIe devices (e.g. VFIO PCI). Once
"want_nested" is satisfied, the device (VFIO case) should set a
HostIOMMUContext to the vIOMMU, so the vIOMMU can manage stage-1
translation. DMAs out of such devices are then protected through the
stage-1 page tables owned by the guest together with the stage-2 page
tables owned by the host.

This patch adds pci_device_set/unset_iommu_context() to set/unset the
HostIOMMUContext for a given PCIe device (VFIO case). The caller of the
set operation should fail if it returns an error.
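
On the VFIO side, the set call could then look roughly like the following
at realize time (a sketch assuming the container's HostIOMMUContext has
already been initialized; the exact hook point in vfio_realize() may
differ):

    VFIOContainer *container = vdev->vbasedev.group->container;

    /*
     * Expose the host IOMMU context to the vIOMMU; fail realize if the
     * vIOMMU rejects it (e.g. incompatible nesting caps).
     */
    if (container->iommu_ctx.initialized &&
        pci_device_set_iommu_context(pdev, &container->iommu_ctx)) {
        error_setg(errp, "Failed to set HostIOMMUContext for device");
        return;
    }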

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv5 (v2) -> rfcv6:
*) pci_device_set_iommu_context() returns 0 if callback is not implemented.
---
 hw/pci/pci.c | 28 
 include/hw/pci/pci.h | 10 ++
 2 files changed, 38 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 1886f8e..e1b2f05 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2743,6 +2743,34 @@ int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr 
attr, void *data)
 return -ENOENT;
 }
 
+int pci_device_set_iommu_context(PCIDevice *dev,
+ HostIOMMUContext *iommu_ctx)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->set_iommu_context) {
+return bus->iommu_ops->set_iommu_context(bus,
+  bus->iommu_opaque, devfn, iommu_ctx);
+}
+return 0;
+}
+
+void pci_device_unset_iommu_context(PCIDevice *dev)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->unset_iommu_context) {
+bus->iommu_ops->unset_iommu_context(bus,
+ bus->iommu_opaque, devfn);
+}
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 bus->iommu_ops = ops;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 18b51dd..9348560 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -9,6 +9,8 @@
 
 #include "hw/pci/pcie.h"
 
+#include "hw/iommu/host_iommu_context.h"
+
 extern bool pci_available;
 
 /* PCI bus */
@@ -497,10 +499,18 @@ struct PCIIOMMUOps {
 void *opaque, int32_t devfn);
 int (*get_iommu_attr)(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data);
+int (*set_iommu_context)(PCIBus *bus, void *opaque,
+ int32_t devfn,
+ HostIOMMUContext *iommu_ctx);
+void (*unset_iommu_context)(PCIBus *bus, void *opaque,
+int32_t devfn);
 };
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
 int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data);
+int pci_device_set_iommu_context(PCIDevice *dev,
+ HostIOMMUContext *iommu_ctx);
+void pci_device_unset_iommu_context(PCIDevice *dev);
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *iommu_ops, void *opaque);
 
 static inline void
-- 
2.7.4




[RFC v10 11/25] vfio/common: provide PASID alloc/free hooks

2020-09-10 Thread Liu Yi L
This patch defines vfio_host_iommu_context_info and implements the PASID
alloc/free hooks defined in HostIOMMUContextClass.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/common.c  | 66 +++
 include/hw/iommu/host_iommu_context.h |  3 ++
 include/hw/vfio/vfio-common.h |  4 +++
 3 files changed, 73 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index af91eca..41aaf41 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1183,6 +1183,50 @@ static int vfio_get_iommu_type(VFIOContainer *container,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_pasid_alloc(HostIOMMUContext *iommu_ctx,
+   uint32_t min, uint32_t max,
+   uint32_t *pasid)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_pasid_request req;
+int ret = 0;
+
+req.argsz = sizeof(req);
+req.flags = VFIO_IOMMU_FLAG_ALLOC_PASID;
+req.range.min = min;
+req.range.max = max;
+
+ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
+if (ret < 0) {
+error_report("%s: alloc failed (%m)", __func__);
+return ret;
+}
+*pasid = ret;
+return 0;
+}
+
+static int vfio_host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx,
+  uint32_t pasid)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_pasid_request req;
+
+int ret = 0;
+
+req.argsz = sizeof(req);
+req.flags = VFIO_IOMMU_FLAG_FREE_PASID;
+req.range.min = pasid;
+req.range.max = pasid + 1;
+
+ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
+if (ret) {
+error_report("%s: free failed (%m)", __func__);
+}
+return ret;
+}
+
 static int vfio_init_container(VFIOContainer *container, int group_fd,
bool want_nested, Error **errp)
 {
@@ -1802,3 +1846,25 @@ int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
 }
 return vfio_eeh_container_op(container, op);
 }
+
+static void vfio_host_iommu_context_class_init(ObjectClass *klass,
+   void *data)
+{
+HostIOMMUContextClass *hicxc = HOST_IOMMU_CONTEXT_CLASS(klass);
+
+hicxc->pasid_alloc = vfio_host_iommu_ctx_pasid_alloc;
+hicxc->pasid_free = vfio_host_iommu_ctx_pasid_free;
+}
+
+static const TypeInfo vfio_host_iommu_context_info = {
+.parent = TYPE_HOST_IOMMU_CONTEXT,
+.name = TYPE_VFIO_HOST_IOMMU_CONTEXT,
+.class_init = vfio_host_iommu_context_class_init,
+};
+
+static void vfio_register_types(void)
+{
+type_register_static(&vfio_host_iommu_context_info);
+}
+
+type_init(vfio_register_types)
diff --git a/include/hw/iommu/host_iommu_context.h 
b/include/hw/iommu/host_iommu_context.h
index 35c4861..227c433 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -33,6 +33,9 @@
 #define TYPE_HOST_IOMMU_CONTEXT "qemu:host-iommu-context"
 #define HOST_IOMMU_CONTEXT(obj) \
 OBJECT_CHECK(HostIOMMUContext, (obj), TYPE_HOST_IOMMU_CONTEXT)
+#define HOST_IOMMU_CONTEXT_CLASS(klass) \
+OBJECT_CLASS_CHECK(HostIOMMUContextClass, (klass), \
+ TYPE_HOST_IOMMU_CONTEXT)
 #define HOST_IOMMU_CONTEXT_GET_CLASS(obj) \
 OBJECT_GET_CLASS(HostIOMMUContextClass, (obj), \
  TYPE_HOST_IOMMU_CONTEXT)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bdb09f4..a5eaf35 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -26,12 +26,15 @@
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
+#include "hw/iommu/host_iommu_context.h"
 #ifdef CONFIG_LINUX
#include <linux/vfio.h>
 #endif
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
+#define TYPE_VFIO_HOST_IOMMU_CONTEXT "qemu:vfio-host-iommu-context"
+
 enum {
 VFIO_DEVICE_TYPE_PCI = 0,
 VFIO_DEVICE_TYPE_PLATFORM = 1,
@@ -71,6 +74,7 @@ typedef struct VFIOContainer {
 MemoryListener listener;
 MemoryListener prereg_listener;
 unsigned iommu_type;
+HostIOMMUContext iommu_ctx;
 Error *error;
 bool initialized;
 unsigned long pgsizes;
-- 
2.7.4




[RFC v10 14/25] intel_iommu: process PASID cache invalidation

2020-09-10 Thread Liu Yi L
This patch adds PASID cache invalidation handling. When the guest
enables PASID usage (e.g. SVA), guest software should issue a proper
PASID cache invalidation when caching-mode is exposed. This patch only
adds draft handling of the pasid cache invalidation. Detailed handling
will be added in subsequent patches.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) remove vtd_pasid_cache_gsi(), vtd_pasid_cache_psi()
   and vtd_pasid_cache_dsi()
---
 hw/i386/intel_iommu.c  | 40 +++-
 hw/i386/intel_iommu_internal.h | 12 
 hw/i386/trace-events   |  3 +++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index f6353c7..b110c7f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2395,6 +2395,37 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static bool vtd_process_pasid_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+if ((inv_desc->val[0] & VTD_INV_DESC_PASIDC_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PASIDC_RSVD_VAL1) ||
+(inv_desc->val[2] & VTD_INV_DESC_PASIDC_RSVD_VAL2) ||
+(inv_desc->val[3] & VTD_INV_DESC_PASIDC_RSVD_VAL3)) {
+error_report_once("non-zero-field-in-pc_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+switch (inv_desc->val[0] & VTD_INV_DESC_PASIDC_G) {
+case VTD_INV_DESC_PASIDC_DSI:
+break;
+
+case VTD_INV_DESC_PASIDC_PASID_SI:
+break;
+
+case VTD_INV_DESC_PASIDC_GLOBAL:
+break;
+
+default:
+error_report_once("invalid-inv-granu-in-pc_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -2501,12 +2532,11 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
-/*
- * TODO: the entity of below two cases will be implemented in future 
series.
- * To make guest (which integrates scalable mode support patch set in
- * iommu driver) work, just return true is enough so far.
- */
 case VTD_INV_DESC_PC:
+trace_vtd_inv_desc("pasid-cache", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_pasid_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_PIOTLB:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 64ac0a8..22d0bc5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -445,6 +445,18 @@ typedef union VTDInvDesc VTDInvDesc;
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+#define VTD_INV_DESC_PASIDC_G  (3ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID(val) (((val) >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PASIDC_DID(val)   (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PASIDC_RSVD_VAL0  0xfff000000000ffc0ULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL1  0xffffffffffffffffULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL2  0xffffffffffffffffULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL3  0xffffffffffffffffULL
+
+#define VTD_INV_DESC_PASIDC_DSI(0ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
+#define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 71536a7..f7cd4e5 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -22,6 +22,9 @@ vtd_inv_qi_head(uint16_t head) "read head %d"
 vtd_inv_qi_tail(uint16_t head) "write tail %d"
 vtd_inv_qi_fetch(void) ""
 vtd_context_cache_reset(void) ""
+vtd_pasid_cache_gsi(void) ""
+vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 0x%"PRIx16
+vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32
 vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
 vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" 
devfn %"PRIu8" not present"
 vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t 
domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" 
domain 0x%"PRIx16
-- 
2.7.4




[RFC v10 21/25] vfio: add support for flush iommu stage-1 cache

2020-09-10 Thread Liu Yi L
This patch adds the flush_stage1_cache() definition in HostIOMMUContextClass,
and adds the corresponding implementation in VFIO. This exposes a way for
the vIOMMU to flush the stage-1 cache on the host side, since the guest owns
the stage-1 translation structures in a dual-stage DMA translation
configuration.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Acked-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/iommu/host_iommu_context.c | 19 +++
 hw/vfio/common.c  | 24 
 include/hw/iommu/host_iommu_context.h |  8 
 3 files changed, 51 insertions(+)

diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
index c43965c..a3f7706 100644
--- a/hw/iommu/host_iommu_context.c
+++ b/hw/iommu/host_iommu_context.c
@@ -113,6 +113,25 @@ int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext 
*iommu_ctx,
 return hicxc->unbind_stage1_pgtbl(iommu_ctx, unbind);
 }
 
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+ struct iommu_cache_invalidate_info *cache)
+{
+HostIOMMUContextClass *hicxc;
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->flush_stage1_cache) {
+return -EINVAL;
+}
+
+return hicxc->flush_stage1_cache(iommu_ctx, cache);
+}
+
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
  uint64_t flags,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 74dbeaf..77f88e5 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1275,6 +1275,29 @@ static int 
vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+struct iommu_cache_invalidate_info *cache)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*cache);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_CACHE_INVLD;
+memcpy(&op->data, cache, sizeof(*cache));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: iommu cache flush failed: %m", __func__);
+}
+g_free(op);
+return ret;
+}
+
 /**
 * Get iommu info from host. Caller of this function should free
 * the memory pointed to by the returned pointer stored in @info
@@ -2023,6 +2046,7 @@ static void 
vfio_host_iommu_context_class_init(ObjectClass *klass,
 hicxc->pasid_free = vfio_host_iommu_ctx_pasid_free;
 hicxc->bind_stage1_pgtbl = vfio_host_iommu_ctx_bind_stage1_pgtbl;
 hicxc->unbind_stage1_pgtbl = vfio_host_iommu_ctx_unbind_stage1_pgtbl;
+hicxc->flush_stage1_cache = vfio_host_iommu_ctx_flush_stage1_cache;
 }
 
 static const TypeInfo vfio_host_iommu_context_info = {
diff --git a/include/hw/iommu/host_iommu_context.h 
b/include/hw/iommu/host_iommu_context.h
index 2883ed8..40e860a 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -64,6 +64,12 @@ typedef struct HostIOMMUContextClass {
 /* Undo a previous bind. @unbind specifies the unbind info. */
 int (*unbind_stage1_pgtbl)(HostIOMMUContext *iommu_ctx,
struct iommu_gpasid_bind_data *unbind);
+/*
+ * Propagate stage-1 cache flush to host IOMMU, cache
+ * info specified in @cache
+ */
+int (*flush_stage1_cache)(HostIOMMUContext *iommu_ctx,
+  struct iommu_cache_invalidate_info *cache);
 } HostIOMMUContextClass;
 
 /*
@@ -85,6 +91,8 @@ int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext 
*iommu_ctx,
  struct iommu_gpasid_bind_data *bind);
 int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
  struct iommu_gpasid_bind_data *unbind);
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+   struct iommu_cache_invalidate_info *cache);
 
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
-- 
2.7.4




[RFC v10 17/25] intel_iommu: sync IOMMU nesting cap info for assigned devices

2020-09-10 Thread Liu Yi L
For assigned devices, an Intel vIOMMU that wants to build DMA protection
based on physical IOMMU nested paging should check the IOMMU nesting
support on the host side. The host returns IOMMU nesting cap info to
user-space (e.g. VFIO returns IOMMU nesting cap info for nesting type
IOMMUs). The vIOMMU needs to check:
a) IOMMU model
b) 1st-level page table support
c) address width
d) pasid support

This patch syncs the IOMMU nesting cap info when a PCIe device (VFIO
case) sets a HostIOMMUContext to the vIOMMU. If the host IOMMU nesting
support is not compatible, the vIOMMU returns failure to the PCIe
device.
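
The heart of the compatibility check is a single bit-twiddling idiom:
(exposed ^ host) & MASK & exposed is non-zero exactly when some masked
capability bit that the vIOMMU relies on is missing on the host. A
standalone demonstration (the mask value is made up; the real code uses
VTD_CAP_MASK/VTD_ECAP_MASK):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CAP_MASK 0xffULL   /* illustrative only, not the real VTD_CAP_MASK */

/* Compatible iff no masked bit set in 'exposed' is missing in 'host'. */
static bool caps_compatible(uint64_t exposed, uint64_t host)
{
    return !((exposed ^ host) & CAP_MASK & exposed);
}

int main(void)
{
    printf("%d\n", caps_compatible(0x3, 0x3));  /* 1: identical        */
    printf("%d\n", caps_compatible(0x1, 0x3));  /* 1: host is superset */
    printf("%d\n", caps_compatible(0x3, 0x1));  /* 0: host lacks bit 1 */
    return 0;
}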

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 105 +
 hw/i386/intel_iommu_internal.h |  18 +++
 include/hw/i386/intel_iommu.h  |   4 ++
 3 files changed, 127 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1fce772..af17b36 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4103,6 +4103,82 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 return ret;
 }
 
+
+static bool vtd_check_nesting_info(IntelIOMMUState *s,
+   struct iommu_nesting_info *info,
+   struct iommu_nesting_info_vtd *vtd)
+{
+return !((s->aw_bits != info->addr_width) ||
+ ((s->host_cap ^ vtd->cap_reg) & VTD_CAP_MASK & s->host_cap) ||
+ ((s->host_ecap ^ vtd->ecap_reg) & VTD_ECAP_MASK & s->host_ecap) ||
+ (VTD_GET_PSS(s->host_ecap) != (info->pasid_bits - 1)));
+}
+
+/* Caller should hold iommu lock. */
+static bool vtd_sync_nesting_info(IntelIOMMUState *s,
+  struct iommu_nesting_info *info)
+{
+struct iommu_nesting_info_vtd *vtd;
+uint64_t cap, ecap;
+
+vtd = (struct iommu_nesting_info_vtd *) &info->vendor.vtd;
+
+if (s->cap_finalized) {
+return vtd_check_nesting_info(s, info, vtd);
+}
+
+if (s->aw_bits > info->addr_width) {
+error_report("User aw-bits: %u > host address width: %u",
+  s->aw_bits, info->addr_width);
+return false;
+}
+
+cap = s->host_cap & vtd->cap_reg & VTD_CAP_MASK;
+s->host_cap &= ~VTD_CAP_MASK;
+s->host_cap |= cap;
+
+ecap = s->host_ecap & vtd->ecap_reg & VTD_ECAP_MASK;
+s->host_ecap &= ~VTD_ECAP_MASK;
+s->host_ecap |= ecap;
+
+if ((VTD_ECAP_PASID & s->host_ecap) && info->pasid_bits &&
+(VTD_GET_PSS(s->host_ecap) > (info->pasid_bits - 1))) {
+s->host_ecap &= ~VTD_ECAP_PSS_MASK;
+s->host_ecap |= VTD_ECAP_PSS(info->pasid_bits - 1);
+}
+return true;
+}
+
+/*
+ * virtual VT-d which wants nested needs to check the host IOMMU
+ * nesting cap info behind the assigned devices. Thus that vIOMMU
+ * could bind guest page table to host.
+ */
+static bool vtd_check_iommu_ctx(IntelIOMMUState *s,
+HostIOMMUContext *iommu_ctx)
+{
+struct iommu_nesting_info *info = iommu_ctx->info;
+uint32_t minsz, size;
+
+if (IOMMU_PASID_FORMAT_INTEL_VTD != info->format) {
+error_report("Format is not compatible for nesting!!!");
+return false;
+}
+
+size = sizeof(struct iommu_nesting_info_vtd);
+minsz = endof(struct iommu_nesting_info, flags);
+if (size > (info->argsz - minsz)) {
+/*
+ * QEMU may be using a newer linux-headers/iommu.h than the
+ * kernel supports, hence fail it.
+ */
+error_report("IOMMU nesting cap is not compatible!!!");
+return false;
+}
+
+return vtd_sync_nesting_info(s, info);
+}
+
 static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
  int devfn,
  HostIOMMUContext *iommu_ctx)
@@ -4117,6 +4193,11 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void 
*opaque,
 
 vtd_iommu_lock(s);
 
+if (!vtd_check_iommu_ctx(s, iommu_ctx)) {
+vtd_iommu_unlock(s);
+return -ENOENT;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 
 assert(!vtd_dev_icx);
@@ -4372,6 +4453,14 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
 }
 
+if (!s->cap_finalized) {
+s->host_cap = s->cap;
+s->host_ecap = s->ecap;
+} else {
+s->cap = s->host_cap;
+s->ecap = s->host_ecap;
+}
+
 vtd_reset_caches(s);
 
 /* Define registers with default values and bit semantics */
@@ -4505,6 +4594,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return true;
 }
 
+static v

[RFC v10 03/25] hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps

2020-09-10 Thread Liu Yi L
This patch modifies pci_setup_iommu() to set PCIIOMMUOps
instead of setting PCIIOMMUFunc. PCIIOMMUFunc is used to
get an address space for a PCI device in a vendor-specific
way. PCIIOMMUOps still offers this functionality, but
using PCIIOMMUOps leaves room to add more vendor-specific
IOMMU operations.
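
A toy model of the change, to show why the ops table helps (standard C
only; names loosely mirror the patch):

#include <stdio.h>

typedef struct AddressSpace AddressSpace;   /* opaque in this sketch */

/* After this patch: an extensible ops table instead of one callback. */
typedef struct PCIIOMMUOps {
    AddressSpace *(*get_address_space)(void *bus, void *opaque, int devfn);
    /* later patches add get_iommu_attr, set/unset_iommu_context, ... */
} PCIIOMMUOps;

static AddressSpace *dummy_get_as(void *bus, void *opaque, int devfn)
{
    printf("resolving address space for devfn %d\n", devfn);
    return NULL;
}

static const PCIIOMMUOps dummy_iommu_ops = {
    .get_address_space = dummy_get_as,
};

int main(void)
{
    /* The real code registers this via pci_setup_iommu(bus, &ops, opaque). */
    dummy_iommu_ops.get_address_space(NULL, NULL, 3);
    return 0;
}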

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Reviewed-by: David Gibson 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv9 -> rfcv10:
*) Fix a bug in pci_device_iommu_address_space()
 +   iommu_bus->iommu_ops->get_address_space) &&
 =>
 +   !iommu_bus->iommu_ops->get_address_space) &&
---
 hw/alpha/typhoon.c   |  6 +-
 hw/arm/smmu-common.c |  6 +-
 hw/hppa/dino.c   |  6 +-
 hw/i386/amd_iommu.c  |  6 +-
 hw/i386/intel_iommu.c|  6 +-
 hw/pci-host/designware.c |  6 +-
 hw/pci-host/pnv_phb3.c   |  6 +-
 hw/pci-host/pnv_phb4.c   |  6 +-
 hw/pci-host/ppce500.c|  6 +-
 hw/pci-host/prep.c   |  6 +-
 hw/pci-host/sabre.c  |  6 +-
 hw/pci/pci.c | 18 +-
 hw/ppc/ppc440_pcix.c |  6 +-
 hw/ppc/spapr_pci.c   |  6 +-
 hw/s390x/s390-pci-bus.c  |  8 ++--
 hw/virtio/virtio-iommu.c |  6 +-
 include/hw/pci/pci.h |  8 ++--
 include/hw/pci/pci_bus.h |  2 +-
 18 files changed, 96 insertions(+), 24 deletions(-)

diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c
index 29d44df..c4ac693 100644
--- a/hw/alpha/typhoon.c
+++ b/hw/alpha/typhoon.c
@@ -740,6 +740,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, 
void *opaque, int devfn)
return &s->pchip.iommu_as;
 }
 
+static const PCIIOMMUOps typhoon_iommu_ops = {
+.get_address_space = typhoon_pci_dma_iommu,
+};
+
 static void typhoon_set_irq(void *opaque, int irq, int level)
 {
 TyphoonState *s = opaque;
@@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, ISABus **isa_bus, 
qemu_irq *p_rtc_irq,
  "iommu-typhoon", UINT64_MAX);
 address_space_init(>pchip.iommu_as, MEMORY_REGION(>pchip.iommu),
"pchip0-pci");
-pci_setup_iommu(b, typhoon_pci_dma_iommu, s);
+pci_setup_iommu(b, &typhoon_iommu_ops, s);
 
 /* Pchip0 PCI special/interrupt acknowledge, 0x801.F800., 64MB.  */
 memory_region_init_io(>pchip.reg_iack, OBJECT(s), _pci_iack_ops,
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index 3838db1..542d816 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -444,6 +444,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void 
*opaque, int devfn)
return &sdev->as;
 }
 
+static const PCIIOMMUOps smmu_ops = {
+.get_address_space = smmu_find_add_as,
+};
+
 IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid)
 {
 uint8_t bus_n, devfn;
@@ -513,7 +517,7 @@ static void smmu_base_realize(DeviceState *dev, Error 
**errp)
 s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
 
 if (s->primary_bus) {
-pci_setup_iommu(s->primary_bus, smmu_find_add_as, s);
+pci_setup_iommu(s->primary_bus, &smmu_ops, s);
 } else {
 error_setg(errp, "SMMU is not attached to any PCI bus!");
 }
diff --git a/hw/hppa/dino.c b/hw/hppa/dino.c
index 7f0c622..ca2dea4 100644
--- a/hw/hppa/dino.c
+++ b/hw/hppa/dino.c
@@ -459,6 +459,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, 
void *opaque,
return &s->bm_as;
 }
 
+static const PCIIOMMUOps dino_iommu_ops = {
+.get_address_space = dino_pcihost_set_iommu,
+};
+
 /*
  * Dino interrupts are connected as shown on Page 78, Table 23
  * (Little-endian bit numbers)
@@ -580,7 +584,7 @@ PCIBus *dino_init(MemoryRegion *addr_space,
memory_region_add_subregion(&s->bm, 0xfff0,
&s->bm_cpu_alias);
 address_space_init(>bm_as, >bm, "pci-bm");
-pci_setup_iommu(b, dino_pcihost_set_iommu, s);
+pci_setup_iommu(b, &dino_iommu_ops, s);
 
 *p_rtc_irq = qemu_allocate_irq(dino_set_timer_irq, s, 0);
 *p_ser_irq = qemu_allocate_irq(dino_set_serial_irq, s, 0);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 74a93a5..3676a20 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1452,6 +1452,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, 
void *opaque, int devfn)
return &iommu_as[devfn]->as;
 }
 
+static const PCIIOMMUOps amdvi_iommu_ops = {
+.get_address_space = amdvi_host_dma_iommu,
+};
+
 static const MemoryRegionOps mmio_mem_ops = {
 .read = amdvi_mmio_read,
 .write = amdvi_mmio_write,
@@ -1579,7 +1583,7 @@ static void amdvi_realize(DeviceState *dev, Error **errp)
 
 sysbus_init_mmio(SYS_BUS_DEVICE(s), >mmio);
 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
-pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+pci_setup_iommu(bus, &amdvi_iommu

[RFC v10 10/25] intel_iommu: add set/unset_iommu_context callback

2020-09-10 Thread Liu Yi L
This patch adds the set/unset_iommu_context() implementation in the
Intel vIOMMU. PCIe devices (VFIO case) set a HostIOMMUContext to the
vIOMMU as an ack of the vIOMMU's "want_nested" attribute. Thus the
vIOMMU can build DMA protection based on nested paging of the host
IOMMU.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 71 ---
 include/hw/i386/intel_iommu.h | 21 ++---
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 333f172..bf496f7 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3358,23 +3358,33 @@ static const MemoryRegionOps vtd_mem_ir_ops = {
 },
 };
 
-VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+/**
+ * Fetch the VTDBus instance for a given PCIBus. If there is no
+ * existing instance, allocate one.
+ */
+static VTDBus *vtd_find_add_bus(IntelIOMMUState *s, PCIBus *bus)
 {
 uintptr_t key = (uintptr_t)bus;
VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
-VTDAddressSpace *vtd_dev_as;
-char name[128];
 
 if (!vtd_bus) {
 uintptr_t *new_key = g_malloc(sizeof(*new_key));
 *new_key = (uintptr_t)bus;
 /* No corresponding free() */
-vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
-PCI_DEVFN_MAX);
+vtd_bus = g_malloc0(sizeof(VTDBus));
 vtd_bus->bus = bus;
 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
 }
+return vtd_bus;
+}
 
+VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+{
+VTDBus *vtd_bus;
+VTDAddressSpace *vtd_dev_as;
+char name[128];
+
+vtd_bus = vtd_find_add_bus(s, bus);
 vtd_dev_as = vtd_bus->dev_as[devfn];
 
 if (!vtd_dev_as) {
@@ -3462,6 +3472,55 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 return ret;
 }
 
+static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
+ int devfn,
+ HostIOMMUContext *iommu_ctx)
+{
+IntelIOMMUState *s = opaque;
+VTDBus *vtd_bus;
+VTDHostIOMMUContext *vtd_dev_icx;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+vtd_bus = vtd_find_add_bus(s, bus);
+
+vtd_iommu_lock(s);
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+
+assert(!vtd_dev_icx);
+
+vtd_bus->dev_icx[devfn] = vtd_dev_icx =
+g_malloc0(sizeof(VTDHostIOMMUContext));
+vtd_dev_icx->vtd_bus = vtd_bus;
+vtd_dev_icx->devfn = (uint8_t)devfn;
+vtd_dev_icx->iommu_state = s;
+vtd_dev_icx->iommu_ctx = iommu_ctx;
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_context(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDBus *vtd_bus;
+VTDHostIOMMUContext *vtd_dev_icx;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+vtd_bus = vtd_find_add_bus(s, bus);
+
+vtd_iommu_lock(s);
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+g_free(vtd_dev_icx);
+vtd_bus->dev_icx[devfn] = NULL;
+
+vtd_iommu_unlock(s);
+}
+
 static uint64_t get_naturally_aligned_size(uint64_t start,
uint64_t size, int gaw)
 {
@@ -3758,6 +3817,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
 .get_iommu_attr = vtd_dev_get_iommu_attr,
+.set_iommu_context = vtd_dev_set_iommu_context,
+.unset_iommu_context = vtd_dev_unset_iommu_context,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 3870052..b5fefb9 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -64,6 +64,7 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDPASIDDirEntry VTDPASIDDirEntry;
 typedef struct VTDPASIDEntry VTDPASIDEntry;
+typedef struct VTDHostIOMMUContext VTDHostIOMMUContext;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -112,10 +113,20 @@ struct VTDAddressSpace {
 IOVATree *iova_tree;  /* Traces mapped IOVA ranges */
 };
 
+struct VTDHostIOMMUContext {
+VTDBus *vtd_bus;
+uint8_t devfn;
+HostIOMMUContext *iommu_ctx;
+IntelIOMMUState *iommu_state;
+};
+
 struct VTDBus {
-PCIBus* bus;   /* A reference to the bus to provide translation for */
+/* A reference to the bus to provide translation for */
+PCIBus *bus;
 /* A table of VTDAddressSpace objects indexed by devfn */
-VTDAddressSpace *dev_as[];
+VTDAddressSpace *dev_as[PCI_D

[RFC v10 01/25] scripts/update-linux-headers: Import iommu.h

2020-09-10 Thread Liu Yi L
From: Eric Auger 

Update the script to import the new iommu.h uapi header.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Cc: Cornelia Huck 
Cc: Paolo Bonzini 
Acked-by: Cornelia Huck 
Signed-off-by: Eric Auger 
---
 scripts/update-linux-headers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 29c27f4..5b64ee3 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -141,7 +141,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h vfio.h vfio_ccw.h vhost.h \
+for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \
   psci.h psp-sev.h userfaultfd.h mman.h; do
 cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
-- 
2.7.4




[RFC v10 15/25] intel_iommu: add PASID cache management infrastructure

2020-09-10 Thread Liu Yi L
This patch adds a PASID cache management infrastructure based on
the newly added structure VTDPASIDAddressSpace, which is used to
track PASID usage and to support future PASID tagged DMA address
translation in the vIOMMU.

struct VTDPASIDAddressSpace {
VTDBus *vtd_bus;
uint8_t devfn;
AddressSpace as;
uint32_t pasid;
IntelIOMMUState *iommu_state;
VTDContextCacheEntry context_cache_entry;
QLIST_ENTRY(VTDPASIDAddressSpace) next;
VTDPASIDCacheEntry pasid_cache_entry;
};

Ideally, a VTDPASIDAddressSpace instance is created when a PASID
is bound to a DMA AddressSpace. The Intel VT-d spec requires guest
software to issue a pasid cache invalidation when binding or
unbinding a pasid to/from an address space under caching-mode.
However, as VTDPASIDAddressSpace instances also act as the pasid
cache in this implementation, their creation also happens during
vIOMMU PASID tagged DMA translation. The creation in that path is
not added in this patch, since there are no PASID-capable emulated
devices for now.

The implementation in this patch manages VTDPASIDAddressSpace
instances per PASID+BDF (lookup and insert use PASID and BDF),
since the Intel VT-d spec allows a per-BDF PASID table. When a
guest binds a PASID to an AddressSpace, QEMU captures the guest
pasid selective pasid cache invalidation, and allocates or removes
a VTDPASIDAddressSpace instance according to the invalidation
reason:

*) a present pasid entry moved to non-present
*) a present pasid entry to be a present entry
*) a non-present pasid entry moved to present

The vIOMMU emulator figures out the reason by fetching the latest
guest pasid entry.
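
In sketch form, the per-entry decision reduces to comparing the cached
presence state against the freshly fetched guest PASID entry (a
standalone model; the action names are invented for illustration):

#include <stdbool.h>
#include <stdio.h>

enum pasid_action { PASID_NOP, PASID_BIND, PASID_UNBIND, PASID_UPDATE };

/* was_present: cached state; now_present: freshly fetched guest entry. */
static enum pasid_action decide(bool was_present, bool now_present)
{
    if (was_present && !now_present) {
        return PASID_UNBIND;            /* present -> non-present */
    }
    if (was_present && now_present) {
        return PASID_UPDATE;            /* present -> (new) present */
    }
    if (now_present) {
        return PASID_BIND;              /* non-present -> present */
    }
    return PASID_NOP;
}

int main(void)
{
    printf("%d %d %d\n", decide(true, false), decide(false, true),
           decide(true, true));         /* prints: 2 1 3 */
    return 0;
}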

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) merged this patch with former replay binding patch, makes
   PSI/DSI/GSI use the unified function to do cache invalidation
   and pasid binding replay.
*) dropped pasid_cache_gen in both iommu_state and vtd_pasid_as
   as it is not necessary so far; we may want it one day when an
   emulated SVA-capable device is introduced.
---
 hw/i386/intel_iommu.c  | 464 +
 hw/i386/intel_iommu_internal.h |  21 ++
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  24 +++
 4 files changed, 510 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index b110c7f..1fce772 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -40,6 +40,7 @@
 #include "kvm_i386.h"
 #include "migration/vmstate.h"
 #include "trace.h"
+#include "qemu/jhash.h"
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -65,6 +66,8 @@
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
+static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+
 static void vtd_panic_require_caching_mode(void)
 {
 error_report("We need to set caching-mode=on for intel-iommu to enable "
@@ -276,6 +279,7 @@ static void vtd_reset_caches(IntelIOMMUState *s)
 vtd_iommu_lock(s);
 vtd_reset_iotlb_locked(s);
 vtd_reset_context_cache_locked(s);
+vtd_pasid_cache_reset(s);
 vtd_iommu_unlock(s);
 }
 
@@ -686,6 +690,16 @@ static inline bool vtd_pe_type_check(X86IOMMUState 
*x86_iommu,
 return true;
 }
 
+static inline uint16_t vtd_pe_get_domain_id(VTDPASIDEntry *pe)
+{
+return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
+}
+
+static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
+{
+return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -2395,9 +2409,443 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static inline void vtd_init_pasid_key(uint32_t pasid,
+ uint16_t sid,
+ struct pasid_key *key)
+{
+key->pasid = pasid;
+key->sid = sid;
+}
+
+static guint vtd_pasid_as_key_hash(gconstpointer v)
+{
+struct pasid_key *key = (struct pasid_key *)v;
+uint32_t a, b, c;
+
+/* Jenkins hash */
+a = b = c = JHASH_INITVAL + sizeof(*key);
+a += key->sid;
+b += extract32(key->pasid, 0, 16);
+c += extract32(key->pasid, 16, 16);
+
+__jhash_mix(a, b, c);
+__jhash_final(a, b, c);
+
+return c;
+}
+
+static gboolean vtd_pasid_as_key_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct pasid_key *k1 = v1;
+const struct pasid_key *k2 = v2;
+
+return (k1->pasid == k2->pasid) && (k1->sid == k2->sid);
+}
+
+static inline int vtd_dev_get_pe_from_pasid(IntelIOMMUState *s,
+uint8_t bus_num,
+   

[RFC v10 06/25] vfio: pass nesting requirement into vfio_get_group()

2020-09-10 Thread Liu Yi L
This patch passes the nesting requirement into vfio_get_group() to
indicate whether VFIO_TYPE1_NESTING_IOMMU is required.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/ap.c  | 2 +-
 hw/vfio/ccw.c | 2 +-
 hw/vfio/common.c  | 3 ++-
 hw/vfio/pci.c | 9 -
 hw/vfio/platform.c| 2 +-
 include/hw/vfio/vfio-common.h | 3 ++-
 6 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index cec6fe1..b5f3159 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -83,7 +83,7 @@ static VFIOGroup *vfio_ap_get_group(VFIOAPDevice *vapdev, 
Error **errp)
 
 g_free(group_path);
 
-return vfio_get_group(groupid, &address_space_memory, errp);
+return vfio_get_group(groupid, &address_space_memory, false, errp);
 }
 
 static void vfio_ap_realize(DeviceState *dev, Error **errp)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index ff7f369..30d00a7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -621,7 +621,7 @@ static VFIOGroup *vfio_ccw_get_group(S390CCWDevice *cdev, 
Error **errp)
 return NULL;
 }
 
-return vfio_get_group(groupid, &address_space_memory, errp);
+return vfio_get_group(groupid, &address_space_memory, false, errp);
 }
 
 static void vfio_ccw_realize(DeviceState *dev, Error **errp)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 3335714..80d7a00 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1457,7 +1457,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
 }
 }
 
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
+  bool want_nested, Error **errp)
 {
 VFIOGroup *group;
 char path[32];
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 3611dcd..d33fb89 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2711,6 +2711,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 int groupid;
 int i, ret;
 bool is_mdev;
+bool want_nested;
 
 if (!vdev->vbasedev.sysfsdev) {
 if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -2768,7 +2769,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 trace_vfio_realize(vdev->vbasedev.name, groupid);
 
-group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), errp);
+if (pci_device_get_iommu_attr(pdev,
+ IOMMU_WANT_NESTING, &want_nested)) {
+want_nested = false;
+}
+
+group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev),
+   want_nested, errp);
 if (!group) {
 goto error;
 }
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 869ed2c..5eac6a2 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -580,7 +580,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
 
 trace_vfio_platform_base_device_init(vbasedev->name, groupid);
 
-group = vfio_get_group(groupid, &address_space_memory, errp);
+group = vfio_get_group(groupid, &address_space_memory, false, errp);
 if (!group) {
 return -ENOENT;
 }
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c78f3ff..bdb09f4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -174,7 +174,8 @@ void vfio_region_mmaps_set_enabled(VFIORegion *region, bool 
enabled);
 void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
+  bool want_nested, Error **errp);
 void vfio_put_group(VFIOGroup *group);
 int vfio_get_device(VFIOGroup *group, const char *name,
 VFIODevice *vbasedev, Error **errp);
-- 
2.7.4




[RFC v10 08/25] hw/iommu: introduce HostIOMMUContext

2020-09-10 Thread Liu Yi L
Currently, many platform vendors provide the capability of dual stage
DMA address translation in hardware. For example, nested translation
on Intel VT-d scalable mode, nested stage translation on ARM SMMUv3,
etc. In dual stage DMA address translation, there are two stages of
address translation: stage-1 (a.k.a first-level) and stage-2 (a.k.a
second-level) translation structures. Stage-1 translation results are
also subjected to the stage-2 translation structures. Take vSVA
(Virtual Shared Virtual Addressing) as an example: the guest IOMMU
driver owns the stage-1 translation structures (covering GVA->GPA
translation), and the host IOMMU driver owns the stage-2 translation
structures (covering GPA->HPA translation). The VMM is responsible
for binding the stage-1 translation structures to the host, so that
hardware can perform GVA->GPA and then GPA->HPA translation. For more
background on SVA, refer to the links below.
 - https://www.youtube.com/watch?v=Kq_nfGK5MwQ
 - https://events19.lfasiallc.com/wp-content/uploads/2017/11/Shared-Virtual-Memory-in-KVM_Yi-Liu.pdf

In QEMU, vIOMMU emulators expose IOMMUs to the VM per their own spec
(e.g. the Intel VT-d spec). Devices are passed through to the guest
via device pass-through components like VFIO. VFIO is a userspace
driver framework which exposes host IOMMU programming capability to
userspace in a secure manner, e.g. IOVA MAP/UNMAP requests.
Information different from map/unmap notifications needs to be passed
from the QEMU vIOMMU device to/from the host IOMMU driver through the
VFIO/IOMMU layer:
 1) PASID allocation (allow host to intercept PASID allocation)
 2) bind stage-1 translation structures to host
 3) propagate stage-1 cache invalidation to host
 4) DMA address translation fault (I/O page fault) servicing etc.

With the above new interactions in QEMU, an abstraction layer is
required to facilitate these operations and to give vIOMMU emulators
an explicit way to call into VFIO. This patch introduces
HostIOMMUContext to serve that purpose. HostIOMMUContext is an object
which manages the stage-1 translation when a vIOMMU is implemented
upon physical IOMMU nested paging (the VFIO case). It is an abstract
object which needs to be derived for each vIOMMU implementation based
on physical nested paging. A derived HostIOMMUContext object is passed
to each VFIO device protected by a vIOMMU using physical nested paging.

This patch also introduces HostIOMMUContextClass to provide methods
for vIOMMU emulators to propagate dual-stage translation related
requests to the host. As a beginning, PASID allocation/free are
defined to propagate PASID allocation/free requests to the host,
which is required for vendors that manage PASIDs system-wide.
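
For intuition on the dual stage model described above, here is a toy
two-stage lookup where the stage-1 result feeds stage-2 (flat arrays
stand in for real page tables; this is not VT-d code):

#include <stdio.h>

#define PAGES 4

/* Stage-1 (guest-owned): GVA page -> GPA page. */
static const int s1[PAGES] = { 2, 0, 3, 1 };
/* Stage-2 (host-owned): GPA page -> HPA page. */
static const int s2[PAGES] = { 1, 3, 0, 2 };

/* Nested walk: the stage-1 output is the stage-2 input. */
static int translate(int gva_pg)
{
    return s2[s1[gva_pg]];
}

int main(void)
{
    for (int p = 0; p < PAGES; p++) {
        printf("GVA pg %d -> GPA pg %d -> HPA pg %d\n",
               p, s1[p], translate(p));
    }
    return 0;
}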

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
rfcv9 -> rfcv10:
*) adjust to meson build
---
 hw/Kconfig|  3 ++
 hw/iommu/Kconfig  |  4 ++
 hw/iommu/host_iommu_context.c | 97 +++
 hw/iommu/meson.build  |  6 +++
 hw/meson.build|  1 +
 include/hw/iommu/host_iommu_context.h | 75 +++
 6 files changed, 186 insertions(+)
 create mode 100644 hw/iommu/Kconfig
 create mode 100644 hw/iommu/host_iommu_context.c
 create mode 100644 hw/iommu/meson.build
 create mode 100644 include/hw/iommu/host_iommu_context.h

diff --git a/hw/Kconfig b/hw/Kconfig
index 4de1797..ac74bed 100644
--- a/hw/Kconfig
+++ b/hw/Kconfig
@@ -65,6 +65,9 @@ source tricore/Kconfig
 source unicore32/Kconfig
 source xtensa/Kconfig
 
+# iommu Kconfig
+source iommu/Kconfig
+
 # Symbols used by multiple targets
 config TEST_DEVICES
 bool
diff --git a/hw/iommu/Kconfig b/hw/iommu/Kconfig
new file mode 100644
index 000..039b9a4
--- /dev/null
+++ b/hw/iommu/Kconfig
@@ -0,0 +1,4 @@
+config IOMMU
+bool
+default y
+depends on LINUX
diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
new file mode 100644
index 000..5fb2223
--- /dev/null
+++ b/hw/iommu/host_iommu_context.c
@@ -0,0 +1,97 @@
+/*
+ * QEMU abstract of Host IOMMU
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Authors: Liu Yi L 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "q

[RFC v10 00/25] intel_iommu: expose Shared Virtual Addressing to VMs

2020-09-10 Thread Liu Yi L
 "[RFC v3 01/25] hw/pci: modify pci_setup_iommu() to 
set PCIIOMMUOps"
  RFCv3.1: https://patchwork.kernel.org/cover/11397879/

- RFC v3 -> v3.1:
  a) Drop IOMMUContext, and rename DualStageIOMMUObject to HostIOMMUContext.
     HostIOMMUContext is per-vfio-container; it is exposed to vIOMMU via the
     PCI layer. VFIO registers a PCIHostIOMMUFunc callback to the PCI layer,
     and vIOMMU can get the HostIOMMUContext instance via it.
  b) Check IOMMU uAPI version by VFIO_CHECK_EXTENSION
  c) Add a check on VFIO_PASID_REQ availability via VFIO_GET_IOMMU_INFO
  d) Reorder the series: put the vSVA linux header file update at the
     beginning and the x-scalable-mode option modification at the end of
     the series.
  e) Dropped patch "[RFC v3 01/25] hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps"
  RFCv3: https://patchwork.kernel.org/cover/11356033/

- RFC v2 -> v3:
  a) Introduce DualStageIOMMUObject to abstract the host IOMMU 
programming
  capability. e.g. request PASID from host, setup IOMMU nesting 
translation
  on host IOMMU. The pasid_alloc/bind_guest_page_table/iommu_cache_flush
  operations are moved to be DualStageIOMMUOps. Thus, 
DualStageIOMMUObject
  is an abstract layer which provides QEMU vIOMMU emulators with an 
explicit
  method to program host IOMMU.
  b) Compared with RFC v2, the IOMMUContext has also been updated. It is
  modified to provide an abstract for vIOMMU emulators. It provides the
  method for pass-through modules (like VFIO) to communicate with host 
IOMMU.
  e.g. tell vIOMMU emulators about the IOMMU nesting capability on host 
side
  and report the host IOMMU DMA translation faults to vIOMMU emulators.
  RFC v2: https://www.spinics.net/lists/kvm/msg198556.html

- RFC v1 -> v2:
  Introduce IOMMUContext to abstract the connection between VFIO
  and vIOMMU emulators, which is a replacement of the PCIPASIDOps
  in RFC v1. Modify x-scalable-mode to be string option instead of
  adding a new option as RFC v1 did. Refined the pasid cache management

---
Eric Auger (1):
  scripts/update-linux-headers: Import iommu.h

Liu Yi L (24):
  header file update VFIO/IOMMU vSVA APIs kernel 5.9-rc2
  hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps
  hw/pci: introduce pci_device_get_iommu_attr()
  intel_iommu: add get_iommu_attr() callback
  vfio: pass nesting requirement into vfio_get_group()
  vfio: check VFIO_TYPE1_NESTING_IOMMU support
  hw/iommu: introduce HostIOMMUContext
  hw/pci: introduce pci_device_set/unset_iommu_context()
  intel_iommu: add set/unset_iommu_context callback
  vfio/common: provide PASID alloc/free hooks
  vfio: init HostIOMMUContext per-container
  intel_iommu: add virtual command capability support
  intel_iommu: process PASID cache invalidation
  intel_iommu: add PASID cache management infrastructure
  vfio: add bind stage-1 page table support
  intel_iommu: sync IOMMU nesting cap info for assigned devices
  intel_iommu: bind/unbind guest page table to host
  intel_iommu: replay pasid binds after context cache invalidation
  intel_iommu: do not pass down pasid bind for PASID #0
  vfio: add support for flush iommu stage-1 cache
  intel_iommu: process PASID-based iotlb invalidation
  intel_iommu: propagate PASID-based iotlb invalidation to host
  intel_iommu: process PASID-based Device-TLB invalidation
  intel_iommu: modify x-scalable-mode to be string option

 hw/Kconfig|3 +
 hw/alpha/typhoon.c|6 +-
 hw/arm/smmu-common.c  |6 +-
 hw/hppa/dino.c|6 +-
 hw/i386/amd_iommu.c   |6 +-
 hw/i386/intel_iommu.c | 1231 -
 hw/i386/intel_iommu_internal.h|  131 
 hw/i386/trace-events  |6 +
 hw/iommu/Kconfig  |4 +
 hw/iommu/host_iommu_context.c |  171 +
 hw/iommu/meson.build  |6 +
 hw/meson.build|1 +
 hw/pci-host/designware.c  |6 +-
 hw/pci-host/pnv_phb3.c|6 +-
 hw/pci-host/pnv_phb4.c|6 +-
 hw/pci-host/ppce500.c |6 +-
 hw/pci-host/prep.c|6 +-
 hw/pci-host/sabre.c   |6 +-
 hw/pci/pci.c  |   73 +-
 hw/ppc/ppc440_pcix.c  |6 +-
 hw/ppc/spapr_pci.c|6 +-
 hw/s390x/s390-pci-bus.c   |8 +-
 hw/vfio/ap.c  |2 +-
 hw/vfio/ccw.c |2 +-
 hw/vfio/common.c  |  299 +++-
 hw/vfio/pci.c |   26 +-
 hw/vfio/platform.c|2 +-
 hw/v

[RFC v10 07/25] vfio: check VFIO_TYPE1_NESTING_IOMMU support

2020-09-10 Thread Liu Yi L
VFIO needs to check VFIO_TYPE1_NESTING_IOMMU support with the kernel
before using it, e.g. it requires checking for IOMMU uAPI support.

Referred patch from Eric Auger: https://patchwork.kernel.org/patch/11040499/
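
For reference, the probe itself is a plain ioctl on the VFIO container;
a minimal standalone check (assumes a Linux host with /dev/vfio/vfio
present; error handling kept minimal):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

int main(void)
{
    int fd = open("/dev/vfio/vfio", O_RDWR);

    if (fd < 0) {
        perror("open /dev/vfio/vfio");
        return 1;
    }
    /* VFIO_CHECK_EXTENSION returns > 0 if the IOMMU type is supported. */
    if (ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU) > 0) {
        printf("TYPE1v2 IOMMU supported by this container\n");
    }
    close(fd);
    return 0;
}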

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
Signed-off-by: Eric Auger 
Signed-off-by: Yi Sun 
---
 hw/vfio/common.c | 37 ++---
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 80d7a00..af91eca 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1151,30 +1151,44 @@ static void vfio_put_address_space(VFIOAddressSpace 
*space)
 }
 
 /*
- * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
+ * vfio_get_iommu_type - selects the richest iommu_type (NESTING first)
  */
 static int vfio_get_iommu_type(VFIOContainer *container,
+   bool want_nested,
Error **errp)
 {
-int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
+int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU,
+  VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
   VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
-int i;
+int i, ret = -EINVAL;
 
 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
-return iommu_types[i];
+if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU) {
+if (!want_nested) {
+continue;
+}
+}
+ret = iommu_types[i];
+break;
 }
 }
-error_setg(errp, "No available IOMMU models");
-return -EINVAL;
+
+if (ret < 0) {
+error_setg(errp, "No available IOMMU models");
+} else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) {
+error_setg(errp, "Nested mode requested but not supported");
+ret = -EINVAL;
+}
+return ret;
 }
 
 static int vfio_init_container(VFIOContainer *container, int group_fd,
-   Error **errp)
+   bool want_nested, Error **errp)
 {
 int iommu_type, ret;
 
-iommu_type = vfio_get_iommu_type(container, errp);
+iommu_type = vfio_get_iommu_type(container, want_nested, errp);
 if (iommu_type < 0) {
 return iommu_type;
 }
@@ -1205,7 +1219,7 @@ static int vfio_init_container(VFIOContainer *container, 
int group_fd,
 }
 
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
-  Error **errp)
+  bool want_nested, Error **errp)
 {
 VFIOContainer *container;
 int ret, fd;
@@ -1276,12 +1290,13 @@ static int vfio_connect_container(VFIOGroup *group, 
AddressSpace *as,
QLIST_INIT(&container->giommu_list);
QLIST_INIT(&container->hostwin_list);
 
-ret = vfio_init_container(container, group->fd, errp);
+ret = vfio_init_container(container, group->fd, want_nested, errp);
 if (ret) {
 goto free_container_exit;
 }
 
 switch (container->iommu_type) {
+case VFIO_TYPE1_NESTING_IOMMU:
 case VFIO_TYPE1v2_IOMMU:
 case VFIO_TYPE1_IOMMU:
 {
@@ -1502,7 +1517,7 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
 group->groupid = groupid;
QLIST_INIT(&group->device_list);
 
-if (vfio_connect_container(group, as, errp)) {
+if (vfio_connect_container(group, as, want_nested, errp)) {
 error_prepend(errp, "failed to setup container for group %d: ",
   groupid);
 goto close_fd_exit;
-- 
2.7.4




[RFC v10 05/25] intel_iommu: add get_iommu_attr() callback

2020-09-10 Thread Liu Yi L
Return the vIOMMU attribute to the caller, e.g. for a VFIO call via the PCI layer.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index bf13d59..333f172 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3440,6 +3440,28 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 return vtd_dev_as;
 }
 
+static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
+   IOMMUAttr attr, void *data)
+{
+int ret = 0;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+switch (attr) {
+case IOMMU_WANT_NESTING:
+{
+bool *pdata = data;
+
+/* return false until vSVA is ready */
+*pdata = false;
+break;
+}
+default:
+ret = -ENOENT;
+}
+return ret;
+}
+
 static uint64_t get_naturally_aligned_size(uint64_t start,
uint64_t size, int gaw)
 {
@@ -3735,6 +3757,7 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.get_iommu_attr = vtd_dev_get_iommu_attr,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
-- 
2.7.4




[RFC v10 02/25] header file update VFIO/IOMMU vSVA APIs kernel 5.9-rc2

2020-09-10 Thread Liu Yi L
The kernel uapi/linux/iommu.h header file includes the
extensions for vSVA support, e.g. bind gpasid, the iommu
fault report related user structures, and so on.

This commit updates kernel headers from the below branch:
https://github.com/luxis1999/linux-vsva.git vsva-linux-5.9-rc2-v7

Note: this should be replaced with a full header file update when
the vSVA uAPI is stable.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Cc: Cornelia Huck 
Cc: Paolo Bonzini 
Signed-off-by: Liu Yi L 
---
 linux-headers/linux/iommu.h | 420 
 linux-headers/linux/vfio.h  | 103 ++-
 2 files changed, 522 insertions(+), 1 deletion(-)
 create mode 100644 linux-headers/linux/iommu.h

diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h
new file mode 100644
index 000..be6cce1
--- /dev/null
+++ b/linux-headers/linux/iommu.h
@@ -0,0 +1,420 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * IOMMU user API definitions
+ */
+
+#ifndef _IOMMU_H
+#define _IOMMU_H
+
+#include <linux/types.h>
+
+#define IOMMU_FAULT_PERM_READ  (1 << 0) /* read */
+#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
+#define IOMMU_FAULT_PERM_EXEC  (1 << 2) /* exec */
+#define IOMMU_FAULT_PERM_PRIV  (1 << 3) /* privileged */
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+   IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */
+   IOMMU_FAULT_PAGE_REQ,   /* page request fault */
+};
+
+enum iommu_fault_reason {
+   IOMMU_FAULT_REASON_UNKNOWN = 0,
+
+   /* Could not access the PASID table (fetch caused external abort) */
+   IOMMU_FAULT_REASON_PASID_FETCH,
+
+   /* PASID entry is invalid or has configuration errors */
+   IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+
+   /*
+* PASID is out of range (e.g. exceeds the maximum PASID
+* supported by the IOMMU) or disabled.
+*/
+   IOMMU_FAULT_REASON_PASID_INVALID,
+
+   /*
+* An external abort occurred fetching (or updating) a translation
+* table descriptor
+*/
+   IOMMU_FAULT_REASON_WALK_EABT,
+
+   /*
+* Could not access the page table entry (Bad address),
+* actual translation fault
+*/
+   IOMMU_FAULT_REASON_PTE_FETCH,
+
+   /* Protection flag check failed */
+   IOMMU_FAULT_REASON_PERMISSION,
+
+   /* access flag check failed */
+   IOMMU_FAULT_REASON_ACCESS,
+
+   /* Output address of a translation stage caused Address Size fault */
+   IOMMU_FAULT_REASON_OOR_ADDRESS,
+};
+
+/**
+ * struct iommu_fault_unrecoverable - Unrecoverable fault data
+ * @reason: reason of the fault, from  iommu_fault_reason
+ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values)
+ * @pasid: Process Address Space ID
+ * @perm: requested permission access using by the incoming transaction
+ *(IOMMU_FAULT_PERM_* values)
+ * @addr: offending page address
+ * @fetch_addr: address that caused a fetch abort, if any
+ */
+struct iommu_fault_unrecoverable {
+   __u32   reason;
+#define IOMMU_FAULT_UNRECOV_PASID_VALID(1 << 0)
+#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1)
+#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID   (1 << 2)
+   __u32   flags;
+   __u32   pasid;
+   __u32   perm;
+   __u64   addr;
+   __u64   fetch_addr;
+};
+
+/**
+ * struct iommu_fault_page_request - Page Request data
+ * @flags: encodes whether the corresponding fields are valid and whether this
+ * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values).
+ * When IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID is set, the page response
+ * must have the same PASID value as the page request. When it is 
clear,
+ * the page response should not have a PASID.
+ * @pasid: Process Address Space ID
+ * @grpid: Page Request Group Index
+ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values)
+ * @addr: page address
+ * @private_data: device-specific private information
+ */
+struct iommu_fault_page_request {
+#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID   (1 << 0)
+#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1)
+#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2)
+#define IOMMU_FAULT_PAGE_RESPONSE_NEEDS_PASID  (1 << 3)
+   __u32   flags;
+   __u32   pasid;
+   __u32   grpid;
+   __u32   perm;
+   __u64   addr;
+   __u64   private_data[2];
+};
+
+/**
+ * struct iommu_fault - Generic fault data
+ * @type: fault type from  iommu_fault_type
+ * @padding: reserved for future use (should be zero)
+ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV
+ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ
+ * @padding2: sets the fault size to allow for future extensions
+ */
+struct iommu_fault {
+   __u32   type;
+   __u32   padding;
+   

[RFC v10 22/25] intel_iommu: process PASID-based iotlb invalidation

2020-09-10 Thread Liu Yi L
This patch adds the basic PASID-based iotlb (piotlb) invalidation
support. The piotlb is used when walking the Intel VT-d 1st-level
page table. This patch only adds the basic processing; detailed
handling will be added in the next patch.
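
A self-contained sketch of the granularity decode this patch performs
(the granularity encodings match the macros in the diff; the 20-bit
PASID mask and the sample descriptor are assumptions for illustration):

#include <stdint.h>
#include <stdio.h>

#define PIOTLB_G(lo)           ((lo) & (3ULL << 4))
#define PIOTLB_ALL_IN_PASID    (2ULL << 4)
#define PIOTLB_PSI_IN_PASID    (3ULL << 4)
#define PIOTLB_PASID(lo)       (((lo) >> 32) & 0xfffffULL)  /* 20-bit PASID */

int main(void)
{
    /* Sample descriptor low word: pasid 42, page-selective granularity. */
    uint64_t lo = (42ULL << 32) | PIOTLB_PSI_IN_PASID;

    switch (PIOTLB_G(lo)) {
    case PIOTLB_ALL_IN_PASID:
        printf("flush all piotlb entries of pasid %llu\n",
               (unsigned long long)PIOTLB_PASID(lo));
        break;
    case PIOTLB_PSI_IN_PASID:
        printf("page-selective flush within pasid %llu\n",
               (unsigned long long)PIOTLB_PASID(lo));
        break;
    default:
        printf("invalid granularity\n");
    }
    return 0;
}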

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 53 ++
 hw/i386/intel_iommu_internal.h | 13 +++
 2 files changed, 66 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 55623e8..516d7ff 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3038,6 +3038,55 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return true;
 }
 
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+uint16_t domain_id,
+uint32_t pasid)
+{
+}
+
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+   uint32_t pasid, hwaddr addr, uint8_t am,
+   bool ih)
+{
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+VTDInvDesc *inv_desc)
+{
+uint16_t domain_id;
+uint32_t pasid;
+uint8_t am;
+hwaddr addr;
+
+if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
+error_report_once("non-zero-field-in-piotlb_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
+case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+break;
+
+case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am,
+   VTD_INV_DESC_PIOTLB_IH(inv_desc->val[1]));
+break;
+
+default:
+error_report_once("Invalid granularity in P-IOTLB desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -3152,6 +3201,10 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 break;
 
 case VTD_INV_DESC_PIOTLB:
+trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_piotlb_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_WAIT:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 9805b84..118d568 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -476,6 +476,19 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
 #define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
 
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
+
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff0ffc0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
+#define VTD_INV_DESC_PIOTLB_PASID(val)(((val) >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
+ VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val)   ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val)   (((val) >> 6) & 0x1)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
-- 
2.7.4




[RFC v10 20/25] intel_iommu: do not pass down pasid bind for PASID #0

2020-09-10 Thread Liu Yi L
The RID_PASID field was introduced in the VT-d 3.0 spec; it is used
for DMA requests without PASID in scalable mode VT-d (such requests
are also known as IOVA requests). The VT-d 3.1 spec defines it as
follows:

"Implementations not supporting RID_PASID capability
(ECAP_REG.RPS is 0b), use a PASID value of 0 to perform
address translation for requests without PASID."

This patch adds a check against the PASIDs which are going to be
bound to devices. For PASID #0, it is not necessary to pass down a
pasid bind request, since PASID #0 is used as RID_PASID for DMA
requests without pasid. A further reason is that the current Intel
vIOMMU supports gIOVA by shadowing the guest 2nd-level page table.
However, if in the future the guest IOMMU driver uses the 1st-level
page table to store IOVA mappings, then guest IOVA support will also
be done via nested translation. When gIOVA is over FLPT, the vIOMMU
should pass down the pasid bind request for PASID #0 to the host; the
host needs to bind the guest IOVA page table to a proper PASID, e.g.
the PASID value in the RID_PASID field for a PF/VF if ECAP_REG.RPS is
clear, or the default PASID for an ADI (Assignable Device Interface
in the Scalable IOV solution).

IOVA over FLPT support on Intel VT-d:
https://lore.kernel.org/linux-iommu/20191219031634.15168-1-baolu...@linux.intel.com/
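
The change itself boils down to one early-exit guard; a minimal model
(the VTD_HPASID_MIN value here is made up, the real floor comes from the
vIOMMU's PASID allocation range):

#include <stdio.h>

#define VTD_HPASID_MIN 1   /* illustrative; real value comes from vIOMMU */

static int bind_guest_pasid(unsigned int pasid)
{
    if (pasid < VTD_HPASID_MIN) {
        /* PASID #0 is RID_PASID for no-PASID DMA; nothing to tell host. */
        return 0;
    }
    printf("propagate bind for pasid %u to host\n", pasid);
    return 0;
}

int main(void)
{
    bind_guest_pasid(0);   /* silently skipped */
    bind_guest_pasid(5);   /* propagated */
    return 0;
}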

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7bc9735..55623e8 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1893,6 +1893,16 @@ static int vtd_bind_guest_pasid(IntelIOMMUState *s, 
VTDBus *vtd_bus,
 HostIOMMUContext *iommu_ctx;
 int ret = -1;
 
+if (pasid < VTD_HPASID_MIN) {
+/*
+ * If pasid < VTD_HPASID_MIN, this pasid is not allocated
+ * from host. No need to pass down the changes on it to host.
+ * TODO: when IOVA over FLPT is ready, this switch should be
+ * refined.
+ */
+return 0;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 if (!vtd_dev_icx) {
 /* means no need to go further, e.g. for emulated devices */
-- 
2.7.4




[RFC v10 04/25] hw/pci: introduce pci_device_get_iommu_attr()

2020-09-10 Thread Liu Yi L
This patch adds pci_device_get_iommu_attr() to get vIOMMU attributes,
e.g. whether a nesting IOMMU is wanted.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
 hw/pci/pci.c | 35 ++-
 include/hw/pci/pci.h |  7 +++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 1967746..1886f8e 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2659,7 +2659,8 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+  PCIBus **pbus, uint8_t *pdevfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
@@ -2710,14 +2711,38 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (iommu_bus && iommu_bus->iommu_ops &&
- iommu_bus->iommu_ops->get_address_space) {
-return iommu_bus->iommu_ops->get_address_space(bus,
- iommu_bus->iommu_opaque, devfn);
+*pbus = iommu_bus;
+*pdevfn = devfn;
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->get_address_space) {
+return bus->iommu_ops->get_address_space(bus,
+bus->iommu_opaque, devfn);
 }
return &address_space_memory;
 }
 
+int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->get_iommu_attr) {
+return bus->iommu_ops->get_iommu_attr(bus, bus->iommu_opaque,
+   devfn, attr, data);
+}
+return -ENOENT;
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 bus->iommu_ops = ops;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 7c46a78..18b51dd 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -487,13 +487,20 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range);
 
 void pci_device_deassert_intx(PCIDevice *dev);
 
+typedef enum IOMMUAttr {
+IOMMU_WANT_NESTING,
+} IOMMUAttr;
+
 typedef struct PCIIOMMUOps PCIIOMMUOps;
 struct PCIIOMMUOps {
 AddressSpace * (*get_address_space)(PCIBus *bus,
 void *opaque, int32_t devfn);
+int (*get_iommu_attr)(PCIBus *bus, void *opaque, int32_t devfn,
+   IOMMUAttr attr, void *data);
 };
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data);
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *iommu_ops, void *opaque);
 
 static inline void
-- 
2.7.4




[RFC v10 19/25] intel_iommu: replay pasid binds after context cache invalidation

2020-09-10 Thread Liu Yi L
This patch replays guest pasid bindings after a context cache
invalidation. This is a behavior to ensure safety. In fact, the
programmer should issue a pasid cache invalidation with proper
granularity after issuing a context cache invalidation.
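
The safety rule stated above can be sketched as: every device-selective
context-cache invalidation chains a matching PASID-cache sync (a
standalone model of the flow, not the QEMU code itself):

#include <stdio.h>

enum pc_inv_type { PC_GLOBAL, PC_DEVSI };

struct pc_info {
    enum pc_inv_type type;
    int devfn;
};

static void pasid_cache_sync(const struct pc_info *pc)
{
    printf("pasid cache sync: type %d devfn %d\n", pc->type, pc->devfn);
}

/* A device-selective context invalidation always chains a DEVSI sync. */
static void context_device_invalidate(int devfn)
{
    struct pc_info pc = { .type = PC_DEVSI, .devfn = devfn };

    printf("flush context entry for devfn %d\n", devfn);
    pasid_cache_sync(&pc);
}

int main(void)
{
    context_device_invalidate(8);
    return 0;
}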

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 50 ++
 hw/i386/intel_iommu_internal.h |  1 +
 hw/i386/trace-events   |  1 +
 3 files changed, 52 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4f6b80f..7bc9735 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -68,6 +68,10 @@ static void vtd_address_space_refresh_all(IntelIOMMUState 
*s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
 static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+static void vtd_pasid_cache_sync(IntelIOMMUState *s,
+ VTDPASIDCacheInfo *pc_info);
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  VTDBus *vtd_bus, uint16_t devfn);
 
 static void vtd_panic_require_caching_mode(void)
 {
@@ -1853,7 +1857,10 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
 
 static void vtd_context_global_invalidate(IntelIOMMUState *s)
 {
+VTDPASIDCacheInfo pc_info;
+
 trace_vtd_inv_desc_cc_global();
+
 /* Protects context cache */
 vtd_iommu_lock(s);
 s->context_cache_gen++;
@@ -1870,6 +1877,9 @@ static void vtd_context_global_invalidate(IntelIOMMUState 
*s)
  * VT-d emulation codes.
  */
 vtd_iommu_replay_all(s);
+
+pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
+vtd_pasid_cache_sync(s, &pc_info);
 }
 
 /**
@@ -2008,6 +2018,21 @@ static void 
vtd_context_device_invalidate(IntelIOMMUState *s,
  * happened.
  */
 vtd_sync_shadow_page_table(vtd_as);
+/*
+ * Per spec, a context flush should also be followed by a PASID
+ * cache and iotlb flush. With regard to a device selective
+ * context cache invalidation:
+ * if (emulated_device)
+ *invalidate pasid cache and pasid-based iotlb
+ * else if (assigned_device)
+ *check if the device has been bound to any pasid
+ *invoke pasid_unbind for each bound pasid
+ * Here, we have vtd_pasid_cache_devsi() to invalidate pasid
+ * caches; for the piotlb in QEMU we don't have it yet, so there
+ * is no handling here. For assigned devices, the host iommu
+ * driver will flush the piotlb when a pasid unbind is passed
+ * down to it.
+ */
+ vtd_pasid_cache_devsi(s, vtd_bus, devfn_it);
 }
 }
 }
@@ -2622,6 +2647,12 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer 
value,
 /* Fall through */
 case VTD_PASID_CACHE_GLOBAL_INV:
 break;
+case VTD_PASID_CACHE_DEVSI:
+if (pc_info->vtd_bus != vtd_bus ||
+pc_info->devfn != devfn) {
+return false;
+}
+break;
 default:
 error_report("invalid pc_info->type");
 abort();
@@ -2821,6 +2852,11 @@ static void 
vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
 case VTD_PASID_CACHE_GLOBAL_INV:
 /* loop all assigned devices */
 break;
+case VTD_PASID_CACHE_DEVSI:
+walk_info.vtd_bus = pc_info->vtd_bus;
+walk_info.devfn = pc_info->devfn;
+vtd_replay_pasid_bind_for_dev(s, start, end, &walk_info);
+return;
 case VTD_PASID_CACHE_FORCE_RESET:
 /* For force reset, no need to go further replay */
 return;
@@ -2906,6 +2942,20 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
 vtd_iommu_unlock(s);
 }
 
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  VTDBus *vtd_bus, uint16_t devfn)
+{
+VTDPASIDCacheInfo pc_info;
+
+trace_vtd_pasid_cache_devsi(devfn);
+
+pc_info.type = VTD_PASID_CACHE_DEVSI;
+pc_info.vtd_bus = vtd_bus;
+pc_info.devfn = devfn;
+
+vtd_pasid_cache_sync(s, &pc_info);
+}
+
 /**
  * Caller of this function should hold iommu_lock
  */
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 51691d0..9805b84 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -548,6 +548,7 @@ typedef enum VTDPCInvType {
 VTD_PASID_CACHE_FORCE_RESET = 0,
 /* pasid cache invalidation rely on guest PASID entry */
 VTD_PASID_CACHE_GLOBAL_INV,
+VTD_PASID_CACHE_DEVSI,
 VTD_PASID_CACHE_DOMSI,
 VTD_PASID_CACHE_PASIDSI,
 } VTDPCInvType;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 60d20c1..3853fa8 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events

[RFC v9 18/25] intel_iommu: bind/unbind guest page table to host

2020-07-28 Thread Liu Yi L
This patch captures guest PASID table entry modifications and
propagates the changes to the host to set up dual stage DMA
translation. The guest page table is configured as the 1st level
page table (GVA->GPA), whose translation result further goes through
the host VT-d 2nd level page table (GPA->HPA) under nested
translation mode. This is the key part of vSVA support, and also a
key to supporting IOVA over the 1st level page table for Intel VT-d
in a virtualization environment.
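
For reference, the first-level address width computed by vtd_pe_get_fl_aw()
below encodes 4-level (48-bit) vs 5-level (57-bit) paging. A minimal
standalone sketch of that computation, assuming VTD_SM_PASID_ENTRY_FLPM
masks a single bit:

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch of vtd_pe_get_fl_aw(): FLPM (bit 2 of pasid entry val[2])
     * selects 4-level (48-bit) or 5-level (57-bit) first-level paging. */
    static uint32_t fl_aw(uint64_t pe_val2)
    {
        return 48 + ((pe_val2 >> 2) & 1) * 9;
    }

    int main(void)
    {
        printf("FLPM=0 -> %u bits, FLPM=1 -> %u bits\n",
               fl_aw(0), fl_aw(1ULL << 2));
        return 0;
    }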

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 101 +++--
 hw/i386/intel_iommu_internal.h |  18 
 2 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 3128374..3986e5f 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -41,6 +41,7 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 #include "qemu/jhash.h"
+#include <linux/iommu.h>
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -700,6 +701,16 @@ static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
 return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
 }
 
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
+}
+
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -1861,6 +1872,85 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
 vtd_iommu_replay_all(s);
 }
 
+/**
+ * Caller should hold iommu_lock.
+ */
+static int vtd_bind_guest_pasid(IntelIOMMUState *s, VTDBus *vtd_bus,
+int devfn, int pasid, VTDPASIDEntry *pe,
+VTDPASIDOp op)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+int ret = -1;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+/* means no need to go further, e.g. for emulated devices */
+return 0;
+}
+
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+switch (op) {
+case VTD_PASID_BIND:
+{
+struct iommu_gpasid_bind_data *g_bind_data;
+
+g_bind_data = g_malloc0(sizeof(*g_bind_data));
+
+g_bind_data->argsz = sizeof(*g_bind_data);
+g_bind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_bind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_bind_data->gpgd = vtd_pe_get_flpt_base(pe);
+g_bind_data->addr_width = vtd_pe_get_fl_aw(pe);
+g_bind_data->hpasid = pasid;
+g_bind_data->gpasid = pasid;
+g_bind_data->flags |= IOMMU_SVA_GPASID_VAL;
+g_bind_data->vendor.vtd.flags =
+ (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_SRE : 0)
+   | (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_EAFE : 0)
+   | (VTD_SM_PASID_ENTRY_PCD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PCD : 0)
+   | (VTD_SM_PASID_ENTRY_PWT_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PWT : 0)
+   | (VTD_SM_PASID_ENTRY_EMTE_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_EMTE : 0)
+   | (VTD_SM_PASID_ENTRY_CD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_CD : 0);
+g_bind_data->vendor.vtd.pat = VTD_SM_PASID_ENTRY_PAT(pe->val[1]);
+g_bind_data->vendor.vtd.emt = VTD_SM_PASID_ENTRY_EMT(pe->val[1]);
+ret = host_iommu_ctx_bind_stage1_pgtbl(iommu_ctx, g_bind_data);
+g_free(g_bind_data);
+break;
+}
+case VTD_PASID_UNBIND:
+{
+struct iommu_gpasid_bind_data *g_unbind_data;
+
+g_unbind_data = g_malloc0(sizeof(*g_unbind_data));
+
+g_unbind_data->argsz = sizeof(*g_unbind_data);
+g_unbind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_unbind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_unbind_data->hpasid = pasid;
+ret = host_iommu_ctx_unbind_stage1_pgtbl(iommu_ctx, g_unbind_data);
+g_free(g_unbind_data);
+break;
+}
+default:
+error_report_once("Unknown VTDPASIDOp!!!\n");
+break;
+}
+
+
+return ret;
+}
+
 /* Do a context-cache device-selective invalidation.
 * @func_mask: FM field after shifting

[RFC v9 07/25] vfio: check VFIO_TYPE1_NESTING_IOMMU support

2020-07-28 Thread Liu Yi L
VFIO needs to check VFIO_TYPE1_NESTING_IOMMU support with the kernel
before using it further, e.g. it requires checking IOMMU UAPI support.

Referred patch from Eric Auger: https://patchwork.kernel.org/patch/11040499/
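
For reference, a minimal userspace sketch of this extension probe, assuming
a kernel whose linux/vfio.h still defines VFIO_TYPE1_NESTING_IOMMU:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/vfio.h>

    int main(void)
    {
        int fd = open("/dev/vfio/vfio", O_RDWR);
        if (fd < 0) {
            perror("open /dev/vfio/vfio");
            return 1;
        }
        /* VFIO_CHECK_EXTENSION returns > 0 when the iommu type is supported */
        int ret = ioctl(fd, VFIO_CHECK_EXTENSION, VFIO_TYPE1_NESTING_IOMMU);
        printf("nesting iommu %ssupported\n", ret > 0 ? "" : "not ");
        return 0;
    }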

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
Signed-off-by: Eric Auger 
Signed-off-by: Yi Sun 
---
 hw/vfio/common.c | 37 ++---
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 80d7a00..af91eca 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1151,30 +1151,44 @@ static void vfio_put_address_space(VFIOAddressSpace *space)
 }
 
 /*
- * vfio_get_iommu_type - selects the richest iommu_type (v2 first)
+ * vfio_get_iommu_type - selects the richest iommu_type (NESTING first)
  */
 static int vfio_get_iommu_type(VFIOContainer *container,
+   bool want_nested,
Error **errp)
 {
-int iommu_types[] = { VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
+int iommu_types[] = { VFIO_TYPE1_NESTING_IOMMU,
+  VFIO_TYPE1v2_IOMMU, VFIO_TYPE1_IOMMU,
   VFIO_SPAPR_TCE_v2_IOMMU, VFIO_SPAPR_TCE_IOMMU };
-int i;
+int i, ret = -EINVAL;
 
 for (i = 0; i < ARRAY_SIZE(iommu_types); i++) {
 if (ioctl(container->fd, VFIO_CHECK_EXTENSION, iommu_types[i])) {
-return iommu_types[i];
+if (iommu_types[i] == VFIO_TYPE1_NESTING_IOMMU) {
+if (!want_nested) {
+continue;
+}
+}
+ret = iommu_types[i];
+break;
 }
 }
-error_setg(errp, "No available IOMMU models");
-return -EINVAL;
+
+if (ret < 0) {
+error_setg(errp, "No available IOMMU models");
+} else if (want_nested && ret != VFIO_TYPE1_NESTING_IOMMU) {
+error_setg(errp, "Nested mode requested but not supported");
+ret = -EINVAL;
+}
+return ret;
 }
 
 static int vfio_init_container(VFIOContainer *container, int group_fd,
-   Error **errp)
+   bool want_nested, Error **errp)
 {
 int iommu_type, ret;
 
-iommu_type = vfio_get_iommu_type(container, errp);
+iommu_type = vfio_get_iommu_type(container, want_nested, errp);
 if (iommu_type < 0) {
 return iommu_type;
 }
@@ -1205,7 +1219,7 @@ static int vfio_init_container(VFIOContainer *container, int group_fd,
 }
 
 static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
-  Error **errp)
+  bool want_nested, Error **errp)
 {
 VFIOContainer *container;
 int ret, fd;
@@ -1276,12 +1290,13 @@ static int vfio_connect_container(VFIOGroup *group, AddressSpace *as,
 QLIST_INIT(&container->giommu_list);
 QLIST_INIT(&container->hostwin_list);
 
-ret = vfio_init_container(container, group->fd, errp);
+ret = vfio_init_container(container, group->fd, want_nested, errp);
 if (ret) {
 goto free_container_exit;
 }
 
 switch (container->iommu_type) {
+case VFIO_TYPE1_NESTING_IOMMU:
 case VFIO_TYPE1v2_IOMMU:
 case VFIO_TYPE1_IOMMU:
 {
@@ -1502,7 +1517,7 @@ VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
 group->groupid = groupid;
 QLIST_INIT(>device_list);
 
-if (vfio_connect_container(group, as, errp)) {
+if (vfio_connect_container(group, as, want_nested, errp)) {
 error_prepend(errp, "failed to setup container for group %d: ",
   groupid);
 goto close_fd_exit;
-- 
2.7.4




[RFC v9 15/25] intel_iommu: add PASID cache management infrastructure

2020-07-28 Thread Liu Yi L
This patch adds a PASID cache management infrastructure based on
new added structure VTDPASIDAddressSpace, which is used to track
the PASID usage and future PASID tagged DMA address translation
support in vIOMMU.

struct VTDPASIDAddressSpace {
    VTDBus *vtd_bus;
    uint8_t devfn;
    AddressSpace as;
    uint32_t pasid;
    IntelIOMMUState *iommu_state;
    VTDContextCacheEntry context_cache_entry;
    QLIST_ENTRY(VTDPASIDAddressSpace) next;
    VTDPASIDCacheEntry pasid_cache_entry;
};

Ideally, a VTDPASIDAddressSpace instance is created when a PASID
is bound with a DMA AddressSpace. The Intel VT-d spec requires guest
software to issue a pasid cache invalidation when binding or unbinding
a pasid with an address space under caching-mode. However, as
VTDPASIDAddressSpace instances also act as the pasid cache in this
implementation, their creation also happens during vIOMMU PASID
tagged DMA translation. The creation in this path is not added in
this patch since there are no PASID-capable emulated devices for
now.

The implementation in this patch manages VTDPASIDAddressSpace
instances per PASID+BDF (lookup and insert use PASID and BDF)
since the Intel VT-d spec allows a per-BDF PASID Table. When a
guest binds a PASID with an AddressSpace, QEMU captures the guest
pasid selective pasid cache invalidation, and allocates or removes
a VTDPASIDAddressSpace instance per the invalidation reason:

*) a present pasid entry moved to non-present
*) a present pasid entry updated while remaining present
*) a non-present pasid entry moved to present

The vIOMMU emulator can figure out the reason by fetching the
latest guest pasid entry, as sketched below.
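
A standalone sketch of that decision with a hypothetical decide() helper
(illustrative only, not code from the patch):

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical: map (cached entry present?, guest entry present?) to
     * the three invalidation cases listed above. */
    enum pc_action { PC_NONE, PC_CREATE, PC_REMOVE, PC_UPDATE };

    static enum pc_action decide(bool cached_present, bool guest_present)
    {
        if (cached_present && !guest_present) {
            return PC_REMOVE;   /* present -> non-present */
        }
        if (cached_present && guest_present) {
            return PC_UPDATE;   /* present -> present (content modified) */
        }
        if (guest_present) {
            return PC_CREATE;   /* non-present -> present */
        }
        return PC_NONE;
    }

    int main(void)
    {
        printf("%d %d %d\n", decide(true, false), decide(true, true),
               decide(false, true));
        return 0;
    }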

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) merged this patch with former replay binding patch, makes
   PSI/DSI/GSI use the unified function to do cache invalidation
   and pasid binding replay.
*) dropped pasid_cache_gen in both iommu_state and vtd_pasid_as
   as it is not necessary so far; we may want it when we one day
   introduce an emulated SVA-capable device.
---
 hw/i386/intel_iommu.c  | 464 +
 hw/i386/intel_iommu_internal.h |  21 ++
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  24 +++
 4 files changed, 510 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 7efa98c..9b35092 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -40,6 +40,7 @@
 #include "kvm_i386.h"
 #include "migration/vmstate.h"
 #include "trace.h"
+#include "qemu/jhash.h"
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -65,6 +66,8 @@
 static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
+static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+
 static void vtd_panic_require_caching_mode(void)
 {
 error_report("We need to set caching-mode=on for intel-iommu to enable "
@@ -276,6 +279,7 @@ static void vtd_reset_caches(IntelIOMMUState *s)
 vtd_iommu_lock(s);
 vtd_reset_iotlb_locked(s);
 vtd_reset_context_cache_locked(s);
+vtd_pasid_cache_reset(s);
 vtd_iommu_unlock(s);
 }
 
@@ -686,6 +690,16 @@ static inline bool vtd_pe_type_check(X86IOMMUState *x86_iommu,
 return true;
 }
 
+static inline uint16_t vtd_pe_get_domain_id(VTDPASIDEntry *pe)
+{
+return VTD_SM_PASID_ENTRY_DID((pe)->val[1]);
+}
+
+static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
+{
+return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -2395,9 +2409,443 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, VTDInvDesc *inv_desc)
 return true;
 }
 
+static inline void vtd_init_pasid_key(uint32_t pasid,
+ uint16_t sid,
+ struct pasid_key *key)
+{
+key->pasid = pasid;
+key->sid = sid;
+}
+
+static guint vtd_pasid_as_key_hash(gconstpointer v)
+{
+struct pasid_key *key = (struct pasid_key *)v;
+uint32_t a, b, c;
+
+/* Jenkins hash */
+a = b = c = JHASH_INITVAL + sizeof(*key);
+a += key->sid;
+b += extract32(key->pasid, 0, 16);
+c += extract32(key->pasid, 16, 16);
+
+__jhash_mix(a, b, c);
+__jhash_final(a, b, c);
+
+return c;
+}
+
+static gboolean vtd_pasid_as_key_equal(gconstpointer v1, gconstpointer v2)
+{
+const struct pasid_key *k1 = v1;
+const struct pasid_key *k2 = v2;
+
+return (k1->pasid == k2->pasid) && (k1->sid == k2->sid);
+}
+
+static inline int vtd_dev_get_pe_from_pasid(IntelIOMMUState *s,
+uint8_t bus_num,
+   

[RFC v9 17/25] intel_iommu: sync IOMMU nesting cap info for assigned devices

2020-07-28 Thread Liu Yi L
For assigned devices, an Intel vIOMMU which wants to build DMA protection
based on physical IOMMU nesting paging should check the IOMMU nesting
support on the host side. The host returns IOMMU nesting cap info to
user-space (e.g. VFIO returns IOMMU nesting cap info for the nesting type
IOMMU). The vIOMMU needs to check:
a) IOMMU model
b) 1st-level page table support
c) address width
d) pasid support

This patch syncs the IOMMU nesting cap info when a PCIe device (VFIO case)
sets a HostIOMMUContext to the vIOMMU. If the host IOMMU nesting support
is not compatible, the vIOMMU returns failure to the PCIe device; the
capability intersection idea is sketched below.
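
As an illustration of the masked-intersection approach used by
vtd_sync_nesting_info() below, here is a standalone sketch (the mask and
bit values are made up for the example):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        /* Hypothetical values; the real code masks CAP/ECAP with
         * VTD_CAP_MASK / VTD_ECAP_MASK before intersecting. */
        uint64_t mask     = 0x00ffULL;   /* negotiable fields           */
        uint64_t emulated = 0x10abULL;   /* what the vIOMMU would expose */
        uint64_t host     = 0x20a3ULL;   /* what the host hardware has   */

        uint64_t negotiated = emulated & host & mask;
        uint64_t exposed = (emulated & ~mask) | negotiated;
        printf("exposed caps: 0x%llx\n", (unsigned long long)exposed);
        return 0;
    }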

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 107 +
 hw/i386/intel_iommu_internal.h |  18 +++
 include/hw/i386/intel_iommu.h  |   4 ++
 3 files changed, 129 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9b35092..3128374 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4104,6 +4104,84 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
 return ret;
 }
 
+
+static bool vtd_check_nesting_info(IntelIOMMUState *s,
+   struct iommu_nesting_info *info,
+   struct iommu_nesting_info_vtd *vtd)
+{
+return !((s->aw_bits != info->addr_width) ||
+ ((s->host_cap & VTD_CAP_MASK) !=
+  (vtd->cap_reg & VTD_CAP_MASK)) ||
+ ((s->host_ecap & VTD_ECAP_MASK) !=
+  (vtd->ecap_reg & VTD_ECAP_MASK)) ||
+ (VTD_GET_PSS(s->host_ecap) != (info->pasid_bits - 1)));
+}
+
+/* Caller should hold iommu lock. */
+static bool vtd_sync_nesting_info(IntelIOMMUState *s,
+  struct iommu_nesting_info *info)
+{
+struct iommu_nesting_info_vtd *vtd;
+uint64_t cap, ecap;
+
+vtd = (struct iommu_nesting_info_vtd *) &info->data;
+
+if (s->cap_finalized) {
+return vtd_check_nesting_info(s, info, vtd);
+}
+
+if (s->aw_bits > info->addr_width) {
+error_report("User aw-bits: %u > host address width: %u",
+  s->aw_bits, info->addr_width);
+return false;
+}
+
+cap = s->host_cap & vtd->cap_reg & VTD_CAP_MASK;
+s->host_cap &= ~VTD_CAP_MASK;
+s->host_cap |= cap;
+
+ecap = s->host_ecap & vtd->ecap_reg & VTD_ECAP_MASK;
+s->host_ecap &= ~VTD_ECAP_MASK;
+s->host_ecap |= ecap;
+
+if ((VTD_ECAP_PASID & s->host_ecap) && info->pasid_bits &&
+(VTD_GET_PSS(s->host_ecap) > (info->pasid_bits - 1))) {
+s->host_ecap &= ~VTD_ECAP_PSS_MASK;
+s->host_ecap |= VTD_ECAP_PSS(info->pasid_bits - 1);
+}
+return true;
+}
+
+/*
+ * virtual VT-d which wants nested needs to check the host IOMMU
+ * nesting cap info behind the assigned devices. Thus that vIOMMU
+ * could bind guest page table to host.
+ */
+static bool vtd_check_iommu_ctx(IntelIOMMUState *s,
+HostIOMMUContext *iommu_ctx)
+{
+struct iommu_nesting_info *info = iommu_ctx->info;
+uint32_t minsz, size;
+
+if (IOMMU_PASID_FORMAT_INTEL_VTD != info->format) {
+error_report("Format is not compatible for nesting!!!");
+return false;
+}
+
+size = sizeof(struct iommu_nesting_info_vtd);
+minsz = endof(struct iommu_nesting_info, flags);
+if (size > (info->argsz - minsz)) {
+/*
+ * QEMU may be using a newer linux-headers/iommu.h than the
+ * kernel supports, hence fail it.
+ */
+error_report("IOMMU nesting cap is not compatible!!!");
+return false;
+}
+
+return vtd_sync_nesting_info(s, info);
+}
+
 static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
  int devfn,
  HostIOMMUContext *iommu_ctx)
@@ -4118,6 +4196,11 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
 
 vtd_iommu_lock(s);
 
+if (!vtd_check_iommu_ctx(s, iommu_ctx)) {
+vtd_iommu_unlock(s);
+return -ENOENT;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 
 assert(!vtd_dev_icx);
@@ -4373,6 +4456,14 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
 }
 
+if (!s->cap_finalized) {
+s->host_cap = s->cap;
+s->host_ecap = s->ecap;
+} else {
+s->cap = s->host_cap;
+s->ecap = s->host_ecap;
+}
+
 vtd_reset_caches(s);
 
 /* Define registers with default values and bit semantics */
@@ -4506,6 +4597,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
 

[RFC v9 25/25] intel_iommu: modify x-scalable-mode to be string option

2020-07-28 Thread Liu Yi L
Intel VT-d 3.0 introduces scalable mode, which has a bunch of capabilities
related to scalable mode translation, thus there are multiple combinations.
This vIOMMU implementation wants to simplify this for the user by providing
typical combinations. The user can configure it via the "x-scalable-mode"
option. The usage is as below:

"-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"

 - "legacy": gives support for the SL page table
 - "modern": gives support for the FL page table, pasid, virtual command
 - "off": no scalable mode support
 - if not configured, there is no scalable mode support; if improperly
configured, an error is thrown

Note: this patch is supposed to be merged when the whole vSVA patch series
is merged.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
rfcv5 (v2) -> rfcv6:
*) reports want_nested to VFIO;
*) assert iommu_set/unset_iommu_context() if vIOMMU is not scalable modern.
---
 hw/i386/intel_iommu.c  | 39 +++
 hw/i386/intel_iommu_internal.h |  3 +++
 include/hw/i386/intel_iommu.h  |  2 ++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 11f815d..73a44b5 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4050,7 +4050,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
   VTD_HOST_ADDRESS_WIDTH),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
-DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, scalable_mode_str),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -4420,6 +4420,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
 static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data)
 {
+IntelIOMMUState *s = opaque;
 int ret = 0;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
@@ -4429,8 +4430,7 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
 {
 bool *pdata = data;
 
-/* return false until vSVA is ready */
-*pdata = false;
+*pdata = s->scalable_modern ? true : false;
 break;
 }
 default:
@@ -4526,6 +4526,8 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable supports set_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4560,6 +4562,8 @@ static void vtd_dev_unset_iommu_context(PCIBus *bus, void *opaque, int devfn)
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable supports unset_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4787,8 +4791,13 @@ static void vtd_init(IntelIOMMUState *s)
 }
 
 /* TODO: read cap/ecap from host to decide which cap to be exposed. */
-if (s->scalable_mode) {
+if (s->scalable_mode && !s->scalable_modern) {
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+} else if (s->scalable_mode && s->scalable_modern) {
+s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID |
+   VTD_ECAP_FLTS | VTD_ECAP_PSS(VTD_PASID_SS) |
+   VTD_ECAP_VCS;
+s->vccap |= VTD_VCCAP_PAS;
 }
 
 if (!s->cap_finalized) {
@@ -4929,6 +4938,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
 return false;
 }
 
+if (s->scalable_mode_str &&
+(strcmp(s->scalable_mode_str, "off") &&
+ strcmp(s->scalable_mode_str, "modern") &&
+ strcmp(s->scalable_mode_str, "legacy"))) {
+error_setg(errp, "Invalid x-scalable-mode config,"
+ "Please use \"modern\", \"legacy\" or \"off\"");
+return false;
+}
+
+if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "legacy")) {
+s->scalable_mode = true;
+s->scalable_modern = false;
+} else if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "modern")) {
+s->scalable_mode = true;
+s->scalable_modern = true;

[RFC v9 22/25] intel_iommu: process PASID-based iotlb invalidation

2020-07-28 Thread Liu Yi L
This patch adds basic PASID-based iotlb (piotlb) invalidation
support. piotlb is used when walking the Intel VT-d 1st level page
table. This patch only adds the basic processing; detailed handling
will be added in the next patch. A worked descriptor decode follows.
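
To make the descriptor layout concrete, here is a standalone sketch
decoding a sample PASID-IOTLB invalidation descriptor low qword; the field
extractors mirror the patch's macros, assuming the PASID field is the 20
bits at 51:32:

    #include <stdint.h>
    #include <stdio.h>

    #define PIOTLB_G(val)     ((val) & (3ULL << 4))          /* granularity  */
    #define PIOTLB_DID(val)   (((val) >> 16) & 0xffffULL)    /* domain id    */
    #define PIOTLB_PASID(val) (((val) >> 32) & 0xfffffULL)   /* 20-bit pasid */

    int main(void)
    {
        /* type 0x6 (PASID-IOTLB inv), PSI granularity, did=1, pasid=5 */
        uint64_t lo = 0x6ULL | (3ULL << 4) | (1ULL << 16) | (5ULL << 32);

        printf("granu=%llu did=%llu pasid=%llu\n",
               (unsigned long long)(PIOTLB_G(lo) >> 4),
               (unsigned long long)PIOTLB_DID(lo),
               (unsigned long long)PIOTLB_PASID(lo));
        return 0;
    }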

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 53 ++
 hw/i386/intel_iommu_internal.h | 13 +++
 2 files changed, 66 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index f0e1afa..de7947c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3038,6 +3038,55 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return true;
 }
 
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+uint16_t domain_id,
+uint32_t pasid)
+{
+}
+
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+   uint32_t pasid, hwaddr addr, uint8_t am,
+   bool ih)
+{
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+VTDInvDesc *inv_desc)
+{
+uint16_t domain_id;
+uint32_t pasid;
+uint8_t am;
+hwaddr addr;
+
+if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
+error_report_once("non-zero-field-in-piotlb_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
+case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+break;
+
+case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am,
+   VTD_INV_DESC_PIOTLB_IH(inv_desc->val[1]));
+break;
+
+default:
+error_report_once("Invalid granularity in P-IOTLB desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -3152,6 +3201,10 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 break;
 
 case VTD_INV_DESC_PIOTLB:
+trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_piotlb_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_WAIT:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 9805b84..118d568 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -476,6 +476,19 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
 #define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
 
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
+
#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff000000000ffc0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
#define VTD_INV_DESC_PIOTLB_PASID(val)    (((val) >> 32) & 0xfffffULL)
+#define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
+ VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val)   ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val)   (((val) >> 6) & 0x1)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
-- 
2.7.4




[RFC v9 23/25] intel_iommu: propagate PASID-based iotlb invalidation to host

2020-07-28 Thread Liu Yi L
This patch propagates PASID-based iotlb invalidation to host.

Intel VT-d 3.0 supports nested translation in PASID granular.
Guest SVA support could be implemented by configuring nested
translation on specific PASID. This is also known as dual stage
DMA translation.

Under such a configuration, the guest owns the GVA->GPA translation,
which is configured as the first level page table on the host side
for a specific pasid, and the host owns the GPA->HPA translation. As
the guest owns the first level translation table, piotlb invalidation
should be propagated to the host since the host IOMMU will cache
first level page table related mappings during DMA address
translation.

This patch traps the guest PASID-based iotlb flush and propagates
it to the host.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) removed the valid check to vtd_pasid_as instance as rfcv5 ensures
   all vtd_pasid_as instances in hash table should be valid.
---
 hw/i386/intel_iommu.c  | 113 +
 hw/i386/intel_iommu_internal.h |   7 +++
 2 files changed, 120 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index de7947c..db4460a 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3038,16 +3038,129 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return true;
 }
 
+/**
+ * Caller of this function should hold iommu_lock.
+ */
+static void vtd_invalidate_piotlb(IntelIOMMUState *s,
+  VTDBus *vtd_bus,
+  int devfn,
+  struct iommu_cache_invalidate_info *cache)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+goto out;
+}
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+goto out;
+}
+if (host_iommu_ctx_flush_stage1_cache(iommu_ctx, cache)) {
+error_report("Cache flush failed");
+}
+out:
+return;
+}
+
+/**
+ * This function is a loop function for the s->vtd_pasid_as
+ * list with VTDPIOTLBInvInfo as execution filter. It propagates
+ * the piotlb invalidation to host. Caller of this function
+ * should hold iommu_lock.
+ */
+static void vtd_flush_pasid_iotlb(gpointer key, gpointer value,
+  gpointer user_data)
+{
+VTDPIOTLBInvInfo *piotlb_info = user_data;
+VTDPASIDAddressSpace *vtd_pasid_as = value;
+VTDPASIDCacheEntry *pc_entry = &vtd_pasid_as->pasid_cache_entry;
+uint16_t did;
+
+did = vtd_pe_get_domain_id(&pc_entry->pasid_entry);
+
+if ((piotlb_info->domain_id == did) &&
+(piotlb_info->pasid == vtd_pasid_as->pasid)) {
+vtd_invalidate_piotlb(vtd_pasid_as->iommu_state,
+  vtd_pasid_as->vtd_bus,
+  vtd_pasid_as->devfn,
+  piotlb_info->cache_info);
+}
+
+/*
+ * TODO: needs to add QEMU piotlb flush when QEMU piotlb
+ * infrastructure is ready. For now, it is enough for passthru
+ * devices.
+ */
+}
+
 static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
 uint16_t domain_id,
 uint32_t pasid)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = IOMMU_CACHE_INV_TYPE_IOTLB;
+cache_info->granularity = IOMMU_INV_GRANU_PASID;
+cache_info->granu.pasid_info.pasid = pasid;
+cache_info->granu.pasid_info.flags = IOMMU_INV_PASID_FLAGS_PASID;
+
+piotlb_info.domain_id = domain_id;
+piotlb_info.pasid = pasid;
+piotlb_info.cache_info = cache_info;
+
+vtd_iommu_lock(s);
+/*
+ * Here loops all the vtd_pasid_as instances in s->vtd_pasid_as
+ * to find out the affected devices since piotlb invalidation
+ * should check pasid cache per architecture point of view.
+ */
+g_hash_table_foreach(s->vtd_pasid_as,
+ vtd_flush_pasid_iotlb, &piotlb_info);
+vtd_iommu_unlock(s);
+g_free(cache_info);
 }
 
 static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
uint32_t pasid, hwaddr addr, uint8_t am,
bool ih)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = IOMMU_CACHE_INV_TYPE_IOTLB;

[RFC v9 08/25] hw/iommu: introduce HostIOMMUContext

2020-07-28 Thread Liu Yi L
Currently, many platform vendors provide the capability of dual stage
DMA address translation in hardware. For example, nested translation
on Intel VT-d scalable mode, nested stage translation on ARM SMMUv3,
etc. In dual stage DMA address translation, there are two stages of
address translation, stage-1 (a.k.a. first-level) and stage-2 (a.k.a.
second-level) translation structures. Stage-1 translation results are
also subjected to stage-2 translation structures. Take vSVA (Virtual
Shared Virtual Addressing) as an example: the guest IOMMU driver owns
the stage-1 translation structures (covering GVA->GPA translation), and
the host IOMMU driver owns the stage-2 translation structures (covering
GPA->HPA translation). The VMM is responsible for binding stage-1
translation structures to the host, so hardware can perform GVA->GPA
and then GPA->HPA translation. For more background on SVA, refer to the
links below.
 - https://www.youtube.com/watch?v=Kq_nfGK5MwQ
 - https://events19.lfasiallc.com/wp-content/uploads/2017/11/\
Shared-Virtual-Memory-in-KVM_Yi-Liu.pdf

In QEMU, vIOMMU emulators expose IOMMUs to the VM per their own spec
(e.g. the Intel VT-d spec). Devices are passed through to the guest via
device pass-through components like VFIO. VFIO is a userspace driver
framework which exposes host IOMMU programming capability to userspace
in a secure manner, e.g. IOVA MAP/UNMAP requests. Information different
from map/unmap notifications needs to be passed from the QEMU vIOMMU
device to/from the host IOMMU driver through the VFIO/IOMMU layer:
 1) PASID allocation (allow host to intercept in PASID allocation)
 2) bind stage-1 translation structures to host
 3) propagate stage-1 cache invalidation to host
 4) DMA address translation fault (I/O page fault) servicing etc.

With the above new interactions in QEMU, an abstract layer is required
to facilitate the above operations and give vIOMMU emulators an
explicit way to call into VFIO. This patch introduces HostIOMMUContext
to serve that purpose. HostIOMMUContext is an object which allows
managing the stage-1 translation when a vIOMMU is implemented upon
physical IOMMU nested paging (the VFIO case). It is an abstract object
which needs to be derived for each vIOMMU implementation based on
physical nested paging. A HostIOMMUContext derived object is passed
to each VFIO device protected by a vIOMMU using physical nested paging.

This patch also introduces HostIOMMUContextClass to provide methods for
vIOMMU emulators to propagate dual-stage translation related requests to
the host. As a beginning, PASID allocate/free methods are defined to
propagate PASID allocation/free requests to the host, which is required
for vendors that manage PASIDs system-wide. A sketch of the callback
pattern follows.
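
To make the abstraction concrete, here is a minimal standalone sketch of
the class-with-callbacks pattern (names are illustrative, not the actual
QOM types):

    #include <stdio.h>

    /* Abstract "class": a table of host-IOMMU operations. */
    typedef struct DemoHostIOMMUOps {
        int (*pasid_alloc)(void *opaque, int min, int max, int *pasid);
        int (*pasid_free)(void *opaque, int pasid);
    } DemoHostIOMMUOps;

    /* A VFIO-like backend providing the callbacks. */
    static int demo_pasid_alloc(void *opaque, int min, int max, int *pasid)
    {
        (void)opaque; (void)max;
        *pasid = min;               /* pretend the host handed out `min` */
        return 0;
    }

    static int demo_pasid_free(void *opaque, int pasid)
    {
        (void)opaque;
        printf("freed pasid %d\n", pasid);
        return 0;
    }

    static const DemoHostIOMMUOps vfio_like_ops = {
        .pasid_alloc = demo_pasid_alloc,
        .pasid_free  = demo_pasid_free,
    };

    int main(void)
    {
        int pasid;
        if (vfio_like_ops.pasid_alloc(NULL, 1, 0xfffff, &pasid) == 0) {
            printf("allocated pasid %d\n", pasid);
            vfio_like_ops.pasid_free(NULL, pasid);
        }
        return 0;
    }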

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
 hw/Makefile.objs  |  1 +
 hw/iommu/Makefile.objs|  1 +
 hw/iommu/host_iommu_context.c | 97 +++
 include/hw/iommu/host_iommu_context.h | 75 +++
 4 files changed, 174 insertions(+)
 create mode 100644 hw/iommu/Makefile.objs
 create mode 100644 hw/iommu/host_iommu_context.c
 create mode 100644 include/hw/iommu/host_iommu_context.h

diff --git a/hw/Makefile.objs b/hw/Makefile.objs
index 14b7ea4..0648ff9 100644
--- a/hw/Makefile.objs
+++ b/hw/Makefile.objs
@@ -40,6 +40,7 @@ devices-dirs-$(CONFIG_MEM_DEVICE) += mem/
 devices-dirs-$(CONFIG_NUBUS) += nubus/
 devices-dirs-y += semihosting/
 devices-dirs-y += smbios/
+devices-dirs-y += iommu/
 endif
 
 common-obj-y += $(devices-dirs-y)
diff --git a/hw/iommu/Makefile.objs b/hw/iommu/Makefile.objs
new file mode 100644
index 000..e6eed4e
--- /dev/null
+++ b/hw/iommu/Makefile.objs
@@ -0,0 +1 @@
+obj-y += host_iommu_context.o
diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
new file mode 100644
index 000..5fb2223
--- /dev/null
+++ b/hw/iommu/host_iommu_context.c
@@ -0,0 +1,97 @@
+/*
+ * QEMU abstract of Host IOMMU
+ *
+ * Copyright (C) 2020 Intel Corporation.
+ *
+ * Authors: Liu Yi L 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "qapi/visitor.h"
+#include "hw/iommu/host_iommu_context.h

[RFC v9 03/25] hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps

2020-07-28 Thread Liu Yi L
This patch modifies pci_setup_iommu() to set PCIIOMMUOps
instead of setting PCIIOMMUFunc. PCIIOMMUFunc is used to
get an address space for a PCI device in a vendor specific
way. PCIIOMMUOps still offers this functionality, but
leaves space to add more iommu related vendor specific
operations; see the sketch after this paragraph.
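
A minimal standalone sketch of the conversion's benefit (toy types, not
the real QEMU ones): an ops struct can grow new hooks without changing
every call site, whereas a bare function pointer cannot.

    #include <stdio.h>

    typedef struct FakeAS { const char *name; } FakeAS;

    /* Toy version of PCIIOMMUFunc -> PCIIOMMUOps. */
    typedef struct PCIIOMMUOpsDemo {
        FakeAS *(*get_address_space)(void *opaque, int devfn);
        /* future vendor-specific hooks can be added here */
    } PCIIOMMUOpsDemo;

    static FakeAS g_as = { "iommu-as" };

    static FakeAS *demo_get_as(void *opaque, int devfn)
    {
        (void)opaque; (void)devfn;
        return &g_as;
    }

    static const PCIIOMMUOpsDemo demo_ops = {
        .get_address_space = demo_get_as,
    };

    int main(void)
    {
        printf("%s\n", demo_ops.get_address_space(NULL, 0)->name);
        return 0;
    }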

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Reviewed-by: David Gibson 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/alpha/typhoon.c   |  6 +-
 hw/arm/smmu-common.c |  6 +-
 hw/hppa/dino.c   |  6 +-
 hw/i386/amd_iommu.c  |  6 +-
 hw/i386/intel_iommu.c|  6 +-
 hw/pci-host/designware.c |  6 +-
 hw/pci-host/pnv_phb3.c   |  6 +-
 hw/pci-host/pnv_phb4.c   |  6 +-
 hw/pci-host/ppce500.c|  6 +-
 hw/pci-host/prep.c   |  6 +-
 hw/pci-host/sabre.c  |  6 +-
 hw/pci/pci.c | 18 +-
 hw/ppc/ppc440_pcix.c |  6 +-
 hw/ppc/spapr_pci.c   |  6 +-
 hw/s390x/s390-pci-bus.c  |  8 ++--
 hw/virtio/virtio-iommu.c |  6 +-
 include/hw/pci/pci.h |  8 ++--
 include/hw/pci/pci_bus.h |  2 +-
 18 files changed, 96 insertions(+), 24 deletions(-)

diff --git a/hw/alpha/typhoon.c b/hw/alpha/typhoon.c
index 29d44df..c4ac693 100644
--- a/hw/alpha/typhoon.c
+++ b/hw/alpha/typhoon.c
@@ -740,6 +740,10 @@ static AddressSpace *typhoon_pci_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 return &s->pchip.iommu_as;
 }
 
+static const PCIIOMMUOps typhoon_iommu_ops = {
+.get_address_space = typhoon_pci_dma_iommu,
+};
+
 static void typhoon_set_irq(void *opaque, int irq, int level)
 {
 TyphoonState *s = opaque;
@@ -897,7 +901,7 @@ PCIBus *typhoon_init(MemoryRegion *ram, ISABus **isa_bus, qemu_irq *p_rtc_irq,
 "iommu-typhoon", UINT64_MAX);
 address_space_init(&s->pchip.iommu_as, MEMORY_REGION(&s->pchip.iommu),
"pchip0-pci");
-pci_setup_iommu(b, typhoon_pci_dma_iommu, s);
+pci_setup_iommu(b, &typhoon_iommu_ops, s);
 
 /* Pchip0 PCI special/interrupt acknowledge, 0x801.F800., 64MB.  */
 memory_region_init_io(&s->pchip.reg_iack, OBJECT(s), &alpha_pci_iack_ops,
diff --git a/hw/arm/smmu-common.c b/hw/arm/smmu-common.c
index e13a5f4..447146e 100644
--- a/hw/arm/smmu-common.c
+++ b/hw/arm/smmu-common.c
@@ -343,6 +343,10 @@ static AddressSpace *smmu_find_add_as(PCIBus *bus, void *opaque, int devfn)
 return &sdev->as;
 }
 
+static const PCIIOMMUOps smmu_ops = {
+.get_address_space = smmu_find_add_as,
+};
+
 IOMMUMemoryRegion *smmu_iommu_mr(SMMUState *s, uint32_t sid)
 {
 uint8_t bus_n, devfn;
@@ -437,7 +441,7 @@ static void smmu_base_realize(DeviceState *dev, Error **errp)
 s->smmu_pcibus_by_busptr = g_hash_table_new(NULL, NULL);
 
 if (s->primary_bus) {
-pci_setup_iommu(s->primary_bus, smmu_find_add_as, s);
+pci_setup_iommu(s->primary_bus, &smmu_ops, s);
 } else {
 error_setg(errp, "SMMU is not attached to any PCI bus!");
 }
diff --git a/hw/hppa/dino.c b/hw/hppa/dino.c
index 7f0c622..ca2dea4 100644
--- a/hw/hppa/dino.c
+++ b/hw/hppa/dino.c
@@ -459,6 +459,10 @@ static AddressSpace *dino_pcihost_set_iommu(PCIBus *bus, void *opaque,
 return &s->bm_as;
 }
 
+static const PCIIOMMUOps dino_iommu_ops = {
+.get_address_space = dino_pcihost_set_iommu,
+};
+
 /*
  * Dino interrupts are connected as shown on Page 78, Table 23
  * (Little-endian bit numbers)
@@ -580,7 +584,7 @@ PCIBus *dino_init(MemoryRegion *addr_space,
 memory_region_add_subregion(&s->bm, 0xfff0,
 &s->bm_cpu_alias);
 address_space_init(&s->bm_as, &s->bm, "pci-bm");
-pci_setup_iommu(b, dino_pcihost_set_iommu, s);
+pci_setup_iommu(b, &dino_iommu_ops, s);
 
 *p_rtc_irq = qemu_allocate_irq(dino_set_timer_irq, s, 0);
 *p_ser_irq = qemu_allocate_irq(dino_set_serial_irq, s, 0);
diff --git a/hw/i386/amd_iommu.c b/hw/i386/amd_iommu.c
index 087f601..77f183d 100644
--- a/hw/i386/amd_iommu.c
+++ b/hw/i386/amd_iommu.c
@@ -1452,6 +1452,10 @@ static AddressSpace *amdvi_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 return &iommu_as[devfn]->as;
 }
 
+static const PCIIOMMUOps amdvi_iommu_ops = {
+.get_address_space = amdvi_host_dma_iommu,
+};
+
 static const MemoryRegionOps mmio_mem_ops = {
 .read = amdvi_mmio_read,
 .write = amdvi_mmio_write,
@@ -1579,7 +1583,7 @@ static void amdvi_realize(DeviceState *dev, Error **errp)
 
 sysbus_init_mmio(SYS_BUS_DEVICE(s), &s->mmio);
 sysbus_mmio_map(SYS_BUS_DEVICE(s), 0, AMDVI_BASE_ADDR);
-pci_setup_iommu(bus, amdvi_host_dma_iommu, s);
+pci_setup_iommu(bus, &amdvi_iommu_ops, s);
 s->devid = object_property_get_int(OBJECT(&s->pci), "addr", &error_abort);
 msi_init(&s->pci.dev, 0, 1, true, false, errp);
 amdvi_init(s);
diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c

[RFC v9 20/25] intel_iommu: do not pass down pasid bind for PASID #0

2020-07-28 Thread Liu Yi L
The RID_PASID field was introduced in the VT-d 3.0 spec; it is used
for DMA requests w/o PASID in scalable mode VT-d, also known as
IOVA requests. The VT-d 3.1 spec defines it as follows:

"Implementations not supporting RID_PASID capability
(ECAP_REG.RPS is 0b), use a PASID value of 0 to perform
address translation for requests without PASID."

This patch adds a check against the PASIDs which are going to be
bound to a device. For PASID #0, it is not necessary to pass down a
pasid bind request since PASID #0 is used as RID_PASID for DMA
requests without pasid. A further reason is that the current Intel
vIOMMU supports gIOVA by shadowing the guest 2nd level page table.
However, in the future, if the guest IOMMU driver uses the 1st level
page table to store IOVA mappings, then guest IOVA support will also
be done via nested translation. When gIOVA is over FLPT, the vIOMMU
should pass down the pasid bind request for PASID #0 to the host, and
the host needs to bind the guest IOVA page table to a proper PASID,
e.g. the PASID value in the RID_PASID field for a PF/VF if
ECAP_REG.RPS is clear, or the default PASID for an ADI (Assignable
Device Interface in the Scalable IOV solution).

IOVA over FLPT support on Intel VT-d:
https://lkml.org/lkml/2019/9/23/297

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index efad0af..f0e1afa 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1893,6 +1893,16 @@ static int vtd_bind_guest_pasid(IntelIOMMUState *s, VTDBus *vtd_bus,
 HostIOMMUContext *iommu_ctx;
 int ret = -1;
 
+if (pasid < VTD_HPASID_MIN) {
+/*
+ * If pasid < VTD_HPASID_MIN, this pasid is not allocated
+ * from host. No need to pass down the changes on it to host.
+ * TODO: when IOVA over FLPT is ready, this switch should be
+ * refined.
+ */
+return 0;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 if (!vtd_dev_icx) {
 /* means no need to go further, e.g. for emulated devices */
-- 
2.7.4




[RFC v9 24/25] intel_iommu: process PASID-based Device-TLB invalidation

2020-07-28 Thread Liu Yi L
This patch adds empty handling for PASID-based Device-TLB
invalidation. For now this is enough, as it is not necessary to
propagate it to the host for passthru devices, and no emulated
device has a device TLB.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 18 ++
 hw/i386/intel_iommu_internal.h |  1 +
 2 files changed, 19 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index db4460a..11f815d 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3213,6 +3213,17 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
 return true;
 }
 
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+/*
+ * No need to handle it for a passthru device; for emulated
+ * devices with a device TLB it may be required, but for now
+ * returning true is enough.
+ */
+return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
@@ -3334,6 +3345,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
+case VTD_INV_DESC_DEV_PIOTLB:
+trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+if (!vtd_process_device_piotlb_desc(s, _desc)) {
+return false;
+}
+break;
+
 case VTD_INV_DESC_DEVICE:
 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
 if (!vtd_process_device_iotlb_desc(s, _desc)) {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 08ff58e..9b4fc67 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -405,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_WAIT   0x5 /* Invalidation Wait Descriptor */
 #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
 #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
 #define VTD_INV_DESC_NONE   0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
-- 
2.7.4




[RFC v9 21/25] vfio: add support for flush iommu stage-1 cache

2020-07-28 Thread Liu Yi L
This patch adds the flush_stage1_cache() definition in
HostIOMMUContextClass and adds the corresponding implementation in
VFIO. This exposes a way for the vIOMMU to flush the stage-1 cache on
the host side, since the guest owns the stage-1 translation structures
in the dual stage DMA translation configuration.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Acked-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/iommu/host_iommu_context.c | 19 +++
 hw/vfio/common.c  | 24 
 include/hw/iommu/host_iommu_context.h |  8 
 3 files changed, 51 insertions(+)

diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
index c43965c..a3f7706 100644
--- a/hw/iommu/host_iommu_context.c
+++ b/hw/iommu/host_iommu_context.c
@@ -113,6 +113,25 @@ int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext 
*iommu_ctx,
 return hicxc->unbind_stage1_pgtbl(iommu_ctx, unbind);
 }
 
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+ struct iommu_cache_invalidate_info *cache)
+{
+HostIOMMUContextClass *hicxc;
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->flush_stage1_cache) {
+return -EINVAL;
+}
+
+return hicxc->flush_stage1_cache(iommu_ctx, cache);
+}
+
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
  uint64_t flags,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 3f09e74..97b8200 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1275,6 +1275,29 @@ static int vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+struct iommu_cache_invalidate_info *cache)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*cache);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_CACHE_INVLD;
+memcpy(&op->data, cache, sizeof(*cache));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: iommu cache flush failed: %m", __func__);
+}
+g_free(op);
+return ret;
+}
+
 /**
  * Get iommu info from host. Caller of this funcion should free
  * the memory pointed by the returned pointer stored in @info
@@ -2023,6 +2046,7 @@ static void vfio_host_iommu_context_class_init(ObjectClass *klass,
 hicxc->pasid_free = vfio_host_iommu_ctx_pasid_free;
 hicxc->bind_stage1_pgtbl = vfio_host_iommu_ctx_bind_stage1_pgtbl;
 hicxc->unbind_stage1_pgtbl = vfio_host_iommu_ctx_unbind_stage1_pgtbl;
+hicxc->flush_stage1_cache = vfio_host_iommu_ctx_flush_stage1_cache;
 }
 
 static const TypeInfo vfio_host_iommu_context_info = {
diff --git a/include/hw/iommu/host_iommu_context.h 
b/include/hw/iommu/host_iommu_context.h
index 2883ed8..40e860a 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -64,6 +64,12 @@ typedef struct HostIOMMUContextClass {
 /* Undo a previous bind. @unbind specifies the unbind info. */
 int (*unbind_stage1_pgtbl)(HostIOMMUContext *iommu_ctx,
struct iommu_gpasid_bind_data *unbind);
+/*
+ * Propagate stage-1 cache flush to host IOMMU, cache
+ * info specified in @cache
+ */
+int (*flush_stage1_cache)(HostIOMMUContext *iommu_ctx,
+  struct iommu_cache_invalidate_info *cache);
 } HostIOMMUContextClass;
 
 /*
@@ -85,6 +91,8 @@ int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext 
*iommu_ctx,
  struct iommu_gpasid_bind_data *bind);
 int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
  struct iommu_gpasid_bind_data *unbind);
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+   struct iommu_cache_invalidate_info *cache);
 
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
-- 
2.7.4




[RFC v9 09/25] hw/pci: introduce pci_device_set/unset_iommu_context()

2020-07-28 Thread Liu Yi L
For nesting IOMMU translation capable platforms, vIOMMUs running on
such systems can be implemented upon physical IOMMU nested paging
(the VFIO case). A vIOMMU advertises such an implementation via the
"want_nested" attribute to PCIe devices (e.g. VFIO PCI). Once
"want_nested" is satisfied, the device (VFIO case) should set a
HostIOMMUContext to the vIOMMU so that the vIOMMU can manage stage-1
translation. DMAs out of such devices are then protected through the
stage-1 page tables owned by the guest together with the stage-2 page
tables owned by the host.

This patch adds pci_device_set/unset_iommu_context() to set/unset
the HostIOMMUContext for a given PCIe device (VFIO case). The caller
should fail if the set operation fails; the intended call sequence is
sketched below.
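
The intended call sequence from the device side, as a standalone sketch
with stand-in stubs (the real functions are the QEMU APIs added by this
series; everything here is illustrative):

    #include <stdbool.h>
    #include <stdio.h>

    typedef struct DemoCtx { const char *name; } DemoCtx;

    static int get_iommu_attr_want_nested(bool *want_nested)
    {
        *want_nested = true;        /* pretend the vIOMMU wants nesting */
        return 0;
    }

    static int set_iommu_context(DemoCtx *ctx)
    {
        printf("vIOMMU now owns host context '%s'\n", ctx->name);
        return 0;                   /* caller must fail if this fails   */
    }

    static void unset_iommu_context(void)
    {
        printf("host context detached\n");
    }

    int main(void)
    {
        bool want_nested = false;
        DemoCtx ctx = { "vfio-host-iommu" };

        get_iommu_attr_want_nested(&want_nested);
        if (want_nested && set_iommu_context(&ctx) != 0) {
            return 1;               /* device realize should fail here  */
        }
        unset_iommu_context();      /* on unrealize/teardown            */
        return 0;
    }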

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv5 (v2) -> rfcv6:
*) pci_device_set_iommu_context() returns 0 if callback is not implemented.
---
 hw/pci/pci.c | 28 
 include/hw/pci/pci.h | 10 ++
 2 files changed, 38 insertions(+)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index 3c27805..59864c6 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2743,6 +2743,34 @@ int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data)
 return -ENOENT;
 }
 
+int pci_device_set_iommu_context(PCIDevice *dev,
+ HostIOMMUContext *iommu_ctx)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->set_iommu_context) {
+return bus->iommu_ops->set_iommu_context(bus,
+  bus->iommu_opaque, devfn, iommu_ctx);
+}
+return 0;
+}
+
+void pci_device_unset_iommu_context(PCIDevice *dev)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->unset_iommu_context) {
+bus->iommu_ops->unset_iommu_context(bus,
+ bus->iommu_opaque, devfn);
+}
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 bus->iommu_ops = ops;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 822ada5..64f1958 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -9,6 +9,8 @@
 
 #include "hw/pci/pcie.h"
 
+#include "hw/iommu/host_iommu_context.h"
+
 extern bool pci_available;
 
 /* PCI bus */
@@ -496,10 +498,18 @@ struct PCIIOMMUOps {
 void *opaque, int32_t devfn);
 int (*get_iommu_attr)(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data);
+int (*set_iommu_context)(PCIBus *bus, void *opaque,
+ int32_t devfn,
+ HostIOMMUContext *iommu_ctx);
+void (*unset_iommu_context)(PCIBus *bus, void *opaque,
+int32_t devfn);
 };
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
 int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data);
+int pci_device_set_iommu_context(PCIDevice *dev,
+ HostIOMMUContext *iommu_ctx);
+void pci_device_unset_iommu_context(PCIDevice *dev);
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *iommu_ops, void *opaque);
 
 static inline void
-- 
2.7.4




[RFC v9 05/25] intel_iommu: add get_iommu_attr() callback

2020-07-28 Thread Liu Yi L
Return the vIOMMU attribute to the caller, e.g. for a VFIO call via the PCI layer.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 76c2f70..9ad7242 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3441,6 +3441,28 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
 return vtd_dev_as;
 }
 
+static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
+   IOMMUAttr attr, void *data)
+{
+int ret = 0;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+switch (attr) {
+case IOMMU_WANT_NESTING:
+{
+bool *pdata = data;
+
+/* return false until vSVA is ready */
+*pdata = false;
+break;
+}
+default:
+ret = -ENOENT;
+}
+return ret;
+}
+
 static uint64_t get_naturally_aligned_size(uint64_t start,
uint64_t size, int gaw)
 {
@@ -3736,6 +3758,7 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void *opaque, int devfn)
 
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
+.get_iommu_attr = vtd_dev_get_iommu_attr,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
-- 
2.7.4




[RFC v9 16/25] vfio: add bind stage-1 page table support

2020-07-28 Thread Liu Yi L
This patch adds the bind_stage1_pgtbl() definition in
HostIOMMUContextClass, and also adds the corresponding implementation
in VFIO. This exposes a way for the vIOMMU to set up dual stage DMA
translation for passthru devices on hardware. A sketch of the
variable-sized ioctl argument pattern used here follows.
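
The VFIO implementation below marshals a fixed header plus payload with an
argsz covering both; a standalone sketch of that pattern (toy struct, not
the real vfio uapi):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Toy version of the variable-sized ioctl argument: a fixed header
     * followed by an opaque payload, with argsz covering both. */
    struct op_hdr {
        uint32_t argsz;
        uint32_t flags;
        uint8_t  data[];            /* payload starts here */
    };

    int main(void)
    {
        const char payload[] = "stage1-bind-data";
        size_t argsz = sizeof(struct op_hdr) + sizeof(payload);

        struct op_hdr *op = calloc(1, argsz);
        op->argsz = (uint32_t)argsz;
        memcpy(op->data, payload, sizeof(payload));
        /* ...would be handed to ioctl(container_fd, ...) in real code */
        printf("argsz=%u payload=%s\n", op->argsz, (char *)op->data);
        free(op);
        return 0;
    }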

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/iommu/host_iommu_context.c | 57 +-
 hw/vfio/common.c  | 58 ++-
 include/hw/iommu/host_iommu_context.h | 19 +++-
 3 files changed, 131 insertions(+), 3 deletions(-)

diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
index 5fb2223..c43965c 100644
--- a/hw/iommu/host_iommu_context.c
+++ b/hw/iommu/host_iommu_context.c
@@ -69,23 +69,78 @@ int host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx, uint32_t pasid)
 return hicxc->pasid_free(iommu_ctx, pasid);
 }
 
+int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *bind)
+{
+HostIOMMUContextClass *hicxc;
+
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->bind_stage1_pgtbl) {
+return -EINVAL;
+}
+
+return hicxc->bind_stage1_pgtbl(iommu_ctx, bind);
+}
+
+int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *unbind)
+{
+HostIOMMUContextClass *hicxc;
+
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->unbind_stage1_pgtbl) {
+return -EINVAL;
+}
+
+return hicxc->unbind_stage1_pgtbl(iommu_ctx, unbind);
+}
+
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
- uint64_t flags)
+ uint64_t flags,
+ struct iommu_nesting_info *info)
 {
 HostIOMMUContext *iommu_ctx;
 
 object_initialize(_iommu_ctx, instance_size, mrtypename);
 iommu_ctx = HOST_IOMMU_CONTEXT(_iommu_ctx);
 iommu_ctx->flags = flags;
+iommu_ctx->info = g_malloc0(info->argsz);
+memcpy(iommu_ctx->info, info, info->argsz);
 iommu_ctx->initialized = true;
 }
 
+static void host_iommu_ctx_finalize_fn(Object *obj)
+{
+HostIOMMUContext *iommu_ctx = HOST_IOMMU_CONTEXT(obj);
+
+g_free(iommu_ctx->info);
+}
+
 static const TypeInfo host_iommu_context_info = {
 .parent = TYPE_OBJECT,
 .name   = TYPE_HOST_IOMMU_CONTEXT,
 .class_size = sizeof(HostIOMMUContextClass),
 .instance_size  = sizeof(HostIOMMUContext),
+.instance_finalize  = host_iommu_ctx_finalize_fn,
 .abstract   = true,
 };
 
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 9d90732..3f09e74 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1227,6 +1227,54 @@ static int 
vfio_host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *bind)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*bind);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_BIND_PGTBL;
+memcpy(&op->data, bind, sizeof(*bind));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: pasid (%llu) bind failed: %m",
+  __func__, bind->hpasid);
+}
+g_free(op);
+return ret;
+}
+
+static int vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
+ struct iommu_gpasid_bind_data *unbind)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*unbind);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_UNBIND_PGTBL;
+memcpy(&op->data, unbind, sizeof(*unbind));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: pasid (%llu) unbind failed: %m",
+ 

[RFC v9 13/25] intel_iommu: add virtual command capability support

2020-07-28 Thread Liu Yi L
This patch adds virtual command support to the Intel vIOMMU per the
Intel VT-d 3.1 spec, and adds two virtual commands: allocate PASID
and free PASID.
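
For reference, the guest-visible protocol is: write a command to
DMAR_VCMD_REG, poll the IP bit (bit 0) of DMAR_VCRSP_REG until it
clears, then read the status code and result out of the response
register. A rough guest-driver sketch; dmar_readq()/dmar_writeq() are
hypothetical MMIO helpers, while the register names follow this patch:

    uint64_t resp;

    dmar_writeq(iommu, DMAR_VCMD_REG, VTD_VCMD_ALLOC_PASID);
    do {
        resp = dmar_readq(iommu, DMAR_VCRSP_REG);
    } while (resp & 1);          /* IP bit set: command in progress */
    /* status code and the allocated PASID are encoded in resp */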

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
 hw/i386/intel_iommu.c  | 154 -
 hw/i386/intel_iommu_internal.h |  37 ++
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  10 ++-
 4 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 4550cb9..191d124 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2656,6 +2656,129 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
 }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t *pasid)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+int ret = -1;
+
+vtd_iommu_lock(s);
+QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) {
+HostIOMMUContext *iommu_ctx = vtd_dev_icx->iommu_ctx;
+
+/*
+ * We'll return the first valid result we got. It's
+ * a bit hackish in that we don't have a good global
+ * interface yet to talk to modules like vfio to deliver
+ * this allocation request, so we're leveraging this
+ * per-device iommu context to do the same thing just
+ * to make sure the allocation happens only once.
+ */
+ret = host_iommu_ctx_pasid_alloc(iommu_ctx, VTD_HPASID_MIN,
+ VTD_HPASID_MAX, pasid);
+if (!ret) {
+break;
+}
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+int ret = -1;
+
+vtd_iommu_lock(s);
+QLIST_FOREACH(vtd_dev_icx, >vtd_dev_icx_list, next) {
+HostIOMMUContext *iommu_ctx = vtd_dev_icx->iommu_ctx;
+
+/*
+ * Similar to pasid allocation. We'll free the pasid
+ * on the first successful free operation. It's a bit
+ * hackish in that we don't have a good global interface
+ * yet to talk to modules like vfio to deliver this pasid
+ * free request, so we're leveraging this per-device iommu
+ * context to do the same thing just to make sure the free
+ * happens only once.
+ */
+ret = host_iommu_ctx_pasid_free(iommu_ctx, pasid);
+if (!ret) {
+break;
+}
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+/*
+ * If IP is not set, set it then return.
+ * If IP is already set, return.
+ */
+static void vtd_vcmd_set_ip(IntelIOMMUState *s)
+{
+s->vcrsp = 1;
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
+{
+s->vcrsp &= (~((uint64_t)(0x1)));
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+/* Handle write to Virtual Command Register */
+static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
+{
+uint32_t pasid;
+int ret = -1;
+
+trace_vtd_reg_write_vcmd(s->vcrsp, val);
+
+if (!(s->vccap & VTD_VCCAP_PAS) ||
+ (s->vcrsp & 1)) {
+return -1;
+}
+
+/*
+ * The vCPU is blocked while the guest VCMD write is trapped
+ * here, so no other vCPU should be accessing VCMD if the
+ * guest software is well written. However, we still emulate
+ * the IP bit here in case of bad guest software, and to
+ * align with the spec.
+ */
+vtd_vcmd_set_ip(s);
+
+switch (val & VTD_VCMD_CMD_MASK) {
+case VTD_VCMD_ALLOC_PASID:
+ret = vtd_request_pasid_alloc(s, &pasid);
+if (ret) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+} else {
+s->vcrsp |= VTD_VCRSP_RSLT(pasid);
+}
+break;
+
+case VTD_VCMD_FREE_PASID:
+pasid = VTD_VCMD_PASID_VALUE(val);
+ret = vtd_request_pasid_free(s, pasid);
+if (ret < 0) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+}
+break;
+
+default:
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+error_report_once("Virtual Command: unsupported command!!!");
+break;
+}
+vtd_vcmd_clear_ip(s);
+return 0;
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
 IntelIOMMUState *s = opaque;
@@ -2944,6 +3067,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
 vtd_set_long(s, addr, val);
 break;
 
+case DMAR_VCMD_REG:
+if (!vtd_handle_vcmd_write(s, val)) {
+if (size == 4) {
+vtd_set_long(s, addr, val);
+} else {
+  

[RFC v9 10/25] intel_iommu: add set/unset_iommu_context callback

2020-07-28 Thread Liu Yi L
This patch adds the set/unset_iommu_context() implementation in the Intel
vIOMMU. PCIe devices (the VFIO case) set a HostIOMMUContext to the vIOMMU
as an ack of the vIOMMU's "want_nested" attribute. Thus the vIOMMU can
build DMA protection based on the host IOMMU's nested paging.
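
The expected handshake from the device side (VFIO, see later patches in
this series) is roughly: query the want_nested attribute, then set the
host IOMMU context only when nesting is wanted. A minimal sketch, not
the exact patch code, assuming a pdev and a container with an
initialized iommu_ctx:

    bool want_nested = false;

    pci_device_get_iommu_attr(pdev, IOMMU_WANT_NESTING, &want_nested);
    if (want_nested) {
        pci_device_set_iommu_context(pdev, &container->iommu_ctx);
    }
    /* ... and on device teardown: */
    pci_device_unset_iommu_context(pdev);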

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 71 ---
 include/hw/i386/intel_iommu.h | 21 ++---
 2 files changed, 83 insertions(+), 9 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 9ad7242..4550cb9 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3359,23 +3359,33 @@ static const MemoryRegionOps vtd_mem_ir_ops = {
 },
 };
 
-VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+/**
+ * Fetch a VTDBus instance for a given PCIBus. If there is no existing
+ * instance, allocate one.
+ */
+static VTDBus *vtd_find_add_bus(IntelIOMMUState *s, PCIBus *bus)
 {
 uintptr_t key = (uintptr_t)bus;
VTDBus *vtd_bus = g_hash_table_lookup(s->vtd_as_by_busptr, &key);
-VTDAddressSpace *vtd_dev_as;
-char name[128];
 
 if (!vtd_bus) {
 uintptr_t *new_key = g_malloc(sizeof(*new_key));
 *new_key = (uintptr_t)bus;
 /* No corresponding free() */
-vtd_bus = g_malloc0(sizeof(VTDBus) + sizeof(VTDAddressSpace *) * \
-PCI_DEVFN_MAX);
+vtd_bus = g_malloc0(sizeof(VTDBus));
 vtd_bus->bus = bus;
 g_hash_table_insert(s->vtd_as_by_busptr, new_key, vtd_bus);
 }
+return vtd_bus;
+}
 
+VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, PCIBus *bus, int devfn)
+{
+VTDBus *vtd_bus;
+VTDAddressSpace *vtd_dev_as;
+char name[128];
+
+vtd_bus = vtd_find_add_bus(s, bus);
 vtd_dev_as = vtd_bus->dev_as[devfn];
 
 if (!vtd_dev_as) {
@@ -3463,6 +3473,55 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 return ret;
 }
 
+static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
+ int devfn,
+ HostIOMMUContext *iommu_ctx)
+{
+IntelIOMMUState *s = opaque;
+VTDBus *vtd_bus;
+VTDHostIOMMUContext *vtd_dev_icx;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+vtd_bus = vtd_find_add_bus(s, bus);
+
+vtd_iommu_lock(s);
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+
+assert(!vtd_dev_icx);
+
+vtd_bus->dev_icx[devfn] = vtd_dev_icx =
+g_malloc0(sizeof(VTDHostIOMMUContext));
+vtd_dev_icx->vtd_bus = vtd_bus;
+vtd_dev_icx->devfn = (uint8_t)devfn;
+vtd_dev_icx->iommu_state = s;
+vtd_dev_icx->iommu_ctx = iommu_ctx;
+
+vtd_iommu_unlock(s);
+
+return 0;
+}
+
+static void vtd_dev_unset_iommu_context(PCIBus *bus, void *opaque, int devfn)
+{
+IntelIOMMUState *s = opaque;
+VTDBus *vtd_bus;
+VTDHostIOMMUContext *vtd_dev_icx;
+
+assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+
+vtd_bus = vtd_find_add_bus(s, bus);
+
+vtd_iommu_lock(s);
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+g_free(vtd_dev_icx);
+vtd_bus->dev_icx[devfn] = NULL;
+
+vtd_iommu_unlock(s);
+}
+
 static uint64_t get_naturally_aligned_size(uint64_t start,
uint64_t size, int gaw)
 {
@@ -3759,6 +3818,8 @@ static AddressSpace *vtd_host_dma_iommu(PCIBus *bus, void 
*opaque, int devfn)
 static PCIIOMMUOps vtd_iommu_ops = {
 .get_address_space = vtd_host_dma_iommu,
 .get_iommu_attr = vtd_dev_get_iommu_attr,
+.set_iommu_context = vtd_dev_set_iommu_context,
+.unset_iommu_context = vtd_dev_unset_iommu_context,
 };
 
 static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
diff --git a/include/hw/i386/intel_iommu.h b/include/hw/i386/intel_iommu.h
index 3870052..b5fefb9 100644
--- a/include/hw/i386/intel_iommu.h
+++ b/include/hw/i386/intel_iommu.h
@@ -64,6 +64,7 @@ typedef union VTD_IR_TableEntry VTD_IR_TableEntry;
 typedef union VTD_IR_MSIAddress VTD_IR_MSIAddress;
 typedef struct VTDPASIDDirEntry VTDPASIDDirEntry;
 typedef struct VTDPASIDEntry VTDPASIDEntry;
+typedef struct VTDHostIOMMUContext VTDHostIOMMUContext;
 
 /* Context-Entry */
 struct VTDContextEntry {
@@ -112,10 +113,20 @@ struct VTDAddressSpace {
 IOVATree *iova_tree;  /* Traces mapped IOVA ranges */
 };
 
+struct VTDHostIOMMUContext {
+VTDBus *vtd_bus;
+uint8_t devfn;
+HostIOMMUContext *iommu_ctx;
+IntelIOMMUState *iommu_state;
+};
+
 struct VTDBus {
-PCIBus* bus;   /* A reference to the bus to provide 
translation for */
+/* A reference to the bus to provide translation for */
+PCIBus *bus;
 /* A table of VTDAddressSpace objects indexed by devfn */
-VTDAddressSpace *dev_as[];
+VTDAddressSpace *dev_as[PCI_D

[RFC v9 14/25] intel_iommu: process PASID cache invalidation

2020-07-28 Thread Liu Yi L
This patch adds PASID cache invalidation handling. When the guest enables
PASID usage (e.g. SVA), guest software should issue a proper PASID cache
invalidation when caching mode is exposed. This patch only adds draft
handling of the PASID cache invalidation; detailed handling will be added
in subsequent patches.
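
The granularity field in bits 5:4 of val[0] selects how wide the
invalidation is. A small, purely illustrative decode helper using the
VTD_INV_DESC_PASIDC_* macros added by this patch:

    static const char *vtd_pc_inv_granu_name(uint64_t val0)
    {
        switch (val0 & VTD_INV_DESC_PASIDC_G) {
        case VTD_INV_DESC_PASIDC_DSI:      return "domain-selective";
        case VTD_INV_DESC_PASIDC_PASID_SI: return "pasid-selective";
        case VTD_INV_DESC_PASIDC_GLOBAL:   return "global";
        default:                           return "reserved";
        }
    }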

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) remove vtd_pasid_cache_gsi(), vtd_pasid_cache_psi()
   and vtd_pasid_cache_dsi()
---
 hw/i386/intel_iommu.c  | 40 +++-
 hw/i386/intel_iommu_internal.h | 12 
 hw/i386/trace-events   |  3 +++
 3 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 191d124..7efa98c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2395,6 +2395,37 @@ static bool vtd_process_iotlb_desc(IntelIOMMUState *s, 
VTDInvDesc *inv_desc)
 return true;
 }
 
+static bool vtd_process_pasid_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+if ((inv_desc->val[0] & VTD_INV_DESC_PASIDC_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PASIDC_RSVD_VAL1) ||
+(inv_desc->val[2] & VTD_INV_DESC_PASIDC_RSVD_VAL2) ||
+(inv_desc->val[3] & VTD_INV_DESC_PASIDC_RSVD_VAL3)) {
+error_report_once("non-zero-field-in-pc_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+switch (inv_desc->val[0] & VTD_INV_DESC_PASIDC_G) {
+case VTD_INV_DESC_PASIDC_DSI:
+break;
+
+case VTD_INV_DESC_PASIDC_PASID_SI:
+break;
+
+case VTD_INV_DESC_PASIDC_GLOBAL:
+break;
+
+default:
+error_report_once("invalid-inv-granu-in-pc_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -2501,12 +2532,11 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
-/*
- * TODO: the entity of below two cases will be implemented in future 
series.
- * To make guest (which integrates scalable mode support patch set in
- * iommu driver) work, just return true is enough so far.
- */
 case VTD_INV_DESC_PC:
+trace_vtd_inv_desc("pasid-cache", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_pasid_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_PIOTLB:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 64ac0a8..22d0bc5 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -445,6 +445,18 @@ typedef union VTDInvDesc VTDInvDesc;
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM | VTD_SL_TM)) : \
 (0x3800ULL | ~(VTD_HAW_MASK(aw) | VTD_SL_IGN_COM))
 
+#define VTD_INV_DESC_PASIDC_G  (3ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID(val) (((val) >> 32) & 0xfULL)
+#define VTD_INV_DESC_PASIDC_DID(val)   (((val) >> 16) & VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PASIDC_RSVD_VAL0  0xfff0ffc0ULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL1  0xULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL2  0xULL
+#define VTD_INV_DESC_PASIDC_RSVD_VAL3  0xULL
+
+#define VTD_INV_DESC_PASIDC_DSI(0ULL << 4)
+#define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
+#define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 71536a7..f7cd4e5 100644
--- a/hw/i386/trace-events
+++ b/hw/i386/trace-events
@@ -22,6 +22,9 @@ vtd_inv_qi_head(uint16_t head) "read head %d"
 vtd_inv_qi_tail(uint16_t head) "write tail %d"
 vtd_inv_qi_fetch(void) ""
 vtd_context_cache_reset(void) ""
+vtd_pasid_cache_gsi(void) ""
+vtd_pasid_cache_dsi(uint16_t domain) "Domain selective PC invalidation domain 0x%"PRIx16
+vtd_pasid_cache_psi(uint16_t domain, uint32_t pasid) "PASID selective PC invalidation domain 0x%"PRIx16" pasid 0x%"PRIx32
 vtd_re_not_present(uint8_t bus) "Root entry bus %"PRIu8" not present"
 vtd_ce_not_present(uint8_t bus, uint8_t devfn) "Context entry bus %"PRIu8" 
devfn %"PRIu8" not present"
 vtd_iotlb_page_hit(uint16_t sid, uint64_t addr, uint64_t slpte, uint16_t 
domain) "IOTLB page hit sid 0x%"PRIx16" iova 0x%"PRIx64" slpte 0x%"PRIx64" 
domain 0x%"PRIx16
-- 
2.7.4




[RFC v9 06/25] vfio: pass nesting requirement into vfio_get_group()

2020-07-28 Thread Liu Yi L
This patch passes the nesting requirement into vfio_get_group() to
indicate whether VFIO_TYPE1_NESTING_IOMMU is required.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/ap.c  | 2 +-
 hw/vfio/ccw.c | 2 +-
 hw/vfio/common.c  | 3 ++-
 hw/vfio/pci.c | 9 -
 hw/vfio/platform.c| 2 +-
 include/hw/vfio/vfio-common.h | 3 ++-
 6 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/hw/vfio/ap.c b/hw/vfio/ap.c
index b9330a8..2101a1c 100644
--- a/hw/vfio/ap.c
+++ b/hw/vfio/ap.c
@@ -82,7 +82,7 @@ static VFIOGroup *vfio_ap_get_group(VFIOAPDevice *vapdev, 
Error **errp)
 
 g_free(group_path);
 
-return vfio_get_group(groupid, &address_space_memory, errp);
+return vfio_get_group(groupid, &address_space_memory, false, errp);
 }
 
 static void vfio_ap_realize(DeviceState *dev, Error **errp)
diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c
index ff7f369..30d00a7 100644
--- a/hw/vfio/ccw.c
+++ b/hw/vfio/ccw.c
@@ -621,7 +621,7 @@ static VFIOGroup *vfio_ccw_get_group(S390CCWDevice *cdev, 
Error **errp)
 return NULL;
 }
 
-return vfio_get_group(groupid, &address_space_memory, errp);
+return vfio_get_group(groupid, &address_space_memory, false, errp);
 }
 
 static void vfio_ccw_realize(DeviceState *dev, Error **errp)
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 3335714..80d7a00 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1457,7 +1457,8 @@ static void vfio_disconnect_container(VFIOGroup *group)
 }
 }
 
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp)
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
+  bool want_nested, Error **errp)
 {
 VFIOGroup *group;
 char path[32];
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 2e561c0..8cd1e72 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2714,6 +2714,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 int groupid;
 int i, ret;
 bool is_mdev;
+bool want_nested;
 
 if (!vdev->vbasedev.sysfsdev) {
 if (!(~vdev->host.domain || ~vdev->host.bus ||
@@ -2771,7 +2772,13 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 
 trace_vfio_realize(vdev->vbasedev.name, groupid);
 
-group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev), 
errp);
+if (pci_device_get_iommu_attr(pdev,
+ IOMMU_WANT_NESTING, &want_nested)) {
+want_nested = false;
+}
+
+group = vfio_get_group(groupid, pci_device_iommu_address_space(pdev),
+   want_nested, errp);
 if (!group) {
 goto error;
 }
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index ac2cefc..7ad7702 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -580,7 +580,7 @@ static int vfio_base_device_init(VFIODevice *vbasedev, 
Error **errp)
 
 trace_vfio_platform_base_device_init(vbasedev->name, groupid);
 
-group = vfio_get_group(groupid, &address_space_memory, errp);
+group = vfio_get_group(groupid, &address_space_memory, false, errp);
 if (!group) {
 return -ENOENT;
 }
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index c78f3ff..bdb09f4 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -174,7 +174,8 @@ void vfio_region_mmaps_set_enabled(VFIORegion *region, bool 
enabled);
 void vfio_region_exit(VFIORegion *region);
 void vfio_region_finalize(VFIORegion *region);
 void vfio_reset_handler(void *opaque);
-VFIOGroup *vfio_get_group(int groupid, AddressSpace *as, Error **errp);
+VFIOGroup *vfio_get_group(int groupid, AddressSpace *as,
+  bool want_nested, Error **errp);
 void vfio_put_group(VFIOGroup *group);
 int vfio_get_device(VFIOGroup *group, const char *name,
 VFIODevice *vbasedev, Error **errp);
-- 
2.7.4




[RFC v9 12/25] vfio: init HostIOMMUContext per-container

2020-07-28 Thread Liu Yi L
In this patch, QEMU first gets the IOMMU info from the kernel to check
the capabilities supported by a VFIO_IOMMU_TYPE1_NESTING IOMMU, and then
initializes a HostIOMMUContext instance.

For vfio-pci devices, pci_device_set/unset_iommu_context() can be used
to expose the host IOMMU context to vIOMMU emulators. vIOMMU emulators
can make use of the methods provided by the host IOMMU context, e.g. to
propagate requests to the host IOMMU.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/common.c | 113 +++
 hw/vfio/pci.c|  17 +
 2 files changed, 130 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 41aaf41..9d90732 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1227,10 +1227,102 @@ static int 
vfio_host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+/**
+ * Get iommu info from host. Caller of this function should free
+ * the memory pointed to by the returned pointer stored in @info
+ * after a successful call, when finished with it.
+ */
+static int vfio_get_iommu_info(VFIOContainer *container,
+ struct vfio_iommu_type1_info **info)
+{
+
+size_t argsz = sizeof(struct vfio_iommu_type1_info);
+
+*info = g_malloc0(argsz);
+
+retry:
+(*info)->argsz = argsz;
+
+if (ioctl(container->fd, VFIO_IOMMU_GET_INFO, *info)) {
+g_free(*info);
+*info = NULL;
+return -errno;
+}
+
+if (((*info)->argsz > argsz)) {
+argsz = (*info)->argsz;
+*info = g_realloc(*info, argsz);
+goto retry;
+}
+
+return 0;
+}
+
+static struct vfio_info_cap_header *
+vfio_get_iommu_info_cap(struct vfio_iommu_type1_info *info, uint16_t id)
+{
+struct vfio_info_cap_header *hdr;
+void *ptr = info;
+
+if (!(info->flags & VFIO_IOMMU_INFO_CAPS)) {
+return NULL;
+}
+
+for (hdr = ptr + info->cap_offset; hdr != ptr; hdr = ptr + hdr->next) {
+if (hdr->id == id) {
+return hdr;
+}
+}
+
+return NULL;
+}
+
+static int vfio_get_nesting_iommu_cap(VFIOContainer *container,
+   struct vfio_iommu_type1_info_cap_nesting **cap_nesting)
+{
+struct vfio_iommu_type1_info *info;
+struct vfio_info_cap_header *hdr;
+struct vfio_iommu_type1_info_cap_nesting *cap;
+struct iommu_nesting_info *nest_info;
+int ret;
+uint32_t minsz, cap_size;
+
+ret = vfio_get_iommu_info(container, &info);
+if (ret) {
+return ret;
+}
+
+hdr = vfio_get_iommu_info_cap(info,
+VFIO_IOMMU_TYPE1_INFO_CAP_NESTING);
+if (!hdr) {
+g_free(info);
+return -EINVAL;
+}
+
+cap = container_of(hdr,
+struct vfio_iommu_type1_info_cap_nesting, header);
+
+nest_info = &cap->info;
+minsz = offsetof(struct iommu_nesting_info, data);
+if (nest_info->argsz < minsz) {
+g_free(info);
+return -EINVAL;
+}
+
+cap_size = offsetof(struct vfio_iommu_type1_info_cap_nesting, info) +
+   nest_info->argsz;
+*cap_nesting = g_malloc0(cap_size);
+memcpy(*cap_nesting, cap, cap_size);
+
+g_free(info);
+return 0;
+}
+
 static int vfio_init_container(VFIOContainer *container, int group_fd,
bool want_nested, Error **errp)
 {
 int iommu_type, ret;
+uint64_t flags = 0;
 
 iommu_type = vfio_get_iommu_type(container, want_nested, errp);
 if (iommu_type < 0) {
@@ -1258,6 +1350,27 @@ static int vfio_init_container(VFIOContainer *container, 
int group_fd,
 return -errno;
 }
 
+if (iommu_type == VFIO_TYPE1_NESTING_IOMMU) {
+struct vfio_iommu_type1_info_cap_nesting *nesting = NULL;
+struct iommu_nesting_info *nest_info;
+
+ret = vfio_get_nesting_iommu_cap(container, &nesting);
+if (ret) {
+error_setg_errno(errp, -ret,
+ "Failed to get nesting iommu cap");
+return ret;
+}
+
+nest_info = (struct iommu_nesting_info *) &nesting->info;
+flags |= (nest_info->features & IOMMU_NESTING_FEAT_SYSWIDE_PASID) ?
+ HOST_IOMMU_PASID_REQUEST : 0;
+host_iommu_ctx_init(&container->iommu_ctx,
+sizeof(container->iommu_ctx),
+TYPE_VFIO_HOST_IOMMU_CONTEXT,
+flags);
+g_free(nesting);
+}
+
 container->iommu_type = iommu_type;
 return 0;
 }
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8cd1e72..f954c28 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -2707,6 +2707,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
 VFIOPCIDevice *vdev = PCI_VFIO(pdev);
 VFIODevice *vbasedev_iter;
 VFIOGroup *group;
+VFIOContainer *container;
 char *tmp, *subsys, group_path[PATH_MAX], *group_name;
 

[RFC v9 11/25] vfio/common: provide PASID alloc/free hooks

2020-07-28 Thread Liu Yi L
This patch defines vfio_host_iommu_context_info and implements the PASID
alloc/free hooks defined in HostIOMMUContextClass.
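
From the vIOMMU side the hooks are reached through the HostIOMMUContext
wrappers. A minimal usage sketch, assuming an iommu_ctx obtained via the
PCI layer and illustrative min/max bounds for the PASID range:

    uint32_t pasid;

    if (!host_iommu_ctx_pasid_alloc(iommu_ctx, min, max, &pasid)) {
        /* ... program pasid into a PASID table entry ... */
        host_iommu_ctx_pasid_free(iommu_ctx, pasid);
    }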

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Signed-off-by: Liu Yi L 
---
 hw/vfio/common.c  | 66 +++
 include/hw/iommu/host_iommu_context.h |  3 ++
 include/hw/vfio/vfio-common.h |  4 +++
 3 files changed, 73 insertions(+)

diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index af91eca..41aaf41 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1183,6 +1183,50 @@ static int vfio_get_iommu_type(VFIOContainer *container,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_pasid_alloc(HostIOMMUContext *iommu_ctx,
+   uint32_t min, uint32_t max,
+   uint32_t *pasid)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_pasid_request req;
+int ret = 0;
+
+req.argsz = sizeof(req);
+req.flags = VFIO_IOMMU_FLAG_ALLOC_PASID;
+req.range.min = min;
+req.range.max = max;
+
+ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
+if (ret < 0) {
+error_report("%s: alloc failed (%m)", __func__);
+return ret;
+}
+*pasid = ret;
+return 0;
+}
+
+static int vfio_host_iommu_ctx_pasid_free(HostIOMMUContext *iommu_ctx,
+  uint32_t pasid)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_pasid_request req;
+
+int ret = 0;
+
+req.argsz = sizeof(req);
+req.flags = VFIO_IOMMU_FLAG_FREE_PASID;
+req.range.min = pasid;
+req.range.max = pasid + 1;
+
+ret = ioctl(container->fd, VFIO_IOMMU_PASID_REQUEST, &req);
+if (ret) {
+error_report("%s: free failed (%m)", __func__);
+}
+return ret;
+}
+
 static int vfio_init_container(VFIOContainer *container, int group_fd,
bool want_nested, Error **errp)
 {
@@ -1802,3 +1846,25 @@ int vfio_eeh_as_op(AddressSpace *as, uint32_t op)
 }
 return vfio_eeh_container_op(container, op);
 }
+
+static void vfio_host_iommu_context_class_init(ObjectClass *klass,
+   void *data)
+{
+HostIOMMUContextClass *hicxc = HOST_IOMMU_CONTEXT_CLASS(klass);
+
+hicxc->pasid_alloc = vfio_host_iommu_ctx_pasid_alloc;
+hicxc->pasid_free = vfio_host_iommu_ctx_pasid_free;
+}
+
+static const TypeInfo vfio_host_iommu_context_info = {
+.parent = TYPE_HOST_IOMMU_CONTEXT,
+.name = TYPE_VFIO_HOST_IOMMU_CONTEXT,
+.class_init = vfio_host_iommu_context_class_init,
+};
+
+static void vfio_register_types(void)
+{
+type_register_static(&vfio_host_iommu_context_info);
+}
+
+type_init(vfio_register_types)
diff --git a/include/hw/iommu/host_iommu_context.h 
b/include/hw/iommu/host_iommu_context.h
index 35c4861..227c433 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -33,6 +33,9 @@
 #define TYPE_HOST_IOMMU_CONTEXT "qemu:host-iommu-context"
 #define HOST_IOMMU_CONTEXT(obj) \
 OBJECT_CHECK(HostIOMMUContext, (obj), TYPE_HOST_IOMMU_CONTEXT)
+#define HOST_IOMMU_CONTEXT_CLASS(klass) \
+OBJECT_CLASS_CHECK(HostIOMMUContextClass, (klass), \
+ TYPE_HOST_IOMMU_CONTEXT)
 #define HOST_IOMMU_CONTEXT_GET_CLASS(obj) \
 OBJECT_GET_CLASS(HostIOMMUContextClass, (obj), \
  TYPE_HOST_IOMMU_CONTEXT)
diff --git a/include/hw/vfio/vfio-common.h b/include/hw/vfio/vfio-common.h
index bdb09f4..a5eaf35 100644
--- a/include/hw/vfio/vfio-common.h
+++ b/include/hw/vfio/vfio-common.h
@@ -26,12 +26,15 @@
 #include "qemu/notify.h"
 #include "ui/console.h"
 #include "hw/display/ramfb.h"
+#include "hw/iommu/host_iommu_context.h"
 #ifdef CONFIG_LINUX
#include <linux/vfio.h>
 #endif
 
 #define VFIO_MSG_PREFIX "vfio %s: "
 
+#define TYPE_VFIO_HOST_IOMMU_CONTEXT "qemu:vfio-host-iommu-context"
+
 enum {
 VFIO_DEVICE_TYPE_PCI = 0,
 VFIO_DEVICE_TYPE_PLATFORM = 1,
@@ -71,6 +74,7 @@ typedef struct VFIOContainer {
 MemoryListener listener;
 MemoryListener prereg_listener;
 unsigned iommu_type;
+HostIOMMUContext iommu_ctx;
 Error *error;
 bool initialized;
 unsigned long pgsizes;
-- 
2.7.4




[RFC v9 02/25] header file update VFIO/IOMMU vSVA APIs kernel 5.8-rc6

2020-07-28 Thread Liu Yi L
The kernel uapi/linux/iommu.h header file includes the
extensions for vSVA support, e.g. the bind gpasid and iommu
fault reporting related user structures.

This commit updates kernel headers from the below branch:
https://github.com/luxis1999/linux-vsva.git: vsva-linux-5.8-rc6-v6

Note: this should be replaced with a full header file update once
the vSVA uAPI is stable.

TODO: add the note for the Linux version.
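
For orientation, a consumer of this uAPI decodes a reported fault by
switching on the type field. A hedged sketch based only on the struct
layout imported below:

    static void handle_iommu_fault(struct iommu_fault *f)
    {
        switch (f->type) {
        case IOMMU_FAULT_DMA_UNRECOV:
            /* f->event.reason / f->event.addr describe the error */
            break;
        case IOMMU_FAULT_PAGE_REQ:
            /* f->prm.pasid / f->prm.addr describe the page request */
            break;
        }
    }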

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Cc: Cornelia Huck 
Cc: Paolo Bonzini 
Signed-off-by: Liu Yi L 
---
 linux-headers/linux/iommu.h | 409 
 linux-headers/linux/vfio.h  |  92 +-
 2 files changed, 500 insertions(+), 1 deletion(-)
 create mode 100644 linux-headers/linux/iommu.h

diff --git a/linux-headers/linux/iommu.h b/linux-headers/linux/iommu.h
new file mode 100644
index 000..82b6a45
--- /dev/null
+++ b/linux-headers/linux/iommu.h
@@ -0,0 +1,409 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+/*
+ * IOMMU user API definitions
+ */
+
+#ifndef _IOMMU_H
+#define _IOMMU_H
+
+#include <linux/types.h>
+
+#define IOMMU_FAULT_PERM_READ  (1 << 0) /* read */
+#define IOMMU_FAULT_PERM_WRITE (1 << 1) /* write */
+#define IOMMU_FAULT_PERM_EXEC  (1 << 2) /* exec */
+#define IOMMU_FAULT_PERM_PRIV  (1 << 3) /* privileged */
+
+/* Generic fault types, can be expanded IRQ remapping fault */
+enum iommu_fault_type {
+   IOMMU_FAULT_DMA_UNRECOV = 1,/* unrecoverable fault */
+   IOMMU_FAULT_PAGE_REQ,   /* page request fault */
+};
+
+enum iommu_fault_reason {
+   IOMMU_FAULT_REASON_UNKNOWN = 0,
+
+   /* Could not access the PASID table (fetch caused external abort) */
+   IOMMU_FAULT_REASON_PASID_FETCH,
+
+   /* PASID entry is invalid or has configuration errors */
+   IOMMU_FAULT_REASON_BAD_PASID_ENTRY,
+
+   /*
+* PASID is out of range (e.g. exceeds the maximum PASID
+* supported by the IOMMU) or disabled.
+*/
+   IOMMU_FAULT_REASON_PASID_INVALID,
+
+   /*
+* An external abort occurred fetching (or updating) a translation
+* table descriptor
+*/
+   IOMMU_FAULT_REASON_WALK_EABT,
+
+   /*
+* Could not access the page table entry (Bad address),
+* actual translation fault
+*/
+   IOMMU_FAULT_REASON_PTE_FETCH,
+
+   /* Protection flag check failed */
+   IOMMU_FAULT_REASON_PERMISSION,
+
+   /* access flag check failed */
+   IOMMU_FAULT_REASON_ACCESS,
+
+   /* Output address of a translation stage caused Address Size fault */
+   IOMMU_FAULT_REASON_OOR_ADDRESS,
+};
+
+/**
+ * struct iommu_fault_unrecoverable - Unrecoverable fault data
+ * @reason: reason of the fault, from &enum iommu_fault_reason
+ * @flags: parameters of this fault (IOMMU_FAULT_UNRECOV_* values)
+ * @pasid: Process Address Space ID
+ * @perm: requested permission access used by the incoming transaction
+ *(IOMMU_FAULT_PERM_* values)
+ * @addr: offending page address
+ * @fetch_addr: address that caused a fetch abort, if any
+ */
+struct iommu_fault_unrecoverable {
+   __u32   reason;
+#define IOMMU_FAULT_UNRECOV_PASID_VALID(1 << 0)
+#define IOMMU_FAULT_UNRECOV_ADDR_VALID (1 << 1)
+#define IOMMU_FAULT_UNRECOV_FETCH_ADDR_VALID   (1 << 2)
+   __u32   flags;
+   __u32   pasid;
+   __u32   perm;
+   __u64   addr;
+   __u64   fetch_addr;
+};
+
+/**
+ * struct iommu_fault_page_request - Page Request data
+ * @flags: encodes whether the corresponding fields are valid and whether this
+ * is the last page in group (IOMMU_FAULT_PAGE_REQUEST_* values)
+ * @pasid: Process Address Space ID
+ * @grpid: Page Request Group Index
+ * @perm: requested page permissions (IOMMU_FAULT_PERM_* values)
+ * @addr: page address
+ * @private_data: device-specific private information
+ */
+struct iommu_fault_page_request {
+#define IOMMU_FAULT_PAGE_REQUEST_PASID_VALID   (1 << 0)
+#define IOMMU_FAULT_PAGE_REQUEST_LAST_PAGE (1 << 1)
+#define IOMMU_FAULT_PAGE_REQUEST_PRIV_DATA (1 << 2)
+   __u32   flags;
+   __u32   pasid;
+   __u32   grpid;
+   __u32   perm;
+   __u64   addr;
+   __u64   private_data[2];
+};
+
+/**
+ * struct iommu_fault - Generic fault data
+ * @type: fault type from &enum iommu_fault_type
+ * @padding: reserved for future use (should be zero)
+ * @event: fault event, when @type is %IOMMU_FAULT_DMA_UNRECOV
+ * @prm: Page Request message, when @type is %IOMMU_FAULT_PAGE_REQ
+ * @padding2: sets the fault size to allow for future extensions
+ */
+struct iommu_fault {
+   __u32   type;
+   __u32   padding;
+   union {
+   struct iommu_fault_unrecoverable event;
+   struct iommu_fault_page_request prm;
+   __u8 padding2[56];
+   };
+};
+
+/**
+ * enum iommu_page_response_code - Return status of fault h

[RFC v9 01/25] scripts/update-linux-headers: Import iommu.h

2020-07-28 Thread Liu Yi L
From: Eric Auger 

Update the script to import the new iommu.h uapi header.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Michael S. Tsirkin 
Cc: Cornelia Huck 
Cc: Paolo Bonzini 
Acked-by: Cornelia Huck 
Signed-off-by: Eric Auger 
---
 scripts/update-linux-headers.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/update-linux-headers.sh b/scripts/update-linux-headers.sh
index 29c27f4..5b64ee3 100755
--- a/scripts/update-linux-headers.sh
+++ b/scripts/update-linux-headers.sh
@@ -141,7 +141,7 @@ done
 
 rm -rf "$output/linux-headers/linux"
 mkdir -p "$output/linux-headers/linux"
-for header in kvm.h vfio.h vfio_ccw.h vhost.h \
+for header in kvm.h vfio.h vfio_ccw.h vhost.h iommu.h \
   psci.h psp-sev.h userfaultfd.h mman.h; do
 cp "$tmpdir/include/linux/$header" "$output/linux-headers/linux"
 done
-- 
2.7.4




[RFC v9 00/25] intel_iommu: expose Shared Virtual Addressing to VMs

2020-07-28 Thread Liu Yi L
UContext, and rename DualStageIOMMUObject to 
HostIOMMUContext.
 HostIOMMUContext is per-vfio-container; it is exposed to vIOMMU via the
 PCI layer. VFIO registers a PCIHostIOMMUFunc callback to the PCI layer,
 and vIOMMU could get the HostIOMMUContext instance via it.
  b) Check IOMMU uAPI version by VFIO_CHECK_EXTENSION
  c) Add a check on VFIO_PASID_REQ availability via VFIO_IOMMU_GET_INFO
  d) Reordered the series: put the vSVA linux header file update at the
 beginning and the x-scalable-mode option modification at the end of
 the series.
  e) Dropped patch "[RFC v3 01/25] hw/pci: modify pci_setup_iommu() to 
set PCIIOMMUOps"
  RFCv3: https://patchwork.kernel.org/cover/11356033/

- RFC v2 -> v3:
  a) Introduce DualStageIOMMUObject to abstract the host IOMMU 
programming
  capability. e.g. request PASID from host, setup IOMMU nesting 
translation
  on host IOMMU. The pasid_alloc/bind_guest_page_table/iommu_cache_flush
  operations are moved to be DualStageIOMMUOps. Thus, 
DualStageIOMMUObject
  is an abstract layer which provides QEMU vIOMMU emulators with an 
explicit
  method to program host IOMMU.
  b) Compared with RFC v2, the IOMMUContext has also been updated. It is
  modified to provide an abstract for vIOMMU emulators. It provides the
  method for pass-through modules (like VFIO) to communicate with host 
IOMMU.
  e.g. tell vIOMMU emulators about the IOMMU nesting capability on host 
side
  and report the host IOMMU DMA translation faults to vIOMMU emulators.
  RFC v2: https://www.spinics.net/lists/kvm/msg198556.html

- RFC v1 -> v2:
  Introduce IOMMUContext to abstract the connection between VFIO
  and vIOMMU emulators, which is a replacement of the PCIPASIDOps
  in RFC v1. Modify x-scalable-mode to be string option instead of
  adding a new option as RFC v1 did. Refined the pasid cache management

---
Eric Auger (1):
  scripts/update-linux-headers: Import iommu.h

Liu Yi L (24):
  header file update VFIO/IOMMU vSVA APIs kernel 5.8-rc6
  hw/pci: modify pci_setup_iommu() to set PCIIOMMUOps
  hw/pci: introduce pci_device_get_iommu_attr()
  intel_iommu: add get_iommu_attr() callback
  vfio: pass nesting requirement into vfio_get_group()
  vfio: check VFIO_TYPE1_NESTING_IOMMU support
  hw/iommu: introduce HostIOMMUContext
  hw/pci: introduce pci_device_set/unset_iommu_context()
  intel_iommu: add set/unset_iommu_context callback
  vfio/common: provide PASID alloc/free hooks
  vfio: init HostIOMMUContext per-container
  intel_iommu: add virtual command capability support
  intel_iommu: process PASID cache invalidation
  intel_iommu: add PASID cache management infrastructure
  vfio: add bind stage-1 page table support
  intel_iommu: sync IOMMU nesting cap info for assigned devices
  intel_iommu: bind/unbind guest page table to host
  intel_iommu: replay pasid binds after context cache invalidation
  intel_iommu: do not pass down pasid bind for PASID #0
  vfio: add support for flush iommu stage-1 cache
  intel_iommu: process PASID-based iotlb invalidation
  intel_iommu: propagate PASID-based iotlb invalidation to host
  intel_iommu: process PASID-based Device-TLB invalidation
  intel_iommu: modify x-scalable-mode to be string option

 hw/Makefile.objs  |1 +
 hw/alpha/typhoon.c|6 +-
 hw/arm/smmu-common.c  |6 +-
 hw/hppa/dino.c|6 +-
 hw/i386/amd_iommu.c   |6 +-
 hw/i386/intel_iommu.c | 1233 -
 hw/i386/intel_iommu_internal.h|  131 
 hw/i386/trace-events  |6 +
 hw/iommu/Makefile.objs|1 +
 hw/iommu/host_iommu_context.c |  171 +
 hw/pci-host/designware.c  |6 +-
 hw/pci-host/pnv_phb3.c|6 +-
 hw/pci-host/pnv_phb4.c|6 +-
 hw/pci-host/ppce500.c |6 +-
 hw/pci-host/prep.c|6 +-
 hw/pci-host/sabre.c   |6 +-
 hw/pci/pci.c  |   73 +-
 hw/ppc/ppc440_pcix.c  |6 +-
 hw/ppc/spapr_pci.c|6 +-
 hw/s390x/s390-pci-bus.c   |8 +-
 hw/vfio/ap.c  |2 +-
 hw/vfio/ccw.c |2 +-
 hw/vfio/common.c  |  299 +++-
 hw/vfio/pci.c |   26 +-
 hw/vfio/platform.c|2 +-
 hw/virtio/virtio-iommu.c  |6 +-
 include/hw/i386/intel_iommu.h |   61 +-
 include/hw/iommu/host_iommu_context.h |  103 +++
 include/hw/pci/pci.h  |   25 +-
 include/hw/pci/pci_bus.h  |2 +-
 include/hw/vfio/vfio-common.h |7 +-
 linux-headers/

[RFC v9 04/25] hw/pci: introduce pci_device_get_iommu_attr()

2020-07-28 Thread Liu Yi L
This patch adds pci_device_get_iommu_attr() to get vIOMMU attributes,
e.g. whether a nesting IOMMU is wanted.
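
A caller passes a pointer whose type matches the queried attribute; for
IOMMU_WANT_NESTING that is a bool. A minimal sketch of the intended use
(the same pattern VFIO adopts later in this series), falling back to
false when the vIOMMU does not implement the callback:

    bool want_nested;

    if (pci_device_get_iommu_attr(pdev, IOMMU_WANT_NESTING,
                                  &want_nested)) {
        want_nested = false;    /* -ENOENT: attribute not supported */
    }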

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Michael S. Tsirkin 
Signed-off-by: Liu Yi L 
---
 hw/pci/pci.c | 35 ++-
 include/hw/pci/pci.h |  7 +++
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index b2a2077..3c27805 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -2659,7 +2659,8 @@ static void pci_device_class_base_init(ObjectClass 
*klass, void *data)
 }
 }
 
-AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+static void pci_device_get_iommu_bus_devfn(PCIDevice *dev,
+  PCIBus **pbus, uint8_t *pdevfn)
 {
 PCIBus *bus = pci_get_bus(dev);
 PCIBus *iommu_bus = bus;
@@ -2710,14 +2711,38 @@ AddressSpace *pci_device_iommu_address_space(PCIDevice 
*dev)
 
 iommu_bus = parent_bus;
 }
-if (iommu_bus && iommu_bus->iommu_ops &&
- iommu_bus->iommu_ops->get_address_space) {
-return iommu_bus->iommu_ops->get_address_space(bus,
- iommu_bus->iommu_opaque, devfn);
+*pbus = iommu_bus;
+*pdevfn = devfn;
+}
+
+AddressSpace *pci_device_iommu_address_space(PCIDevice *dev)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->get_address_space) {
+return bus->iommu_ops->get_address_space(bus,
+bus->iommu_opaque, devfn);
 }
return &address_space_memory;
 }
 
+int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data)
+{
+PCIBus *bus;
+uint8_t devfn;
+
+pci_device_get_iommu_bus_devfn(dev, &bus, &devfn);
+if (bus && bus->iommu_ops &&
+bus->iommu_ops->get_iommu_attr) {
+return bus->iommu_ops->get_iommu_attr(bus, bus->iommu_opaque,
+   devfn, attr, data);
+}
+return -ENOENT;
+}
+
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *ops, void *opaque)
 {
 bus->iommu_ops = ops;
diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index 18cfba5..822ada5 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -486,13 +486,20 @@ void pci_bus_get_w64_range(PCIBus *bus, Range *range);
 
 void pci_device_deassert_intx(PCIDevice *dev);
 
+typedef enum IOMMUAttr {
+IOMMU_WANT_NESTING,
+} IOMMUAttr;
+
 typedef struct PCIIOMMUOps PCIIOMMUOps;
 struct PCIIOMMUOps {
 AddressSpace * (*get_address_space)(PCIBus *bus,
 void *opaque, int32_t devfn);
+int (*get_iommu_attr)(PCIBus *bus, void *opaque, int32_t devfn,
+   IOMMUAttr attr, void *data);
 };
 
 AddressSpace *pci_device_iommu_address_space(PCIDevice *dev);
+int pci_device_get_iommu_attr(PCIDevice *dev, IOMMUAttr attr, void *data);
 void pci_setup_iommu(PCIBus *bus, const PCIIOMMUOps *iommu_ops, void *opaque);
 
 static inline void
-- 
2.7.4




[RFC v8 24/25] intel_iommu: process PASID-based Device-TLB invalidation

2020-07-12 Thread Liu Yi L
This patch adds empty handling for PASID-based Device-TLB
invalidation. For now this is enough, as it is not necessary to
propagate it to the host for passthrough devices, and no emulated
device has a device TLB.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 18 ++
 hw/i386/intel_iommu_internal.h |  1 +
 2 files changed, 19 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index d3c41a6..2bbb4b1 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3213,6 +3213,17 @@ static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
 return true;
 }
 
+static bool vtd_process_device_piotlb_desc(IntelIOMMUState *s,
+   VTDInvDesc *inv_desc)
+{
+/*
+ * No need to handle it for passthrough devices. For emulated
+ * devices with a device TLB it may be required, but for now
+ * returning true is enough.
+ */
+return true;
+}
+
 static bool vtd_process_device_iotlb_desc(IntelIOMMUState *s,
   VTDInvDesc *inv_desc)
 {
@@ -3334,6 +3345,13 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 }
 break;
 
+case VTD_INV_DESC_DEV_PIOTLB:
+trace_vtd_inv_desc("device-piotlb", inv_desc.hi, inv_desc.lo);
+if (!vtd_process_device_piotlb_desc(s, _desc)) {
+return false;
+}
+break;
+
 case VTD_INV_DESC_DEVICE:
 trace_vtd_inv_desc("device", inv_desc.hi, inv_desc.lo);
 if (!vtd_process_device_iotlb_desc(s, _desc)) {
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 08ff58e..9b4fc67 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -405,6 +405,7 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_WAIT   0x5 /* Invalidation Wait Descriptor */
 #define VTD_INV_DESC_PIOTLB 0x6 /* PASID-IOTLB Invalidate Desc */
 #define VTD_INV_DESC_PC 0x7 /* PASID-cache Invalidate Desc */
+#define VTD_INV_DESC_DEV_PIOTLB 0x8 /* PASID-based-DIOTLB inv_desc*/
 #define VTD_INV_DESC_NONE   0   /* Not an Invalidate Descriptor */
 
 /* Masks for Invalidation Wait Descriptor*/
-- 
2.7.4




[RFC v8 25/25] intel_iommu: modify x-scalable-mode to be string option

2020-07-12 Thread Liu Yi L
Intel VT-d 3.0 introduces scalable mode, which has a bunch of capabilities
related to scalable-mode translation, and thus multiple possible
combinations. This vIOMMU implementation simplifies that choice for the
user by providing typical combinations, configurable through the
"x-scalable-mode" option. The usage is as below:

"-device intel-iommu,x-scalable-mode=["legacy"|"modern"|"off"]"

 - "legacy": gives support for the SL page table
 - "modern": gives support for the FL page table, pasid, virtual command
 - "off": no scalable mode support
 - if not configured, there is no scalable mode support; if improperly
   configured, an error is thrown
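
For example, an illustrative command line exposing the modern
configuration together with caching mode (other options elided):

    qemu-system-x86_64 -machine q35,kernel-irqchip=split \
        -device intel-iommu,x-scalable-mode=modern,caching-mode=on ...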

Note: this patch is supposed to be merged when the whole vSVA patch series
is merged.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
rfcv5 (v2) -> rfcv6:
*) reports want_nested to VFIO;
*) assert iommu_set/unset_iommu_context() if vIOMMU is not scalable modern.
---
 hw/i386/intel_iommu.c  | 39 +++
 hw/i386/intel_iommu_internal.h |  3 +++
 include/hw/i386/intel_iommu.h  |  2 ++
 3 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 2bbb4b1..d807484 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4050,7 +4050,7 @@ static Property vtd_properties[] = {
 DEFINE_PROP_UINT8("aw-bits", IntelIOMMUState, aw_bits,
   VTD_HOST_ADDRESS_WIDTH),
 DEFINE_PROP_BOOL("caching-mode", IntelIOMMUState, caching_mode, FALSE),
-DEFINE_PROP_BOOL("x-scalable-mode", IntelIOMMUState, scalable_mode, FALSE),
+DEFINE_PROP_STRING("x-scalable-mode", IntelIOMMUState, scalable_mode_str),
 DEFINE_PROP_BOOL("dma-drain", IntelIOMMUState, dma_drain, true),
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -4420,6 +4420,7 @@ VTDAddressSpace *vtd_find_add_as(IntelIOMMUState *s, 
PCIBus *bus, int devfn)
 static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
IOMMUAttr attr, void *data)
 {
+IntelIOMMUState *s = opaque;
 int ret = 0;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
@@ -4429,8 +4430,7 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void 
*opaque, int32_t devfn,
 {
 bool *pdata = data;
 
-/* return false until vSVA is ready */
-*pdata = false;
+*pdata = s->scalable_modern ? true : false;
 break;
 }
 default:
@@ -4526,6 +4526,8 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void 
*opaque,
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable mode supports set_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4560,6 +4562,8 @@ static void vtd_dev_unset_iommu_context(PCIBus *bus, void 
*opaque, int devfn)
 VTDHostIOMMUContext *vtd_dev_icx;
 
 assert(0 <= devfn && devfn < PCI_DEVFN_MAX);
+/* only modern scalable mode supports unset_iommu_context */
+assert(s->scalable_modern);
 
 vtd_bus = vtd_find_add_bus(s, bus);
 
@@ -4787,8 +4791,13 @@ static void vtd_init(IntelIOMMUState *s)
 }
 
 /* TODO: read cap/ecap from host to decide which cap to be exposed. */
-if (s->scalable_mode) {
+if (s->scalable_mode && !s->scalable_modern) {
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
+} else if (s->scalable_mode && s->scalable_modern) {
+s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_PASID |
+   VTD_ECAP_FLTS | VTD_ECAP_PSS(VTD_PASID_SS) |
+   VTD_ECAP_VCS;
+s->vccap |= VTD_VCCAP_PAS;
 }
 
 if (!s->cap_finalized) {
@@ -4929,6 +4938,28 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error 
**errp)
 return false;
 }
 
+if (s->scalable_mode_str &&
+(strcmp(s->scalable_mode_str, "off") &&
+ strcmp(s->scalable_mode_str, "modern") &&
+ strcmp(s->scalable_mode_str, "legacy"))) {
+error_setg(errp, "Invalid x-scalable-mode config, "
+ "please use \"modern\", \"legacy\" or \"off\"");
+return false;
+}
+
+if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "legacy")) {
+s->scalable_mode = true;
+s->scalable_modern = false;
+} else if (s->scalable_mode_str &&
+!strcmp(s->scalable_mode_str, "modern")) {
+s->scalable_mode = true;
+s->scalable_mod

[RFC v8 22/25] intel_iommu: process PASID-based iotlb invalidation

2020-07-12 Thread Liu Yi L
This patch adds basic PASID-based IOTLB (piotlb) invalidation
support. The piotlb is used when walking the Intel VT-d first-level
page table. This patch only adds the basic processing; detailed
handling will be added in the next patch.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 53 ++
 hw/i386/intel_iommu_internal.h | 13 +++
 2 files changed, 66 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 47af7b1..e6364ee 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3038,6 +3038,55 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return true;
 }
 
+static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
+uint16_t domain_id,
+uint32_t pasid)
+{
+}
+
+static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
+   uint32_t pasid, hwaddr addr, uint8_t am,
+   bool ih)
+{
+}
+
+static bool vtd_process_piotlb_desc(IntelIOMMUState *s,
+VTDInvDesc *inv_desc)
+{
+uint16_t domain_id;
+uint32_t pasid;
+uint8_t am;
+hwaddr addr;
+
+if ((inv_desc->val[0] & VTD_INV_DESC_PIOTLB_RSVD_VAL0) ||
+(inv_desc->val[1] & VTD_INV_DESC_PIOTLB_RSVD_VAL1)) {
+error_report_once("non-zero-field-in-piotlb_inv_desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+
+domain_id = VTD_INV_DESC_PIOTLB_DID(inv_desc->val[0]);
+pasid = VTD_INV_DESC_PIOTLB_PASID(inv_desc->val[0]);
+switch (inv_desc->val[0] & VTD_INV_DESC_IOTLB_G) {
+case VTD_INV_DESC_PIOTLB_ALL_IN_PASID:
+vtd_piotlb_pasid_invalidate(s, domain_id, pasid);
+break;
+
+case VTD_INV_DESC_PIOTLB_PSI_IN_PASID:
+am = VTD_INV_DESC_PIOTLB_AM(inv_desc->val[1]);
+addr = (hwaddr) VTD_INV_DESC_PIOTLB_ADDR(inv_desc->val[1]);
+vtd_piotlb_page_invalidate(s, domain_id, pasid, addr, am,
+   VTD_INV_DESC_PIOTLB_IH(inv_desc->val[1]));
+break;
+
+default:
+error_report_once("Invalid granularity in P-IOTLB desc hi: 0x%" PRIx64
+  " lo: 0x%" PRIx64, inv_desc->val[1], inv_desc->val[0]);
+return false;
+}
+return true;
+}
+
 static bool vtd_process_inv_iec_desc(IntelIOMMUState *s,
  VTDInvDesc *inv_desc)
 {
@@ -3152,6 +3201,10 @@ static bool vtd_process_inv_desc(IntelIOMMUState *s)
 break;
 
 case VTD_INV_DESC_PIOTLB:
+trace_vtd_inv_desc("p-iotlb", inv_desc.val[1], inv_desc.val[0]);
+if (!vtd_process_piotlb_desc(s, &inv_desc)) {
+return false;
+}
 break;
 
 case VTD_INV_DESC_WAIT:
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 9805b84..118d568 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -476,6 +476,19 @@ typedef union VTDInvDesc VTDInvDesc;
 #define VTD_INV_DESC_PASIDC_PASID_SI   (1ULL << 4)
 #define VTD_INV_DESC_PASIDC_GLOBAL (3ULL << 4)
 
+#define VTD_INV_DESC_PIOTLB_ALL_IN_PASID  (2ULL << 4)
+#define VTD_INV_DESC_PIOTLB_PSI_IN_PASID  (3ULL << 4)
+
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL0 0xfff0ffc0ULL
+#define VTD_INV_DESC_PIOTLB_RSVD_VAL1 0xf80ULL
+
+#define VTD_INV_DESC_PIOTLB_PASID(val)(((val) >> 32) & 0xfULL)
+#define VTD_INV_DESC_PIOTLB_DID(val)  (((val) >> 16) & \
+ VTD_DOMAIN_ID_MASK)
+#define VTD_INV_DESC_PIOTLB_ADDR(val) ((val) & ~0xfffULL)
+#define VTD_INV_DESC_PIOTLB_AM(val)   ((val) & 0x3fULL)
+#define VTD_INV_DESC_PIOTLB_IH(val)   (((val) >> 6) & 0x1)
+
 /* Information about page-selective IOTLB invalidate */
 struct VTDIOTLBPageInvInfo {
 uint16_t domain_id;
-- 
2.7.4




[RFC v8 23/25] intel_iommu: propagate PASID-based iotlb invalidation to host

2020-07-12 Thread Liu Yi L
This patch propagates PASID-based IOTLB invalidation to the host.

Intel VT-d 3.0 supports nested translation at PASID granularity.
Guest SVA support can be implemented by configuring nested
translation for a specific PASID. This is also known as dual-stage
DMA translation.

Under such a configuration, the guest owns the GVA->GPA translation,
which is configured as the first-level page table on the host side
for a specific PASID, and the host owns the GPA->HPA translation. As
the guest owns the first-level translation table, piotlb invalidation
should be propagated to the host, since the host IOMMU will cache
first-level page table related mappings during DMA address
translation.

This patch traps the guest PASID-based IOTLB flush and propagates it
to the host.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
rfcv4 (v1) -> rfcv5 (v2):
*) removed the valid check to vtd_pasid_as instance as rfcv5 ensures
   all vtd_pasid_as instances in hash table should be valid.
---
 hw/i386/intel_iommu.c  | 113 +
 hw/i386/intel_iommu_internal.h |   7 +++
 2 files changed, 120 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index e6364ee..d3c41a6 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -3038,16 +3038,129 @@ static bool vtd_process_pasid_desc(IntelIOMMUState *s,
 return true;
 }
 
+/**
+ * Caller of this function should hold iommu_lock.
+ */
+static void vtd_invalidate_piotlb(IntelIOMMUState *s,
+  VTDBus *vtd_bus,
+  int devfn,
+  struct iommu_cache_invalidate_info *cache)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+goto out;
+}
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+goto out;
+}
+if (host_iommu_ctx_flush_stage1_cache(iommu_ctx, cache)) {
+error_report("Cache flush failed");
+}
+out:
+return;
+}
+
+/**
+ * This function iterates over the s->vtd_pasid_as list, using
+ * VTDPIOTLBInvInfo as a filter, and propagates the piotlb
+ * invalidation to the host. Caller of this function
+ * should hold iommu_lock.
+ */
+static void vtd_flush_pasid_iotlb(gpointer key, gpointer value,
+  gpointer user_data)
+{
+VTDPIOTLBInvInfo *piotlb_info = user_data;
+VTDPASIDAddressSpace *vtd_pasid_as = value;
+VTDPASIDCacheEntry *pc_entry = &vtd_pasid_as->pasid_cache_entry;
+uint16_t did;
+
+did = vtd_pe_get_domain_id(&pc_entry->pasid_entry);
+
+if ((piotlb_info->domain_id == did) &&
+(piotlb_info->pasid == vtd_pasid_as->pasid)) {
+vtd_invalidate_piotlb(vtd_pasid_as->iommu_state,
+  vtd_pasid_as->vtd_bus,
+  vtd_pasid_as->devfn,
+  piotlb_info->cache_info);
+}
+
+/*
+ * TODO: needs to add QEMU piotlb flush when QEMU piotlb
+ * infrastructure is ready. For now, it is enough for passthru
+ * devices.
+ */
+}
+
 static void vtd_piotlb_pasid_invalidate(IntelIOMMUState *s,
 uint16_t domain_id,
 uint32_t pasid)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = IOMMU_CACHE_INV_TYPE_IOTLB;
+cache_info->granularity = IOMMU_INV_GRANU_PASID;
+cache_info->granu.pasid_info.pasid = pasid;
+cache_info->granu.pasid_info.flags = IOMMU_INV_PASID_FLAGS_PASID;
+
+piotlb_info.domain_id = domain_id;
+piotlb_info.pasid = pasid;
+piotlb_info.cache_info = cache_info;
+
+vtd_iommu_lock(s);
+/*
+ * Loop over all the vtd_pasid_as instances in s->vtd_pasid_as
+ * to find the affected devices, since a piotlb invalidation
+ * should check the pasid cache from an architectural point of view.
+ */
+g_hash_table_foreach(s->vtd_pasid_as,
+ vtd_flush_pasid_iotlb, &piotlb_info);
+vtd_iommu_unlock(s);
+g_free(cache_info);
 }
 
 static void vtd_piotlb_page_invalidate(IntelIOMMUState *s, uint16_t domain_id,
uint32_t pasid, hwaddr addr, uint8_t am,
bool ih)
 {
+VTDPIOTLBInvInfo piotlb_info;
+struct iommu_cache_invalidate_info *cache_info;
+
+cache_info = g_malloc0(sizeof(*cache_info));
+
+cache_info->argsz = sizeof(*cache_info);
+cache_info->version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1;
+cache_info->cache = 

[RFC v8 20/25] intel_iommu: do not pass down pasid bind for PASID #0

2020-07-12 Thread Liu Yi L
The RID_PASID field was introduced in the VT-d 3.0 spec; it is used
for DMA requests without PASID in scalable-mode VT-d, which are also
known as IOVA requests. And in the VT-d 3.1 spec, there is a definition
of it:

"Implementations not supporting RID_PASID capability
(ECAP_REG.RPS is 0b), use a PASID value of 0 to perform
address translation for requests without PASID."

This patch adds a check against the PASIDs which are going to be
bound to a device. For PASID #0, it is not necessary to pass down a
pasid bind request, since PASID #0 is used as RID_PASID for DMA
requests without PASID. A further reason is that the current Intel
vIOMMU supports gIOVA by shadowing the guest second-level page table.
However, in the future, if the guest IOMMU driver uses the first-level
page table to store IOVA mappings, then guest IOVA support will also
be done via nested translation. When gIOVA is over FLPT, the vIOMMU
should pass down the pasid bind request for PASID #0 to the host, and
the host needs to bind the guest IOVA page table to a proper PASID,
e.g. the PASID value in the RID_PASID field for a PF/VF if
ECAP_REG.RPS is clear, or the default PASID for an ADI (Assignable
Device Interface in the Scalable IOV solution).

IOVA over FLPT support on Intel VT-d:
https://lkml.org/lkml/2019/9/23/297

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index de2ba0e..47af7b1 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -1893,6 +1893,16 @@ static int vtd_bind_guest_pasid(IntelIOMMUState *s, 
VTDBus *vtd_bus,
 HostIOMMUContext *iommu_ctx;
 int ret = -1;
 
+if (pasid < VTD_HPASID_MIN) {
+/*
+ * If pasid < VTD_HPASID_MIN, this pasid was not allocated
+ * from the host, so there is no need to pass changes on it
+ * down to the host. TODO: when IOVA over FLPT is ready,
+ * this check should be refined.
+ */
+return 0;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 if (!vtd_dev_icx) {
 /* means no need to go further, e.g. for emulated devices */
-- 
2.7.4
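
A hedged aside on the spec text quoted in this commit message: the
effective PASID used for a request without PASID can be sketched as
below. VTD_CE_GET_RID2PASID() is the macro this series already defines
in intel_iommu.c; ecap_rps() is a hypothetical helper for the
ECAP_REG.RPS bit, named here only for illustration.

    /* Illustration only; ecap_rps() is an assumed helper. */
    static inline uint32_t vtd_effective_rid_pasid(IntelIOMMUState *s,
                                                   VTDContextEntry *ce)
    {
        /* RPS clear: hardware uses PASID 0 for requests w/o PASID. */
        return ecap_rps(s->ecap) ? VTD_CE_GET_RID2PASID(ce) : 0;
    }

This is why the patch can skip the bind for PASID #0 today: with
RID_PASID support absent, PASID #0 already stands in for plain IOVA
requests, which are handled by the shadowing path.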




[RFC v8 21/25] vfio: add support for flush iommu stage-1 cache

2020-07-12 Thread Liu Yi L
This patch adds a flush_stage1_cache() definition to HostIOMMUContextClass
and a corresponding implementation in VFIO. This exposes a way for the
vIOMMU to flush the stage-1 cache on the host side, since the guest owns
the stage-1 translation structures in the dual-stage DMA translation
configuration.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Eric Auger 
Cc: Yi Sun 
Cc: David Gibson 
Cc: Alex Williamson 
Acked-by: Peter Xu 
Signed-off-by: Liu Yi L 
---
 hw/iommu/host_iommu_context.c | 19 +++
 hw/vfio/common.c  | 24 
 include/hw/iommu/host_iommu_context.h |  8 
 3 files changed, 51 insertions(+)

diff --git a/hw/iommu/host_iommu_context.c b/hw/iommu/host_iommu_context.c
index 0e7e790..7c8be15 100644
--- a/hw/iommu/host_iommu_context.c
+++ b/hw/iommu/host_iommu_context.c
@@ -113,6 +113,25 @@ int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
 return hicxc->unbind_stage1_pgtbl(iommu_ctx, unbind);
 }
 
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+ struct iommu_cache_invalidate_info *cache)
+{
+HostIOMMUContextClass *hicxc;
+
+hicxc = HOST_IOMMU_CONTEXT_GET_CLASS(iommu_ctx);
+
+if (!hicxc) {
+return -EINVAL;
+}
+
+if (!(iommu_ctx->flags & HOST_IOMMU_NESTING) ||
+!hicxc->flush_stage1_cache) {
+return -EINVAL;
+}
+
+return hicxc->flush_stage1_cache(iommu_ctx, cache);
+}
+
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
  uint64_t flags,
diff --git a/hw/vfio/common.c b/hw/vfio/common.c
index 8bfc9ce..bfe9917 100644
--- a/hw/vfio/common.c
+++ b/hw/vfio/common.c
@@ -1276,6 +1276,29 @@ static int vfio_host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
 return ret;
 }
 
+static int vfio_host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+struct iommu_cache_invalidate_info *cache)
+{
+VFIOContainer *container = container_of(iommu_ctx,
+VFIOContainer, iommu_ctx);
+struct vfio_iommu_type1_nesting_op *op;
+unsigned long argsz;
+int ret = 0;
+
+argsz = sizeof(*op) + sizeof(*cache);
+op = g_malloc0(argsz);
+op->argsz = argsz;
+op->flags = VFIO_IOMMU_NESTING_OP_CACHE_INVLD;
+memcpy(&op->data, cache, sizeof(*cache));
+
+if (ioctl(container->fd, VFIO_IOMMU_NESTING_OP, op)) {
+ret = -errno;
+error_report("%s: iommu cache flush failed: %m", __func__);
+}
+g_free(op);
+return ret;
+}
+
 /**
  * Get iommu info from host. Caller of this function should free
  * the memory pointed to by the returned pointer stored in @info
@@ -2018,6 +2041,7 @@ static void vfio_host_iommu_context_class_init(ObjectClass *klass,
 hicxc->pasid_free = vfio_host_iommu_ctx_pasid_free;
 hicxc->bind_stage1_pgtbl = vfio_host_iommu_ctx_bind_stage1_pgtbl;
 hicxc->unbind_stage1_pgtbl = vfio_host_iommu_ctx_unbind_stage1_pgtbl;
+hicxc->flush_stage1_cache = vfio_host_iommu_ctx_flush_stage1_cache;
 }
 
 static const TypeInfo vfio_host_iommu_context_info = {
diff --git a/include/hw/iommu/host_iommu_context.h b/include/hw/iommu/host_iommu_context.h
index 2883ed8..40e860a 100644
--- a/include/hw/iommu/host_iommu_context.h
+++ b/include/hw/iommu/host_iommu_context.h
@@ -64,6 +64,12 @@ typedef struct HostIOMMUContextClass {
 /* Undo a previous bind. @unbind specifies the unbind info. */
 int (*unbind_stage1_pgtbl)(HostIOMMUContext *iommu_ctx,
struct iommu_gpasid_bind_data *unbind);
+/*
+ * Propagate a stage-1 cache flush to the host IOMMU; the cache
+ * info is specified in @cache.
+ */
+int (*flush_stage1_cache)(HostIOMMUContext *iommu_ctx,
+  struct iommu_cache_invalidate_info *cache);
 } HostIOMMUContextClass;
 
 /*
@@ -85,6 +91,8 @@ int host_iommu_ctx_bind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
  struct iommu_gpasid_bind_data *bind);
 int host_iommu_ctx_unbind_stage1_pgtbl(HostIOMMUContext *iommu_ctx,
  struct iommu_gpasid_bind_data *unbind);
+int host_iommu_ctx_flush_stage1_cache(HostIOMMUContext *iommu_ctx,
+   struct iommu_cache_invalidate_info *cache);
 
 void host_iommu_ctx_init(void *_iommu_ctx, size_t instance_size,
  const char *mrtypename,
-- 
2.7.4
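
A hedged usage sketch of the new hook (not part of the patch): a
vIOMMU caller flushing all stage-1 IOTLB entries for one domain.
The iommu_inv_pasid_info field names and IOMMU_INV_PASID_FLAGS_ARCHID
come from the Linux IOMMU uapi this series is based on; treating
"archid" as the VT-d domain id is this sketch's assumption.

    static int flush_domain_stage1(HostIOMMUContext *iommu_ctx,
                                   uint16_t did)
    {
        struct iommu_cache_invalidate_info cache = {
            .argsz = sizeof(cache),
            .version = IOMMU_CACHE_INVALIDATE_INFO_VERSION_1,
            .cache = IOMMU_CACHE_INV_TYPE_IOTLB,
            .granularity = IOMMU_INV_GRANU_DOMAIN,
        };

        /* Domain-selective: identify the domain via archid. */
        cache.granu.pasid_info.archid = did;
        cache.granu.pasid_info.flags = IOMMU_INV_PASID_FLAGS_ARCHID;

        return host_iommu_ctx_flush_stage1_cache(iommu_ctx, &cache);
    }

Note that the VFIO implementation copies the fixed-size struct into a
variable-length vfio_iommu_type1_nesting_op payload, so callers never
deal with the ioctl framing directly.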




[RFC v8 18/25] intel_iommu: bind/unbind guest page table to host

2020-07-12 Thread Liu Yi L
This patch captures guest PASID table entry modifications and
propagates the changes to the host to set up dual-stage DMA translation.
The guest page table is configured as the 1st-level page table (GVA->GPA),
whose translation result further goes through the host VT-d 2nd-level
page table (GPA->HPA) under nested translation mode. This is the
key part of vSVA support, and also key to supporting IOVA over the
1st-level page table for Intel VT-d in a virtualization environment.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 101 +++--
 hw/i386/intel_iommu_internal.h |  18 
 2 files changed, 114 insertions(+), 5 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index c3e8b20..1b7272c 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -41,6 +41,7 @@
 #include "migration/vmstate.h"
 #include "trace.h"
 #include "qemu/jhash.h"
+#include 
 
 /* context entry operations */
 #define VTD_CE_GET_RID2PASID(ce) \
@@ -700,6 +701,16 @@ static inline uint32_t vtd_sm_ce_get_pdt_entry_num(VTDContextEntry *ce)
 return 1U << (VTD_SM_CONTEXT_ENTRY_PDTS(ce->val[0]) + 7);
 }
 
+static inline uint32_t vtd_pe_get_fl_aw(VTDPASIDEntry *pe)
+{
+return 48 + ((pe->val[2] >> 2) & VTD_SM_PASID_ENTRY_FLPM) * 9;
+}
+
+static inline dma_addr_t vtd_pe_get_flpt_base(VTDPASIDEntry *pe)
+{
+return pe->val[2] & VTD_SM_PASID_ENTRY_FLPTPTR;
+}
+
 static inline bool vtd_pdire_present(VTDPASIDDirEntry *pdire)
 {
 return pdire->val & 1;
@@ -1861,6 +1872,85 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
 vtd_iommu_replay_all(s);
 }
 
+/**
+ * Caller should hold iommu_lock.
+ */
+static int vtd_bind_guest_pasid(IntelIOMMUState *s, VTDBus *vtd_bus,
+int devfn, int pasid, VTDPASIDEntry *pe,
+VTDPASIDOp op)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+HostIOMMUContext *iommu_ctx;
+int ret = -1;
+
+vtd_dev_icx = vtd_bus->dev_icx[devfn];
+if (!vtd_dev_icx) {
+/* means no need to go further, e.g. for emulated devices */
+return 0;
+}
+
+iommu_ctx = vtd_dev_icx->iommu_ctx;
+if (!iommu_ctx) {
+return -EINVAL;
+}
+
+switch (op) {
+case VTD_PASID_BIND:
+{
+struct iommu_gpasid_bind_data *g_bind_data;
+
+g_bind_data = g_malloc0(sizeof(*g_bind_data));
+
+g_bind_data->argsz = sizeof(*g_bind_data);
+g_bind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_bind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_bind_data->gpgd = vtd_pe_get_flpt_base(pe);
+g_bind_data->addr_width = vtd_pe_get_fl_aw(pe);
+g_bind_data->hpasid = pasid;
+g_bind_data->gpasid = pasid;
+g_bind_data->flags |= IOMMU_SVA_GPASID_VAL;
+g_bind_data->vendor.vtd.flags =
+ (VTD_SM_PASID_ENTRY_SRE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_SRE : 0)
+   | (VTD_SM_PASID_ENTRY_EAFE_BIT(pe->val[2]) ?
+IOMMU_SVA_VTD_GPASID_EAFE : 0)
+   | (VTD_SM_PASID_ENTRY_PCD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PCD : 0)
+   | (VTD_SM_PASID_ENTRY_PWT_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_PWT : 0)
+   | (VTD_SM_PASID_ENTRY_EMTE_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_EMTE : 0)
+   | (VTD_SM_PASID_ENTRY_CD_BIT(pe->val[1]) ?
+IOMMU_SVA_VTD_GPASID_CD : 0);
+g_bind_data->vendor.vtd.pat = VTD_SM_PASID_ENTRY_PAT(pe->val[1]);
+g_bind_data->vendor.vtd.emt = VTD_SM_PASID_ENTRY_EMT(pe->val[1]);
+ret = host_iommu_ctx_bind_stage1_pgtbl(iommu_ctx, g_bind_data);
+g_free(g_bind_data);
+break;
+}
+case VTD_PASID_UNBIND:
+{
+struct iommu_gpasid_bind_data *g_unbind_data;
+
+g_unbind_data = g_malloc0(sizeof(*g_unbind_data));
+
+g_unbind_data->argsz = sizeof(*g_unbind_data);
+g_unbind_data->version = IOMMU_GPASID_BIND_VERSION_1;
+g_unbind_data->format = IOMMU_PASID_FORMAT_INTEL_VTD;
+g_unbind_data->hpasid = pasid;
+ret = host_iommu_ctx_unbind_stage1_pgtbl(iommu_ctx, g_unbind_data);
+g_free(g_unbind_data);
+break;
+}
+default:
+error_report_once("Unknown VTDPASIDOp!!!\n");
+break;
+}
+
+return ret;
+}
+
 /* Do a context-cache device-selective invalidation.
  * @func_mask: FM field aft
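
(The archived diff of this patch is truncated here.) One detail worth
unpacking from the hunk above: vtd_pe_get_fl_aw() derives the guest
address width from the FLPM (first-level paging mode) field of the
scalable-mode PASID table entry, and the arithmetic works out as:

    FLPM = 0  ->  4-level first-level table  ->  48 + 0 * 9 = 48-bit GVA
    FLPM = 1  ->  5-level first-level table  ->  48 + 1 * 9 = 57-bit GVA

That width, together with the FLPT base from vtd_pe_get_flpt_base(),
is what vtd_bind_guest_pasid() hands down as g_bind_data->addr_width
so the host IOMMU can validate the nested configuration.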

[RFC v8 19/25] intel_iommu: replay pasid binds after context cache invalidation

2020-07-12 Thread Liu Yi L
This patch replays guest pasid bindings after a context cache
invalidation. This is done to ensure safety: strictly speaking, the
programmer should issue a pasid cache invalidation with the proper
granularity after issuing a context cache invalidation.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 50 ++
 hw/i386/intel_iommu_internal.h |  1 +
 hw/i386/trace-events   |  1 +
 3 files changed, 52 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 1b7272c..de2ba0e 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -68,6 +68,10 @@ static void vtd_address_space_refresh_all(IntelIOMMUState *s);
 static void vtd_address_space_unmap(VTDAddressSpace *as, IOMMUNotifier *n);
 
 static void vtd_pasid_cache_reset(IntelIOMMUState *s);
+static void vtd_pasid_cache_sync(IntelIOMMUState *s,
+ VTDPASIDCacheInfo *pc_info);
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  VTDBus *vtd_bus, uint16_t devfn);
 
 static void vtd_panic_require_caching_mode(void)
 {
@@ -1853,7 +1857,10 @@ static void vtd_iommu_replay_all(IntelIOMMUState *s)
 
 static void vtd_context_global_invalidate(IntelIOMMUState *s)
 {
+VTDPASIDCacheInfo pc_info;
+
 trace_vtd_inv_desc_cc_global();
+
 /* Protects context cache */
 vtd_iommu_lock(s);
 s->context_cache_gen++;
@@ -1870,6 +1877,9 @@ static void vtd_context_global_invalidate(IntelIOMMUState *s)
  * VT-d emulation codes.
  */
 vtd_iommu_replay_all(s);
+
+pc_info.type = VTD_PASID_CACHE_GLOBAL_INV;
+vtd_pasid_cache_sync(s, &pc_info);
 }
 
 /**
@@ -2008,6 +2018,21 @@ static void vtd_context_device_invalidate(IntelIOMMUState *s,
  * happened.
  */
 vtd_sync_shadow_page_table(vtd_as);
+/*
+ * Per spec, a context flush should also be followed by a PASID
+ * cache and iotlb flush. Regarding a device-selective context
+ * cache invalidation:
+ * if (emulated_device)
+ *    invalidate pasid cache and pasid-based iotlb
+ * else if (assigned_device)
+ *    check if the device has been bound to any pasid
+ *    invoke pasid_unbind for each bound pasid
+ * Here, we have vtd_pasid_cache_devsi() to invalidate the pasid
+ * caches; for the piotlb, QEMU does not have the infrastructure
+ * yet, so there is no handling. For an assigned device, the host
+ * iommu driver flushes the piotlb when a pasid unbind is passed
+ * down to it.
+ */
+vtd_pasid_cache_devsi(s, vtd_bus, devfn_it);
 }
 }
 }
@@ -2622,6 +2647,12 @@ static gboolean vtd_flush_pasid(gpointer key, gpointer value,
 /* Fall through */
 case VTD_PASID_CACHE_GLOBAL_INV:
 break;
+case VTD_PASID_CACHE_DEVSI:
+if (pc_info->vtd_bus != vtd_bus ||
+pc_info->devfn != devfn) {
+return false;
+}
+break;
 default:
 error_report("invalid pc_info->type");
 abort();
@@ -2821,6 +2852,11 @@ static void vtd_replay_guest_pasid_bindings(IntelIOMMUState *s,
 case VTD_PASID_CACHE_GLOBAL_INV:
 /* loop all assigned devices */
 break;
+case VTD_PASID_CACHE_DEVSI:
+walk_info.vtd_bus = pc_info->vtd_bus;
+walk_info.devfn = pc_info->devfn;
+vtd_replay_pasid_bind_for_dev(s, start, end, _info);
+return;
 case VTD_PASID_CACHE_FORCE_RESET:
 /* For force reset, no need to go further replay */
 return;
@@ -2906,6 +2942,20 @@ static void vtd_pasid_cache_sync(IntelIOMMUState *s,
 vtd_iommu_unlock(s);
 }
 
+static void vtd_pasid_cache_devsi(IntelIOMMUState *s,
+  VTDBus *vtd_bus, uint16_t devfn)
+{
+VTDPASIDCacheInfo pc_info;
+
+trace_vtd_pasid_cache_devsi(devfn);
+
+pc_info.type = VTD_PASID_CACHE_DEVSI;
+pc_info.vtd_bus = vtd_bus;
+pc_info.devfn = devfn;
+
+vtd_pasid_cache_sync(s, &pc_info);
+}
+
 /**
  * Caller of this function should hold iommu_lock
  */
diff --git a/hw/i386/intel_iommu_internal.h b/hw/i386/intel_iommu_internal.h
index 51691d0..9805b84 100644
--- a/hw/i386/intel_iommu_internal.h
+++ b/hw/i386/intel_iommu_internal.h
@@ -548,6 +548,7 @@ typedef enum VTDPCInvType {
 VTD_PASID_CACHE_FORCE_RESET = 0,
 /* pasid cache invalidation rely on guest PASID entry */
 VTD_PASID_CACHE_GLOBAL_INV,
+VTD_PASID_CACHE_DEVSI,
 VTD_PASID_CACHE_DOMSI,
 VTD_PASID_CACHE_PASIDSI,
 } VTDPCInvType;
diff --git a/hw/i386/trace-events b/hw/i386/trace-events
index 60d20c1..3853fa8 100644
--- a/hw/i386/trace-events
+++ b/hw/i386
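
The trace-events hunk is truncated in the archive. Given the
trace_vtd_pasid_cache_devsi(devfn) call added above, the missing
addition was presumably a single line of roughly this form (the exact
format string is a guess, not the original):

    vtd_pasid_cache_devsi(uint16_t devfn) "PASID cache devsi devfn 0x%"PRIx16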

[RFC v8 17/25] intel_iommu: sync IOMMU nesting cap info for assigned devices

2020-07-12 Thread Liu Yi L
For assigned devices, an Intel vIOMMU that wants to build DMA protection
based on physical IOMMU nesting paging should check the IOMMU nesting
support on the host side. The host returns IOMMU nesting cap info to
user-space (e.g. VFIO returns IOMMU nesting cap info for nesting-type
IOMMUs). The vIOMMU needs to check:
a) IOMMU model
b) 1st-level page table support
c) address width
d) pasid support

This patch syncs the IOMMU nesting cap info when a PCIe device (VFIO case)
sets a HostIOMMUContext on the vIOMMU. If the host IOMMU nesting support
is not compatible, the vIOMMU returns failure to the PCIe device.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Signed-off-by: Liu Yi L 
---
 hw/i386/intel_iommu.c  | 107 +
 hw/i386/intel_iommu_internal.h |  18 +++
 include/hw/i386/intel_iommu.h  |   4 ++
 3 files changed, 129 insertions(+)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index c93c360..c3e8b20 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -4104,6 +4104,84 @@ static int vtd_dev_get_iommu_attr(PCIBus *bus, void *opaque, int32_t devfn,
 return ret;
 }
 
+
+static bool vtd_check_nesting_info(IntelIOMMUState *s,
+   struct iommu_nesting_info *info,
+   struct iommu_nesting_info_vtd *vtd)
+{
+return !((s->aw_bits != info->addr_width) ||
+ ((s->host_cap & VTD_CAP_MASK) !=
+  (vtd->cap_reg & VTD_CAP_MASK)) ||
+ ((s->host_ecap & VTD_ECAP_MASK) !=
+  (vtd->ecap_reg & VTD_ECAP_MASK)) ||
+ (VTD_GET_PSS(s->host_ecap) != (info->pasid_bits - 1)));
+}
+
+/* Caller should hold iommu lock. */
+static bool vtd_sync_nesting_info(IntelIOMMUState *s,
+  struct iommu_nesting_info *info)
+{
+struct iommu_nesting_info_vtd *vtd;
+uint64_t cap, ecap;
+
+vtd = (struct iommu_nesting_info_vtd *) &info->data;
+
+if (s->cap_finalized) {
+return vtd_check_nesting_info(s, info, vtd);
+}
+
+if (s->aw_bits > info->addr_width) {
+error_report("User aw-bits: %u > host address width: %u",
+  s->aw_bits, info->addr_width);
+return false;
+}
+
+cap = s->host_cap & vtd->cap_reg & VTD_CAP_MASK;
+s->host_cap &= ~VTD_CAP_MASK;
+s->host_cap |= cap;
+
+ecap = s->host_ecap & vtd->ecap_reg & VTD_ECAP_MASK;
+s->host_ecap &= ~VTD_ECAP_MASK;
+s->host_ecap |= ecap;
+
+if ((VTD_ECAP_PASID & s->host_ecap) && info->pasid_bits &&
+(VTD_GET_PSS(s->host_ecap) > (info->pasid_bits - 1))) {
+s->host_ecap &= ~VTD_ECAP_PSS_MASK;
+s->host_ecap |= VTD_ECAP_PSS(info->pasid_bits - 1);
+}
+return true;
+}
+
+/*
+ * A virtual VT-d that wants nesting needs to check the host IOMMU
+ * nesting cap info behind the assigned devices, so that the vIOMMU
+ * can bind guest page tables to the host.
+ */
+static bool vtd_check_iommu_ctx(IntelIOMMUState *s,
+HostIOMMUContext *iommu_ctx)
+{
+struct iommu_nesting_info *info = iommu_ctx->info;
+uint32_t minsz, size;
+
+if (IOMMU_PASID_FORMAT_INTEL_VTD != info->format) {
+error_report("Format is not compatible for nesting!!!");
+return false;
+}
+
+size = sizeof(struct iommu_nesting_info_vtd);
+minsz = endof(struct iommu_nesting_info, flags);
+if (size > (info->size - minsz)) {
+/*
+ * QEMU may be using a newer linux-headers/iommu.h than the
+ * kernel supports, hence fail it.
+ */
+error_report("IOMMU nesting cap is not compatible!!!");
+return false;
+}
+
+return vtd_sync_nesting_info(s, info);
+}
+
 static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
  int devfn,
  HostIOMMUContext *iommu_ctx)
@@ -4118,6 +4196,11 @@ static int vtd_dev_set_iommu_context(PCIBus *bus, void *opaque,
 
 vtd_iommu_lock(s);
 
+if (!vtd_check_iommu_ctx(s, iommu_ctx)) {
+vtd_iommu_unlock(s);
+return -ENOENT;
+}
+
 vtd_dev_icx = vtd_bus->dev_icx[devfn];
 
 assert(!vtd_dev_icx);
@@ -4373,6 +4456,14 @@ static void vtd_init(IntelIOMMUState *s)
 s->ecap |= VTD_ECAP_SMTS | VTD_ECAP_SRS | VTD_ECAP_SLTS;
 }
 
+if (!s->cap_finalized) {
+s->host_cap = s->cap;
+s->host_ecap = s->ecap;
+} else {
+s->cap = s->host_cap;
+s->ecap = s->host_ecap;
+}
+
 vtd_reset_caches(s);
 
 /* Define registers with default values and bit semantics */
@@ -4506,6 +4597,12 @@ static bool vtd_decide_config(IntelIOMMUState *s, Error **errp)
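
(The diff is truncated here.) A short worked example of the PSS
handling in vtd_sync_nesting_info() above, since the off-by-one is
easy to misread: ECAP_REG.PSS encodes "supported PASID bits minus 1".
If the host reports info->pasid_bits = 20, the widest compatible
vIOMMU setting is PSS = 19 (i.e. 20-bit PASIDs); a vIOMMU ecap
advertising PSS = 20 (21-bit PASIDs) would exceed the host and is
clamped down to VTD_ECAP_PSS(19). The same relation is what
vtd_check_nesting_info() re-checks once the caps are finalized:
VTD_GET_PSS(s->host_ecap) must equal info->pasid_bits - 1.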
 

[RFC v8 13/25] intel_iommu: add virtual command capability support

2020-07-12 Thread Liu Yi L
This patch adds virtual command support to the Intel vIOMMU per the
Intel VT-d 3.1 spec, and adds two virtual commands: allocate pasid
and free pasid.

Cc: Kevin Tian 
Cc: Jacob Pan 
Cc: Peter Xu 
Cc: Yi Sun 
Cc: Paolo Bonzini 
Cc: Richard Henderson 
Cc: Eduardo Habkost 
Reviewed-by: Peter Xu 
Signed-off-by: Liu Yi L 
Signed-off-by: Yi Sun 
---
 hw/i386/intel_iommu.c  | 154 -
 hw/i386/intel_iommu_internal.h |  37 ++
 hw/i386/trace-events   |   1 +
 include/hw/i386/intel_iommu.h  |  10 ++-
 4 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/hw/i386/intel_iommu.c b/hw/i386/intel_iommu.c
index 8f7c957..46036d4 100644
--- a/hw/i386/intel_iommu.c
+++ b/hw/i386/intel_iommu.c
@@ -2656,6 +2656,129 @@ static void vtd_handle_iectl_write(IntelIOMMUState *s)
 }
 }
 
+static int vtd_request_pasid_alloc(IntelIOMMUState *s, uint32_t *pasid)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+int ret = -1;
+
+vtd_iommu_lock(s);
+QLIST_FOREACH(vtd_dev_icx, &s->vtd_dev_icx_list, next) {
+HostIOMMUContext *iommu_ctx = vtd_dev_icx->iommu_ctx;
+
+/*
+ * We'll return the first valid result we got. It's
+ * a bit hackish in that we don't have a good global
+ * interface yet to talk to modules like vfio to deliver
+ * this allocation request, so we're leveraging this
+ * per-device iommu context to do the same thing just
+ * to make sure the allocation happens only once.
+ */
+ret = host_iommu_ctx_pasid_alloc(iommu_ctx, VTD_HPASID_MIN,
+ VTD_HPASID_MAX, pasid);
+if (!ret) {
+break;
+}
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+static int vtd_request_pasid_free(IntelIOMMUState *s, uint32_t pasid)
+{
+VTDHostIOMMUContext *vtd_dev_icx;
+int ret = -1;
+
+vtd_iommu_lock(s);
+QLIST_FOREACH(vtd_dev_icx, &s->vtd_dev_icx_list, next) {
+HostIOMMUContext *iommu_ctx = vtd_dev_icx->iommu_ctx;
+
+/*
+ * Similar to pasid allocation. We'll free the pasid
+ * on the first successful free operation. It's a bit
+ * hackish in that we don't have a good global interface
+ * yet to talk to modules like vfio to deliver this pasid
+ * free request, so we're leveraging this per-device iommu
+ * context to do the same thing just to make sure the free
+ * happens only once.
+ */
+ret = host_iommu_ctx_pasid_free(iommu_ctx, pasid);
+if (!ret) {
+break;
+}
+}
+vtd_iommu_unlock(s);
+
+return ret;
+}
+
+/*
+ * If IP is not set, set it then return.
+ * If IP is already set, return.
+ */
+static void vtd_vcmd_set_ip(IntelIOMMUState *s)
+{
+s->vcrsp = 1;
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+static void vtd_vcmd_clear_ip(IntelIOMMUState *s)
+{
+s->vcrsp &= (~((uint64_t)(0x1)));
+vtd_set_quad_raw(s, DMAR_VCRSP_REG,
+ ((uint64_t) s->vcrsp));
+}
+
+/* Handle write to Virtual Command Register */
+static int vtd_handle_vcmd_write(IntelIOMMUState *s, uint64_t val)
+{
+uint32_t pasid;
+int ret = -1;
+
+trace_vtd_reg_write_vcmd(s->vcrsp, val);
+
+if (!(s->vccap & VTD_VCCAP_PAS) ||
+ (s->vcrsp & 1)) {
+return -1;
+}
+
+/*
+ * The vCPU should be blocked while this guest VCMD write
+ * is trapped here, so no other vCPU should be accessing
+ * VCMD if the guest software is well written. However, we
+ * still emulate the IP bit here in case of bad guest
+ * software, and also to align with the spec.
+ */
+vtd_vcmd_set_ip(s);
+
+switch (val & VTD_VCMD_CMD_MASK) {
+case VTD_VCMD_ALLOC_PASID:
+ret = vtd_request_pasid_alloc(s, &pasid);
+if (ret) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_NO_AVAILABLE_PASID);
+} else {
+s->vcrsp |= VTD_VCRSP_RSLT(pasid);
+}
+break;
+
+case VTD_VCMD_FREE_PASID:
+pasid = VTD_VCMD_PASID_VALUE(val);
+ret = vtd_request_pasid_free(s, pasid);
+if (ret < 0) {
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_FREE_INVALID_PASID);
+}
+break;
+
+default:
+s->vcrsp |= VTD_VCRSP_SC(VTD_VCMD_UNDEFINED_CMD);
+error_report_once("Virtual Command: unsupported command!!!");
+break;
+}
+vtd_vcmd_clear_ip(s);
+return 0;
+}
+
 static uint64_t vtd_mem_read(void *opaque, hwaddr addr, unsigned size)
 {
 IntelIOMMUState *s = opaque;
@@ -2944,6 +3067,23 @@ static void vtd_mem_write(void *opaque, hwaddr addr,
 vtd_set_long(s, addr, val);
 break;
 
+case DMAR_VCMD_REG:
+if (!vtd_handle_vcmd_write(s, val)) {
+if (size == 4) {
+vtd_set_long(s, addr, val);
+} else {
+  
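
(The archive cuts off the remainder of vtd_mem_write().) For context,
a hedged sketch of the guest-side flow this interface implies, per the
VT-d 3.1 virtual command description: write the command to VCMD_REG,
poll the IP bit in VCRSP_REG, then read the status and result. The
mmio accessors and the vcrsp_status()/vcrsp_result() extractors below
are illustrative assumptions, not QEMU or guest kernel code.

    /* Illustrative guest flow; helpers are assumed, not real APIs. */
    static uint32_t guest_vcmd_alloc_pasid(void)
    {
        uint64_t resp;

        /* Issue the allocate-PASID virtual command. */
        vcmd_write64(DMAR_VCMD_REG, VTD_VCMD_ALLOC_PASID);

        /* Wait for the emulator to clear the In-Progress bit. */
        do {
            resp = vcmd_read64(DMAR_VCRSP_REG);
        } while (resp & 1);

        if (vcrsp_status(resp) != 0) {
            return 0;    /* failure; PASID 0 is reserved anyway */
        }
        return vcrsp_result(resp);
    }

On the QEMU side this maps onto vtd_handle_vcmd_write() above: the IP
bit is set on entry, the PASID is allocated through the per-device
host IOMMU context, and the result or status code is latched into
VCRSP before IP is cleared.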
