Re: [PATCH v4 0/2] KVM: x86/mmu: Skip mmu_notifier changes when possible

2021-02-22 Thread Paolo Bonzini

On 22/02/21 03:45, David Stevens wrote:

These patches reduce how often mmu_notifier updates block guest page
faults. The primary benefit of this is the reduction in the likelihood
of extreme latency when handling a page fault due to another thread
having been preempted while modifying host virtual addresses.
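
For reference, the mechanism boils down to tracking the union of in-flight
invalidation ranges and retrying a fault only when its hva falls inside that
range. A simplified sketch of the resulting check (see patch 2 for the actual
code; this is illustrative, not the verbatim hunk):

	static inline int mmu_notifier_retry_hva(struct kvm *kvm,
						 unsigned long mmu_seq,
						 unsigned long hva)
	{
		/* Retry only if an in-flight invalidation covers this hva. */
		if (unlikely(kvm->mmu_notifier_count) &&
		    hva >= kvm->mmu_notifier_range_start &&
		    hva < kvm->mmu_notifier_range_end)
			return 1;
		/* A completed invalidation bumped the sequence: retry. */
		if (kvm->mmu_notifier_seq != mmu_seq)
			return 1;
		return 0;
	}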

v3 -> v4:
  - Fix bug by skipping prefetch during invalidation

v2 -> v3:
  - Added patch to skip check for MMIO page faults
  - Style changes

David Stevens (1):
   KVM: x86/mmu: Consider the hva in mmu_notifier retry

Sean Christopherson (1):
   KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault

  arch/powerpc/kvm/book3s_64_mmu_hv.c|  2 +-
  arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
  arch/x86/kvm/mmu/mmu.c | 23 ++--
  arch/x86/kvm/mmu/paging_tmpl.h |  7 ---
  include/linux/kvm_host.h   | 25 +-
  virt/kvm/kvm_main.c| 29 ++
  6 files changed, 72 insertions(+), 16 deletions(-)



Rebased, and queued with the fix that Sean suggested.

Paolo



Re: [PATCH kvmtool 13/21] hw/serial: Refactor trap handler

2021-02-22 Thread Alexandru Elisei
Hi Andre,

On 2/18/21 2:41 PM, Andre Przywara wrote:
> On Tue, 16 Feb 2021 14:22:05 +
> Alexandru Elisei  wrote:
>
>> Hi Andre,
>>
>> Patch looks good, nitpicks below.
>>
>> On 12/10/20 2:29 PM, Andre Przywara wrote:
>>> With the planned retirement of the special ioport emulation code, we
>>> need to provide an emulation function compatible with the MMIO prototype.
>>>
>>> Adjust the trap handler to use that new function, and provide shims to
>>> implement the old ioport interface, for now.
>>>
>>> Signed-off-by: Andre Przywara 
>>> ---
>>>  hw/serial.c | 97 +++--
>>>  1 file changed, 65 insertions(+), 32 deletions(-)
>>>
>>> diff --git a/hw/serial.c b/hw/serial.c
>>> index b0465d99..2907089c 100644
>>> --- a/hw/serial.c
>>> +++ b/hw/serial.c
>>> @@ -242,36 +242,31 @@ void serial8250__inject_sysrq(struct kvm *kvm, char sysrq)
>>> sysrq_pending = sysrq;
>>>  }
>>>  
>>> -static bool serial8250_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port,
>>> -  void *data, int size)
>>> +static bool serial8250_out(struct serial8250_device *dev, struct kvm_cpu *vcpu,
>>> +  u16 offset, u8 data)
>>>  {
>>> -   struct serial8250_device *dev = ioport->priv;
>>> -   u16 offset;
>>> bool ret = true;
>>> -   char *addr = data;
>>>  
>>> mutex_lock(&dev->mutex);
>>>  
>>> -   offset = port - dev->iobase;
>>> -
>>> switch (offset) {
>>> case UART_TX:
>>> if (dev->lcr & UART_LCR_DLAB) {
>>> -   dev->dll = ioport__read8(data);
>>> +   dev->dll = data;
>>> break;
>>> }
>>>  
>>> /* Loopback mode */
>>> if (dev->mcr & UART_MCR_LOOP) {
>>> if (dev->rxcnt < FIFO_LEN) {
>>> -   dev->rxbuf[dev->rxcnt++] = *addr;
>>> +   dev->rxbuf[dev->rxcnt++] = data;
>>> dev->lsr |= UART_LSR_DR;
>>> }
>>> break;
>>> }
>>>  
>>> if (dev->txcnt < FIFO_LEN) {
>>> -   dev->txbuf[dev->txcnt++] = *addr;
>>> +   dev->txbuf[dev->txcnt++] = data;
>>> dev->lsr &= ~UART_LSR_TEMT;
>>> if (dev->txcnt == FIFO_LEN / 2)
>>> dev->lsr &= ~UART_LSR_THRE;
>>> @@ -283,18 +278,18 @@ static bool serial8250_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port
>>> break;
>>> case UART_IER:
>>> if (!(dev->lcr & UART_LCR_DLAB))
>>> -   dev->ier = ioport__read8(data) & 0x0f;
>>> +   dev->ier = data & 0x0f;
>>> else
>>> -   dev->dlm = ioport__read8(data);
>>> +   dev->dlm = data;
>>> break;
>>> case UART_FCR:
>>> -   dev->fcr = ioport__read8(data);
>>> +   dev->fcr = data;
>>> break;
>>> case UART_LCR:
>>> -   dev->lcr = ioport__read8(data);
>>> +   dev->lcr = data;
>>> break;
>>> case UART_MCR:
>>> -   dev->mcr = ioport__read8(data);
>>> +   dev->mcr = data;
>>> break;
>>> case UART_LSR:
>>> /* Factory test */
>>> @@ -303,7 +298,7 @@ static bool serial8250_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port
>>> /* Not used */
>>> break;
>>> case UART_SCR:
>>> -   dev->scr = ioport__read8(data);
>>> +   dev->scr = data;
>>> break;
>>> default:
>>> ret = false;
>>> @@ -336,46 +331,43 @@ static void serial8250_rx(struct serial8250_device *dev, void *data)
>>> }
>>>  }
>>>  
>>> -static bool serial8250_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
>>> +static bool serial8250_in(struct serial8250_device *dev, struct kvm_cpu *vcpu,
>>> + u16 offset, u8 *data)
>>>  {
>>> -   struct serial8250_device *dev = ioport->priv;
>>> -   u16 offset;
>>> bool ret = true;
>>>  
>>> mutex_lock(&dev->mutex);
>>>  
>>> -   offset = port - dev->iobase;
>>> -
>>> switch (offset) {
>>> case UART_RX:
>>> if (dev->lcr & UART_LCR_DLAB)
>>> -   ioport__write8(data, dev->dll);
>>> +   *data = dev->dll;
>>> else
>>> serial8250_rx(dev, data);
>>> break;
>>> case UART_IER:
>>> if (dev->lcr & UART_LCR_DLAB)
>>> -   ioport__write8(data, dev->dlm);
>>> +   *data = dev->dlm;
>>> else
>>> -   ioport__write8(data, dev->ier);
>>> +   *data = dev->ier;
>>> break;
>>> case UART_IIR:
>>> -   ioport__write8(data, dev->iir | UART_IIR_TYPE_BITS);
>>> +   *data = dev->iir | UART_IIR_TYPE_BITS;
>>> break;
>>> case 

Re: [PATCH kvmtool 07/21] hw/i8042: Switch to new trap handlers

2021-02-22 Thread Alexandru Elisei
Hi Andre,

On 2/18/21 12:09 PM, Andre Przywara wrote:
> On Fri, 12 Feb 2021 10:41:20 +
> Alexandru Elisei  wrote:
>
> Hi,
>
>> On 12/10/20 2:28 PM, Andre Przywara wrote:
>>> Now that the PC keyboard has a trap handler adhering to the MMIO fault
>>> handler prototype, let's switch over to the joint registration routine.
>>>
>>> This allows us to get rid of the ioport shim routines.
>>>
>>> Make the kbd_init() function static on the way.
>>>
>>> Signed-off-by: Andre Przywara 
>>> ---
>>>  hw/i8042.c  | 30 --
>>>  include/kvm/i8042.h |  1 -
>>>  2 files changed, 4 insertions(+), 27 deletions(-)
>>>
>>> diff --git a/hw/i8042.c b/hw/i8042.c
>>> index eb1f9d28..91d79dc4 100644
>>> --- a/hw/i8042.c
>>> +++ b/hw/i8042.c
>>> @@ -325,40 +325,18 @@ static void kbd_io(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len,
>>> ioport__write8(data, value);
>>>  }
>>>  
>>> -/*
>>> - * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64)
>>> - */
>>> -static bool kbd_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
>>> -{
>>> -   kbd_io(vcpu, port, data, size, false, NULL);
>>> -
>>> -   return true;
>>> -}
>>> -
>>> -static bool kbd_out(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
>>> -{
>>> -   kbd_io(vcpu, port, data, size, true, NULL);
>>> -
>>> -   return true;
>>> -}
>>> -
>>> -static struct ioport_operations kbd_ops = {
>>> -   .io_in  = kbd_in,
>>> -   .io_out = kbd_out,
>>> -};
>>> -
>>> -int kbd__init(struct kvm *kvm)
>>> +static int kbd__init(struct kvm *kvm)
>>>  {
>>> int r;
>>>  
>>> kbd_reset();
>>> state.kvm = kvm;
>>> -   r = ioport__register(kvm, I8042_DATA_REG, &kbd_ops, 2, NULL);
>>> +   r = kvm__register_pio(kvm, I8042_DATA_REG, 2, kbd_io, NULL);  
>> I guess you are registering two addresses here to cover I8042_PORT_B_REG, 
>> right?
>> Might be worth a comment.
> I am registering two ports because the original code did, and I didn't
> dare to touch this. I guess we put this on the wishlist for the device
> emulation fixup series? ;-)
>
> Maybe the intention was to just *reserve* those ports?

Considering that I8042_DATA_REG = 0x60 and I8042_PORT_B_REG = 0x61, and the
emulation handlers handle both of them, I'm pretty sure the intention was to
reserve memory to cover both ports.
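
For illustration, the suggested comment could look something like this on top
of the registration from the patch (the wording is just a sketch):

	/*
	 * A single two-byte registration at 0x60 covers both
	 * I8042_DATA_REG (0x60) and I8042_PORT_B_REG (0x61).
	 */
	r = kvm__register_pio(kvm, I8042_DATA_REG, 2, kbd_io, NULL);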

>
>>> if (r < 0)
>>> return r;
>>> -   r = ioport__register(kvm, I8042_COMMAND_REG, &kbd_ops, 2, NULL);
>>> +   r = kvm__register_pio(kvm, I8042_COMMAND_REG, 2, kbd_io, NULL);  
>> Shouldn't length be 1? The emulation should work only for address 0x64
>> (command/status register), right? Or am I missing something?
> I don't think you are, same as above. Maybe some weird guest is using
> half-word accesses (outw)?

I think you're right, let's not mess with the device emulation right now;
after all, that's not the purpose of the series. And I'm fairly confident you
know more about the device and the x86 architecture than me, so I'll trust
your judgement on this.

Thanks,

Alex

>
> Cheers,
> Andre
>
>
>> Thanks,
>>
>> Alex
>>
>>> if (r < 0) {
>>> -   ioport__unregister(kvm, I8042_DATA_REG);
>>> +   kvm__deregister_pio(kvm, I8042_DATA_REG);
>>> return r;
>>> }
>>>  
>>> diff --git a/include/kvm/i8042.h b/include/kvm/i8042.h
>>> index 3b4ab688..cd4ae6bb 100644
>>> --- a/include/kvm/i8042.h
>>> +++ b/include/kvm/i8042.h
>>> @@ -7,6 +7,5 @@ struct kvm;
>>>  
>>>  void mouse_queue(u8 c);
>>>  void kbd_queue(u8 c);
>>> -int kbd__init(struct kvm *kvm);
>>>  
>>>  #endif  


Re: [PATCH v11 01/13] vfio: VFIO_IOMMU_SET_PASID_TABLE

2021-02-22 Thread Auger Eric
Hi Keqian,

On 2/22/21 1:20 PM, Keqian Zhu wrote:
> Hi Eric,
> 
> On 2021/2/22 18:53, Auger Eric wrote:
>> Hi Keqian,
>>
>> On 2/2/21 1:34 PM, Keqian Zhu wrote:
>>> Hi Eric,
>>>
>>> On 2020/11/16 19:00, Eric Auger wrote:
 From: "Liu, Yi L" 

 This patch adds a VFIO_IOMMU_SET_PASID_TABLE ioctl
 which aims to pass the virtual iommu guest configuration
 to the host. The latter takes the form of the so-called
 PASID table.

 Signed-off-by: Jacob Pan 
 Signed-off-by: Liu, Yi L 
 Signed-off-by: Eric Auger 

 ---
 v11 -> v12:
 - use iommu_uapi_set_pasid_table
 - check SET and UNSET are not set simultaneously (Zenghui)

 v8 -> v9:
 - Merge VFIO_IOMMU_ATTACH/DETACH_PASID_TABLE into a single
   VFIO_IOMMU_SET_PASID_TABLE ioctl.

 v6 -> v7:
 - add a comment related to VFIO_IOMMU_DETACH_PASID_TABLE

 v3 -> v4:
 - restore ATTACH/DETACH
 - add unwind on failure

 v2 -> v3:
 - s/BIND_PASID_TABLE/SET_PASID_TABLE

 v1 -> v2:
 - s/BIND_GUEST_STAGE/BIND_PASID_TABLE
 - remove the struct device arg
 ---
  drivers/vfio/vfio_iommu_type1.c | 65 +
  include/uapi/linux/vfio.h   | 19 ++
  2 files changed, 84 insertions(+)

 diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
 index 67e827638995..87ddd9e882dc 100644
 --- a/drivers/vfio/vfio_iommu_type1.c
 +++ b/drivers/vfio/vfio_iommu_type1.c
 @@ -2587,6 +2587,41 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
return ret;
  }
  
 +static void
 +vfio_detach_pasid_table(struct vfio_iommu *iommu)
 +{
 +  struct vfio_domain *d;
 +
 +  mutex_lock(&iommu->lock);
 +  list_for_each_entry(d, &iommu->domain_list, next)
 +  iommu_detach_pasid_table(d->domain);
 +
 +  mutex_unlock(&iommu->lock);
 +}
 +
 +static int
 +vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
 +{
 +  struct vfio_domain *d;
 +  int ret = 0;
 +
 +  mutex_lock(&iommu->lock);
 +
 +  list_for_each_entry(d, &iommu->domain_list, next) {
 +  ret = iommu_uapi_attach_pasid_table(d->domain, (void __user *)arg);
>>> This design is not very clear to me. This assumes all iommu_domains share
>>> the same pasid table.
>>>
>>> As I understand, it's reasonable when there is only one group in the
>>> domain, and only one domain in the vfio_iommu.
>>> If more than one group is in the vfio_iommu, the guest may put them into
>>> different guest iommu_domains, and then they have different pasid tables.
>>>
>>> Is this the use scenario?
>>
>> the vfio_iommu is attached to a container. all the groups within a
>> container share the same set of page tables (linux
>> Documentation/driver-api/vfio.rst). So to me if you want to use
>> different pasid tables, the groups need to be attached to different
>> containers. Does that make sense to you?
> OK, so this is what I understand about the design. A little question: when
> we perform attach_pasid_table on a container, maybe we ought to do a sanity
> check to make sure that only one group is in this container, instead of
> iterating over all domains?
> 
> To be frank, my main concern is that if we put each group into a different
> container under nested mode, then we give up the possibility that they can
> share stage2 page tables, which saves host memory and reduces the time of
> preparing the environment for a VM.

Referring to the QEMU integration, when you use a virtual IOMMU, there
is generally one VFIO container per viommu-protected device
(AddressSpace), independently of whether nested stage is being used. I
think the exception is if you put 2 assigned devices behind a virtual
PCIe-to-PCI bridge (pcie-pci-bridge); in that case they have the same
RID, they share the same QEMU AddressSpace and they are put in the same
container, if the kernel does not reject it (the underlying pIOMMUs
allow it). See QEMU vfio_connect_container() in hw/vfio/common.c.

In that config, if the assigned devices belong to different groups, you
may end up with 2 groups set to the same container. But this case is not
supported by the guest kernel anyway (independently of the nested stage
integration). You hit a BUG_ON, as reported a long time ago in

https://www.mail-archive.com/qemu-devel@nongnu.org/msg608047.html


> 
> To me, I'd like to understand the "container shares page table" to be:
> 1) share stage2 page table under nested mode.
under nested mode they share S2, and with this design devices also share
the same PASID table, because on the guest they are in the same group.
> 2) share stage1 page table under non-nested mode.
in non nested mode there is a single stage, by default S1.
> 
> As when we perform "map" on a container:
> 1) under nested mode, we setup stage2 mapping.
> 2) under non-nested mode, we setup stage1 mapping.
right
> 
> Indeed, 

Re: [PATCH kvmtool 06/21] hw/i8042: Refactor trap handler

2021-02-22 Thread Alexandru Elisei
Hi Andre,

On 2/18/21 11:48 AM, Andre Przywara wrote:
> On Thu, 18 Feb 2021 11:17:58 +
> Alexandru Elisei  wrote:
>
>> Hi Andre,
>>
>> On 2/18/21 10:34 AM, Andre Przywara wrote:
>>> On Thu, 11 Feb 2021 17:23:13 +
>>> Alexandru Elisei  wrote:
>>>  
 Hi Andre,

 On 12/10/20 2:28 PM, Andre Przywara wrote:  
> With the planned retirement of the special ioport emulation code, we
> need to provide an emulation function compatible with the MMIO
> prototype.
>
> Adjust the trap handler to use that new function, and provide shims to
> implement the old ioport interface, for now.
>
> Signed-off-by: Andre Przywara 
> ---
>  hw/i8042.c | 68 +++---
>  1 file changed, 34 insertions(+), 34 deletions(-)
>
> diff --git a/hw/i8042.c b/hw/i8042.c
> index 36ee183f..eb1f9d28 100644
> --- a/hw/i8042.c
> +++ b/hw/i8042.c
> @@ -292,52 +292,52 @@ static void kbd_reset(void)
>   };
>  }
>  
> -/*
> - * Called when the OS has written to one of the keyboard's ports (0x60 or 0x64)
> - */
> -static bool kbd_in(struct ioport *ioport, struct kvm_cpu *vcpu, u16 port, void *data, int size)
> +static void kbd_io(struct kvm_cpu *vcpu, u64 addr, u8 *data, u32 len,
> +u8 is_write, void *ptr)
>  {
> - switch (port) {
> - case I8042_COMMAND_REG: {
> - u8 value = kbd_read_status();
> - ioport__write8(data, value);
> + u8 value;
> +
> + if (is_write)
> + value = ioport__read8(data);
> +
> + switch (addr) {
> + case I8042_COMMAND_REG:
> + if (is_write)
> + kbd_write_command(vcpu->kvm, value);
> + else
> + value = kbd_read_status();
>   break;
> - }
> - case I8042_DATA_REG: {
> - u8 value = kbd_read_data();
> - ioport__write8(data, value);
> + case I8042_DATA_REG:
> + if (is_write)
> + kbd_write_data(value);
> + else
> + value = kbd_read_data();
>   break;
> - }
> - case I8042_PORT_B_REG: {
> - ioport__write8(data, 0x20);
> + case I8042_PORT_B_REG:
> + if (!is_write)
> + value = 0x20;
>   break;
> - }
>   default:
> - return false;
> + return;
 Any particular reason for removing the false return value? I don't see it
 mentioned in the commit message. Otherwise this is identical to the two 
 functions
 it replaces.  
>>> Because the MMIO handler prototype is void, in contrast to the PIO one.
>>> Since on returning "false" we only seem to panic kvmtool, this is of
>>> quite limited use, I'd say.  
>> Actually, in ioport.c::kvm__emulate_io(), if kvm->cfg.ioport_debug is true,
>> it will print an error and then panic in kvm_cpu__start(); otherwise the
>> error is silently ignored. serial.c is another device where an unknown
>> register returns false. In rtc.c, the unknown register is ignored.
>> cfi_flash.c prints debugging information. So I guess kvmtool implements all
>> possible methods of handling an unknown register *at the same time*, so
>> it's up to you how you want to handle it.
> Well, the MMIO prototype we are going to use is void anyway, so it's
> just one patch earlier that we get this new behaviour.
> For handling MMIO errors:
> - Hardware MMIO doesn't have a back channel: if the MMIO write triggers
>   some error condition, the device would need to deal with it (setting
>   internal error state, ignore, etc.). On some systems the device could
>   throw some kind of bus error or SError, but this is a rather drastic
>   measure, and is certainly not exercised by those ancient devices.
> - Any kind of error reporting which can be triggered by a guest is
>   frowned upon: it could spam the console or some log file, and so
>   impact host operation. At the end an administrator can't do much about
>   it, anyway.
> - Which leaves the only use to some kvmtool developer debugging some
>   device emulation or investigating weird guest behaviour. And in this
>   case we can more easily have a debug message *inside* the device
>   emulation code, can't we?

That's what I had in mind, debugging messages in the device emulation. If the
guest can access an unknown register offset, this can mean one of two things
in my opinion: the emulated device registered a memory region bigger than
necessary, or the emulated device is not handling all device registers. But
that's a subject for another series.

Thanks,

Alex

>
> And since the MMIO handler prototype is void, we have no choice anyway,
> at least not without another huge (and pointless) series to change
> those user as well ;-)
>
> Cheers,
> Andre
>
>   }
>  
> + if (!is_write)
> + 

[RFC PATCH 5/5] KVM: arm64: Make sure pinned vmid is released on VM exit

2021-02-22 Thread Shameer Kolothum
Since the pinned VMID space is not recycled, we need to make sure that
we release the vmid back into the pool when we are done with it.

Signed-off-by: Shameer Kolothum 
---
 arch/arm64/kvm/arm.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index 8955968be49f..d9900ffb88f4 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -181,8 +181,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_vgic_destroy(kvm);
 
for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-   if (kvm->vcpus[i]) {
-   kvm_vcpu_destroy(kvm->vcpus[i]);
+   struct kvm_vcpu *vcpu = kvm->vcpus[i];
+
+   if (vcpu) {
+   struct kvm_vmid *vmid = &vcpu->arch.hw_mmu->vmid;
+
+   if (refcount_read(&vmid->pinned)) {
+   ida_free(&kvm_pinned_vmids, vmid->vmid);
+   refcount_set(&vmid->pinned, 0);
+   }
+   kvm_vcpu_destroy(vcpu);
kvm->vcpus[i] = NULL;
}
}
-- 
2.17.1



[RFC PATCH 4/5] iommu/arm-smmu-v3: Use pinned VMID for NESTED stage with BTM

2021-02-22 Thread Shameer Kolothum
If the SMMU supports BTM and the device belongs to a NESTED domain
with a shared pasid table, we need to use the VMID allocated by
KVM for the s2 configuration. Hence, request a pinned VMID from KVM.

Signed-off-by: Shameer Kolothum 
---
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c | 49 -
 1 file changed, 47 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
index 26bf7da1bcd0..04f83f7c8319 100644
--- a/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
+++ b/drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c
@@ -28,6 +28,7 @@
 #include 
 #include 
 #include 
+#include <linux/kvm_host.h>
 
 #include 
 
@@ -2195,6 +2196,33 @@ static void arm_smmu_bitmap_free(unsigned long *map, int idx)
clear_bit(idx, map);
 }
 
+static int arm_smmu_pinned_vmid_get(struct arm_smmu_domain *smmu_domain)
+{
+   struct arm_smmu_master *master;
+
+   master = list_first_entry_or_null(&smmu_domain->devices,
+ struct arm_smmu_master, domain_head);
+   if (!master)
+   return -EINVAL;
+
+   return kvm_pinned_vmid_get(master->dev);
+}
+
+static int arm_smmu_pinned_vmid_put(struct arm_smmu_domain *smmu_domain)
+{
+   struct arm_smmu_master *master;
+
+   master = list_first_entry_or_null(&smmu_domain->devices,
+ struct arm_smmu_master, domain_head);
+   if (!master)
+   return -EINVAL;
+
+   if (smmu_domain->s2_cfg.vmid)
+   return kvm_pinned_vmid_put(master->dev);
+
+   return 0;
+}
+
 static void arm_smmu_domain_free(struct iommu_domain *domain)
 {
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
@@ -2215,8 +2243,11 @@ static void arm_smmu_domain_free(struct iommu_domain *domain)
mutex_unlock(&arm_smmu_asid_lock);
}
if (s2_cfg->set) {
-   if (s2_cfg->vmid)
-   arm_smmu_bitmap_free(smmu->vmid_map, s2_cfg->vmid);
+   if (s2_cfg->vmid) {
+   if (!(smmu->features & ARM_SMMU_FEAT_BTM) &&
+   smmu_domain->stage != ARM_SMMU_DOMAIN_NESTED)
+   arm_smmu_bitmap_free(smmu->vmid_map, s2_cfg->vmid);
+   }
}
 
kfree(smmu_domain);
@@ -3199,6 +3230,17 @@ static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
!(smmu->features & ARM_SMMU_FEAT_2_LVL_CDTAB))
goto out;
 
+   if (smmu->features & ARM_SMMU_FEAT_BTM) {
+   ret = arm_smmu_pinned_vmid_get(smmu_domain);
+   if (ret < 0)
+   goto out;
+
+   if (smmu_domain->s2_cfg.vmid)
+   arm_smmu_bitmap_free(smmu->vmid_map, smmu_domain->s2_cfg.vmid);
+
+   smmu_domain->s2_cfg.vmid = (u16)ret;
+   }
+
smmu_domain->s1_cfg.cdcfg.cdtab_dma = cfg->base_ptr;
smmu_domain->s1_cfg.s1cdmax = cfg->pasid_bits;
smmu_domain->s1_cfg.s1fmt = cfg->vendor_data.smmuv3.s1fmt;
@@ -3221,6 +3263,7 @@ static int arm_smmu_attach_pasid_table(struct iommu_domain *domain,
 static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
 {
struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+   struct arm_smmu_device *smmu = smmu_domain->smmu;
struct arm_smmu_master *master;
unsigned long flags;
 
@@ -3237,6 +3280,8 @@ static void arm_smmu_detach_pasid_table(struct iommu_domain *domain)
arm_smmu_install_ste_for_dev(master);
spin_unlock_irqrestore(&smmu_domain->devices_lock, flags);
 
+   if (smmu->features & ARM_SMMU_FEAT_BTM)
+   arm_smmu_pinned_vmid_put(smmu_domain);
 unlock:
mutex_unlock(&smmu_domain->init_mutex);
 }
-- 
2.17.1



[RFC PATCH 3/5] KVM: ARM64: Add support for pinned VMIDs

2021-02-22 Thread Shameer Kolothum
On an ARM64 system with an SMMUv3 implementation that fully supports
the Broadcast TLB Maintenance (BTM) feature, the CPU TLB invalidate
instructions are received by the SMMU. This is very useful when the
SMMU shares the page tables with the CPU (e.g. the Guest SVA use case).
For this to work, the SMMU must use the same VMID that is allocated
by KVM to configure the stage 2 translations.

At present KVM VMID allocations are recycled on rollover and may
change as a result. This will create issues if we have to share
the KVM VMID with the SMMU. Hence, we split the KVM VMID space into
two: the first half follows the normal recycle-on-rollover policy,
while the second half of the VMID space is used to allocate pinned
VMIDs. This feature is enabled based on a command line option,
"kvm-arm.pinned_vmid_enable".

Signed-off-by: Shameer Kolothum 
---
 arch/arm64/include/asm/kvm_host.h |   2 +
 arch/arm64/kvm/Kconfig|   1 +
 arch/arm64/kvm/arm.c  | 104 +-
 3 files changed, 106 insertions(+), 1 deletion(-)
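
As an illustration of the split described above, with 16-bit VMIDs the two
halves would work out as below; this is a sketch only, as the actual bound
setup is part of this patch but not visible in the hunks quoted here:

	unsigned int vmid_bits = kvm_get_vmid_bits();	/* e.g. 16 */

	/* First half:  [0, 2^15 - 1], recycled on rollover as before. */
	/* Second half: [2^15, 2^16 - 1], allocated via an IDA, never recycled. */
	kvm_pinned_vmid_start = 1 << (vmid_bits - 1);
	kvm_pinned_vmid_end   = (1 << vmid_bits) - 1;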

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 0cd9f0f75c13..db6441c6a580 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include <linux/refcount.h>
 #include 
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
@@ -65,6 +66,7 @@ struct kvm_vmid {
/* The VMID generation used for the virt. memory system */
u64vmid_gen;
u32vmid;
+   refcount_t   pinned;
 };
 
 struct kvm_s2_mmu {
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index 043756db8f6e..c5c52953e842 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -40,6 +40,7 @@ menuconfig KVM
select HAVE_KVM_VCPU_RUN_PID_CHANGE
select TASKSTATS
select TASK_DELAY_ACCT
+   select HAVE_KVM_PINNED_VMID
help
  Support hosting virtualized guest machines.
 
diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
index c0ffb019ca8b..8955968be49f 100644
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -56,6 +56,19 @@ static atomic64_t kvm_vmid_gen = ATOMIC64_INIT(1);
 static u32 kvm_next_vmid;
 static DEFINE_SPINLOCK(kvm_vmid_lock);
 
+static bool kvm_pinned_vmid_enable;
+
+static int __init early_pinned_vmid_enable(char *buf)
+{
+   return strtobool(buf, &kvm_pinned_vmid_enable);
+}
+
+early_param("kvm-arm.pinned_vmid_enable", early_pinned_vmid_enable);
+
+static DEFINE_IDA(kvm_pinned_vmids);
+static u32 kvm_pinned_vmid_start;
+static u32 kvm_pinned_vmid_end;
+
 static bool vgic_present;
 
 static DEFINE_PER_CPU(unsigned char, kvm_arm_hardware_enabled);
@@ -475,6 +488,10 @@ void force_vm_exit(const cpumask_t *mask)
 static bool need_new_vmid_gen(struct kvm_vmid *vmid)
 {
u64 current_vmid_gen = atomic64_read(&kvm_vmid_gen);
+
+   if (refcount_read(&vmid->pinned))
+   return false;
+
smp_rmb(); /* Orders read of kvm_vmid_gen and kvm->arch.vmid */
return unlikely(READ_ONCE(vmid->vmid_gen) != current_vmid_gen);
 }
@@ -485,6 +502,8 @@ static bool need_new_vmid_gen(struct kvm_vmid *vmid)
  */
 static void update_vmid(struct kvm_vmid *vmid)
 {
+   unsigned int vmid_bits;
+
if (!need_new_vmid_gen(vmid))
return;
 
@@ -521,7 +540,12 @@ static void update_vmid(struct kvm_vmid *vmid)
 
vmid->vmid = kvm_next_vmid;
kvm_next_vmid++;
-   kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1;
+   if (kvm_pinned_vmid_enable)
+   vmid_bits = kvm_get_vmid_bits() - 1;
+   else
+   vmid_bits = kvm_get_vmid_bits();
+
+   kvm_next_vmid &= (1 << vmid_bits) - 1;
 
smp_wmb();
WRITE_ONCE(vmid->vmid_gen, atomic64_read(&kvm_vmid_gen));
@@ -569,6 +593,71 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu)
return ret;
 }
 
+int kvm_arch_pinned_vmid_get(struct kvm *kvm)
+{
+   struct kvm_vcpu *vcpu;
+   struct kvm_vmid *kvm_vmid;
+   int ret;
+
+   if (!kvm_pinned_vmid_enable || !atomic_read(&kvm->online_vcpus))
+   return -EINVAL;
+
+   vcpu = kvm_get_vcpu(kvm, 0);
+   if (!vcpu)
+   return -EINVAL;
+
+   kvm_vmid = &vcpu->arch.hw_mmu->vmid;
+
+   spin_lock(&kvm_vmid_lock);
+
+   if (refcount_inc_not_zero(&kvm_vmid->pinned)) {
+   spin_unlock(&kvm_vmid_lock);
+   return kvm_vmid->vmid;
+   }
+
+   ret = ida_alloc_range(&kvm_pinned_vmids, kvm_pinned_vmid_start,
+ kvm_pinned_vmid_end, GFP_KERNEL);
+   if (ret < 0) {
+   spin_unlock(&kvm_vmid_lock);
+   return ret;
+   }
+
+   force_vm_exit(cpu_all_mask);
+   kvm_call_hyp(__kvm_flush_vm_context);
+
+   kvm_vmid->vmid = (u32)ret;
+   refcount_set(&kvm_vmid->pinned, 1);
+   spin_unlock(&kvm_vmid_lock);
+
+   return ret;
+}
+
+int kvm_arch_pinned_vmid_put(struct kvm *kvm)
+{
+   struct kvm_vcpu *vcpu;
+   struct kvm_vmid *kvm_vmid;
+
+ 

[RFC PATCH 2/5] KVM: Add generic infrastructure to support pinned VMIDs

2021-02-22 Thread Shameer Kolothum
Provide generic helper functions to get/put pinned VMIDs if the arch
supports it.

Signed-off-by: Shameer Kolothum 
---
 include/linux/kvm_host.h | 17 +
 virt/kvm/Kconfig |  2 ++
 virt/kvm/kvm_main.c  | 25 +
 3 files changed, 44 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 7f2e2a09ebbd..25856db74a08 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -836,6 +836,8 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu, bool usermode_vcpu_not_eligible);
+int kvm_pinned_vmid_get(struct device *dev);
+int kvm_pinned_vmid_put(struct device *dev);
 
 void kvm_flush_remote_tlbs(struct kvm *kvm);
 void kvm_reload_remote_mmus(struct kvm *kvm);
@@ -1478,4 +1480,19 @@ static inline void kvm_handle_signal_exit(struct kvm_vcpu *vcpu)
 }
 #endif /* CONFIG_KVM_XFER_TO_GUEST_WORK */
 
+#ifdef CONFIG_HAVE_KVM_PINNED_VMID
+int kvm_arch_pinned_vmid_get(struct kvm *kvm);
+int kvm_arch_pinned_vmid_put(struct kvm *kvm);
+#else
+static inline int kvm_arch_pinned_vmid_get(struct kvm *kvm)
+{
+   return -EINVAL;
+}
+
+static inline int kvm_arch_pinned_vmid_put(struct kvm *kvm)
+{
+   return -EINVAL;
+}
+#endif
+
 #endif
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index 1c37ccd5d402..bb55616c5616 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -63,3 +63,5 @@ config HAVE_KVM_NO_POLL
 
 config KVM_XFER_TO_GUEST_WORK
bool
+config HAVE_KVM_PINNED_VMID
+   bool
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17ff1c4..632d391f0e34 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -51,6 +51,7 @@
 #include 
 #include 
 #include 
+#include <linux/vfio.h>
 
 #include 
 #include 
@@ -2849,6 +2850,30 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
 
+int kvm_pinned_vmid_get(struct device *dev)
+{
+   struct kvm *kvm;
+
+   kvm = vfio_kvm_get_from_dev(dev);
+   if (!kvm)
+   return -EINVAL;
+
+   return kvm_arch_pinned_vmid_get(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_pinned_vmid_get);
+
+int kvm_pinned_vmid_put(struct device *dev)
+{
+   struct kvm *kvm;
+
+   kvm = vfio_kvm_get_from_dev(dev);
+   if (!kvm)
+   return -EINVAL;
+
+   return kvm_arch_pinned_vmid_put(kvm);
+}
+EXPORT_SYMBOL_GPL(kvm_pinned_vmid_put);
+
 #ifndef CONFIG_S390
 /*
  * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
-- 
2.17.1



[RFC PATCH 1/5] vfio: Add a helper to retrieve kvm instance from a dev

2021-02-22 Thread Shameer Kolothum
A device that belongs to a vfio_group has the kvm instance associated
with it. Add a helper to retrieve it.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio.c  | 12 
 include/linux/vfio.h |  1 +
 2 files changed, 13 insertions(+)

diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 2151bc7f87ab..d1968e4bf9f4 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -876,6 +876,18 @@ struct vfio_device *vfio_device_get_from_dev(struct device *dev)
 }
 EXPORT_SYMBOL_GPL(vfio_device_get_from_dev);
 
+struct kvm *vfio_kvm_get_from_dev(struct device *dev)
+{
+   struct vfio_group *group;
+
+   group = vfio_group_get_from_dev(dev);
+   if (!group)
+   return NULL;
+
+   return group->kvm;
+}
+EXPORT_SYMBOL_GPL(vfio_kvm_get_from_dev);
+
 static struct vfio_device *vfio_device_get_from_name(struct vfio_group *group,
 char *buf)
 {
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 38d3c6a8dc7e..8716298fee27 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -56,6 +56,7 @@ extern void *vfio_del_group_dev(struct device *dev);
 extern struct vfio_device *vfio_device_get_from_dev(struct device *dev);
 extern void vfio_device_put(struct vfio_device *device);
 extern void *vfio_device_data(struct vfio_device *device);
+extern struct kvm *vfio_kvm_get_from_dev(struct device *dev);
 
 /**
  * struct vfio_iommu_driver_ops - VFIO IOMMU driver callbacks
-- 
2.17.1



[RFC PATCH 0/5] KVM/ARM64 Add support for pinned VMIDs

2021-02-22 Thread Shameer Kolothum
On an ARM64 system with an SMMUv3 implementation that fully supports
the Broadcast TLB Maintenance (BTM) feature as part of the Distributed
Virtual Memory (DVM) protocol, the CPU TLB invalidate instructions are
received by the SMMUv3. This is very useful when the SMMUv3 shares the
page tables with the CPU (e.g. the Guest SVA use case). For this to work,
the SMMU must use the same VMID that is allocated by KVM to configure
the stage 2 translations. At present KVM VMID allocations are recycled
on rollover and may change as a result. This will create issues if we
have to share the KVM VMID with the SMMU.
 
Please see the discussion here,
https://lore.kernel.org/linux-iommu/20200522101755.GA3453945@myrica/

This series proposes a way to share the VMID between KVM and IOMMU
driver by,

1. Splitting the KVM VMID space into two equal halves based on the
   command line option "kvm-arm.pinned_vmid_enable".
2. First half of the VMID space follows the normal recycle on rollover
   policy.
3. Second half of the VMID space doesn't roll over and is used to
   allocate pinned VMIDs.
4. Provides helper function to retrieve the KVM instance associated
   with a device(if it is part of a vfio group).
5. Introduces generic interfaces to get/put pinned KVM VMIDs (see the
   sketch below).
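
A rough sketch of how an IOMMU driver is expected to consume the new
interface (patches 2 and 4); the function name example_setup_s2_vmid is
illustrative only, and error handling is trimmed:

	static int example_setup_s2_vmid(struct arm_smmu_domain *smmu_domain,
					 struct device *dev)
	{
		int vmid;

		/* Resolves dev -> vfio_group -> kvm, then pins the KVM VMID. */
		vmid = kvm_pinned_vmid_get(dev);
		if (vmid < 0)
			return vmid;

		/* Program the pinned VMID into the stage 2 configuration. */
		smmu_domain->s2_cfg.vmid = (u16)vmid;
		return 0;
	}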

Open Items:
1. I couldn't figure out a way to determine whether a platform actually
   fully supports DVM/BTM or not. Not sure we can take a call based on
   SMMUv3 BTM feature bit alone. Probably we can get it from firmware
   via IORT?
2. The current splitting of VMID space is only one way to do this and
   probably not the best. Maybe we can follow the pinned ASID method used
   in SVA code. Suggestions welcome here.
3. The detach_pasid_table() interface is not very clear to me, as the current
   Qemu prototype is not using that. This requires fixing from my side.
 
This is based on Jean-Philippe's SVA series[1] and Eric's SMMUv3 dual-stage
support series[2].

The branch with the whole vSVA + BTM solution is here,
https://github.com/hisilicon/kernel-dev/tree/5.10-rc4-2stage-v13-vsva-btm-rfc

This is lightly tested on a HiSilicon D06 platform with uacce/zip dev test tool,
./zip_sva_per -k tlb

Thanks,
Shameer

1. https://github.com/Linaro/linux-kernel-uadk/commits/uacce-devel-5.10
2. https://lore.kernel.org/linux-iommu/20201118112151.25412-1-eric.au...@redhat.com/T/

Shameer Kolothum (5):
  vfio: Add a helper to retrieve kvm instance from a dev
  KVM: Add generic infrastructure to support pinned VMIDs
  KVM: ARM64: Add support for pinned VMIDs
  iommu/arm-smmu-v3: Use pinned VMID for NESTED stage with BTM
  KVM: arm64: Make sure pinned vmid is released on VM exit

 arch/arm64/include/asm/kvm_host.h   |   2 +
 arch/arm64/kvm/Kconfig  |   1 +
 arch/arm64/kvm/arm.c| 116 +++-
 drivers/iommu/arm/arm-smmu-v3/arm-smmu-v3.c |  49 -
 drivers/vfio/vfio.c |  12 ++
 include/linux/kvm_host.h|  17 +++
 include/linux/vfio.h|   1 +
 virt/kvm/Kconfig|   2 +
 virt/kvm/kvm_main.c |  25 +
 9 files changed, 220 insertions(+), 5 deletions(-)

-- 
2.17.1



Re: [PATCH kvmtool 04/21] mmio: Extend handling to include ioport emulation

2021-02-22 Thread Alexandru Elisei
Hi Andre,

On 2/17/21 5:43 PM, Andre Przywara wrote:
> On Thu, 11 Feb 2021 16:10:16 +
> Alexandru Elisei  wrote:
>
> Hi,
>
>> On 12/10/20 2:28 PM, Andre Przywara wrote:
>>> In their core functionality MMIO and I/O port traps are not really
>>> different, yet we still have two totally separate code paths for
>>> handling them. Devices need to decide on one conduit or need to provide
>>> different handler functions for each of them.
>>>
>>> Extend the existing MMIO emulation to also cover ioport handlers.
>>> This just adds another RB tree root for holding the I/O port handlers,
>>> but otherwise uses the same tree population and lookup code.  
>> Maybe I'm missing something, but why two trees? Is it valid to have an
>> overlap between IO port and MMIO emulation? Or was it done to make the
>> removal of ioport emulation easier?
> So I thought about it as well, but figured it's easier this way:
> - x86 allows overlap, PIO is a totally separate address space from
>   memory/MMIO. Early x86 CPUs had pins to indicate a PIO bus cycle, but
>   using the same address and data pins otherwise. In practice there
>   might be no overlap when it comes to *MMIO* traps vs PIO on x86
>   (there is DRAM only at the lowest 64K of the IBM PC memory map),
>   but not sure we should rely on this.
> - For non-x86 this would indeed be non-overlapping, but this would need
>   to be translated at init time then? And then we can't move those
>   anymore, I guess? So I found it cleaner to keep this separate, and do
>   the translation at trap time.
> - As a consequence we would need to have a bit indicating the address
>   space. I haven't actually tried this, but my understanding is that
>   this would spoil the whole rb_tree functions, since they rely on a
>   linear addressing scheme, and adding another bit there would be at
>   least cumbersome?
>
> At the end I decided to go for separate trees, as also this was less
> change.
>
> I agree that it would be nice to have one tree, from a design point of
> view, but I am afraid that would require more changes.
> If need be, I think we can always unify them later on, on top of this
> series?

Definitely later, I forgot that x86 uses special instructions to access IO 
ports,
which means that port addresses can overlap with memory addresses. Let's keep 2
trees for now and we can decide later if we should unify them for the other
architectures.
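
For the record, the two-tree lookup can be sketched as below (names are
illustrative; IOTRAP_BUS_MASK and the DEVICE_BUS_* flags from devices.h are
discussed further down in this thread):

	static struct rb_root pio_tree = RB_ROOT;
	static struct rb_root mmio_tree = RB_ROOT;

	/* Pick the tree based on the bus encoded in the registration flags. */
	static struct rb_root *iotrap_tree(unsigned int flags)
	{
		if ((flags & IOTRAP_BUS_MASK) == DEVICE_BUS_IOPORT)
			return &pio_tree;
		return &mmio_tree;
	}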

>
>> If it's not valid to have that overlap, then I think having one tree for
>> both would be better. Struct mmio_mapping would have to be augmented with a
>> flags field that holds the same flags given to kvm__register_iotrap to
>> differentiate between the two slightly different emulations. Saving the
>> IOTRAP_COALESCE flag would also make it trivial to call
>> KVM_UNREGISTER_COALESCED_MMIO in kvm__deregister_iotrap, which we currently
>> don't do.
>>
>>> "ioport" or "mmio" just become a flag in the registration function.
>>> Provide wrappers to not break existing users, and allow an easy
>>> transition for the existing ioport handlers.
>>>
>>> This also means that ioport handlers now can use the same emulation
>>> callback prototype as MMIO handlers, which means we have to migrate them
>>> over. To allow a smooth transition, we hook up the new I/O emulate
>>> function to the end of the existing ioport emulation code.  
>> I'm sorry, but I don't understand that last sentence. Do you mean that the
>> ioport emulation code has been modified to use kvm__emulate_pio() as a
>> fallback for when the port is not found in the ioport_tree?
> I meant that for the transition period we have all of traditional MMIO,
> traditional PIO, *and* just transformed PIO.
>
> That means there are still PIO devices registered through ioport.c's
> ioport__register(), *and* PIO devices registered through mmio.c's
> kvm__register_pio(). Which means they end up in two separate PIO trees.
> And only the traditional kvm__emulate_io() from ioport.c is called upon
> a trap, so it needs to check both trees, which it does by calling into
> kvm__emulate_pio(), shall a search in the local tree fail.

Thank you for the explanation, now it makes sense.

>
> Or did you mean something else?
>
>>> Signed-off-by: Andre Przywara 
>>> ---
>>>  include/kvm/kvm.h | 42 +
>>>  ioport.c  |  4 ++--
>>>  mmio.c| 59 +++
>>>  3 files changed, 89 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/include/kvm/kvm.h b/include/kvm/kvm.h
>>> index ee99c28e..14f9d58b 100644
>>> --- a/include/kvm/kvm.h
>>> +++ b/include/kvm/kvm.h
>>> @@ -27,10 +27,16 @@
>>>  #define PAGE_SIZE (sysconf(_SC_PAGE_SIZE))
>>>  #endif
>>>  
>>> +#define IOTRAP_BUS_MASK	0xf
>> It's not immediately obvious what this mask does. It turns out it's used to
>> mask the enum flags defined in the header devices.h, a header which is not
>> included in this file.
>>
>> The flag names we 

Re: [RFC PATCH v2 16/26] KVM: arm64: Prepare Hyp memory protection

2021-02-22 Thread Quentin Perret
Hi Sean,

On Friday 19 Feb 2021 at 10:32:58 (-0800), Sean Christopherson wrote:
> On Wed, Feb 03, 2021, Will Deacon wrote:
> > On Fri, Jan 08, 2021 at 12:15:14PM +, Quentin Perret wrote:
> 
> ...
> 
> > > +static inline unsigned long hyp_s1_pgtable_size(void)
> > > +{
> 
> ...
> 
> > > + res += nr_pages << PAGE_SHIFT;
> > > + }
> > > +
> > > + /* Allow 1 GiB for private mappings */
> > > + nr_pages = (1 << 30) >> PAGE_SHIFT;
> > 
> > SZ_1G >> PAGE_SHIFT
> 
> Where does the 1gb magic number come from?

Admittedly it is arbitrary. It needs to be enough to cover all the
so-called 'private' mappings that EL2 needs, which can vary a little
depending on the hardware.

> IIUC, this is calculating the number
> of pages needed for the hypervisor's Stage-1 page tables.

Correct. The thing worth noting is that the hypervisor VA space is
essentially split in half. One half is reserved to map portions of
memory with a fixed offset, and the other half is used for a whole bunch
of other things: we have a vmemmap, the 'private' mappings and the idmap
page.

> The amount of memory
> needed for those page tables should be easily calculated

As mentioned above, that is true for pretty much everything in the hyp
VA space except the private mappings as that depends on e.g. the CPU
uarch and such.

> and assuming huge pages can be used, should be far less than 1gb.

Ack, though this is not supported for the EL2 mappings yet. Historically
the amount of contiguous memory mapped at EL2 has been rather small, so
there wasn't really a need, but we might want to revisit this at some
point.

> > > + nr_pages = __hyp_pgtable_max_pages(nr_pages);
> > > + res += nr_pages << PAGE_SHIFT;
> > > +
> > > + return res;
> 
> ...
> 
> > > +void __init kvm_hyp_reserve(void)
> > > +{
> > > + u64 nr_pages, prev;
> > > +
> > > + if (!is_hyp_mode_available() || is_kernel_in_hyp_mode())
> > > + return;
> > > +
> > > + if (kvm_get_mode() != KVM_MODE_PROTECTED)
> > > + return;
> > > +
> > > + if (kvm_nvhe_sym(hyp_memblock_nr) < 0) {
> > > + kvm_err("Failed to register hyp memblocks\n");
> > > + return;
> > > + }
> > > +
> > > + sort_memblock_regions();
> > > +
> > > + /*
> > > +  * We don't know the number of possible CPUs yet, so allocate for the
> > > +  * worst case.
> > > +  */
> > > + hyp_mem_size += NR_CPUS << PAGE_SHIFT;
> 
> Is this for per-cpu stack?

Correct.

> If so, what guarantees a single page is sufficient? Mostly a curiosity 
> question,
> since it looks like this is an existing assumption by init_hyp_mode().  
> Shouldn't
> the required stack size be defined in bytes and converted to pages, or is 
> there a
> guarantee that 64kb pages will be used?

Nope, we have no such guarantees, but 4K has been more than enough for
EL2 so far. The hyp code doesn't use recursion much (I think the only
occurrence we have is Will's pgtable code, and that is architecturally
limited to 4 levels of recursion for obvious reasons) and doesn't make
much use of stack allocations.

It's on my todo list to remap the stack pages in the 'private' range, to
surround them with guard pages so we can at least run-time check this
assumption, so stay tuned :)

> > There was a recent patch bumping NR_CPUs to 512, so this would be 32MB
> > with 64k pages. Is it possible to return memory to the host later on once
> > we have a better handle on the number of CPUs in the system?
> 
> Does kvm_hyp_reserve() really need to be called during bootmem_init()?  What
> prevents doing the reservation during init_hyp_mode()?  If the problem is that
> pKVM needs a single contiguous chunk of memory, then it might be worth solving
> _that_ problem, e.g. letting the host donate memory in N-byte chunks instead 
> of
> requiring a single huge blob of memory.

Right, I've been thinking about this over the weekend and that might
actually be fairly straightforward for stack pages. I'll try to move this
allocation to init_hyp_mode() where it belongs (or better, re-use the
existing one) in the next version.

Thanks,
Quentin


Re: [PATCH kvmtool 01/21] ioport: Remove ioport__setup_arch()

2021-02-22 Thread Alexandru Elisei
Hi Andre,

On 2/22/21 10:23 AM, Andre Przywara wrote:
> On Wed, 17 Feb 2021 16:46:47 +
> Andre Przywara  wrote:
>
>> On Thu, 11 Feb 2021 17:32:01 +
>> Alexandru Elisei  wrote:
>>
>> Hi,
>>
>>> On 2/11/21 5:16 PM, Andre Przywara wrote:  
 On Wed, 10 Feb 2021 17:44:59 +
 Alexandru Elisei  wrote:

 Hi Alex,

> On 12/10/20 2:28 PM, Andre Przywara wrote:
>> Since x86 had a special need for registering tons of special I/O ports,
>> we had an ioport__setup_arch() callback, to allow each architecture
>> to do the same. As it turns out no one uses it beside x86, so we remove
>> that unnecessary abstraction.
>>
>> The generic function was registered via a device_base_init() call, so
>> we just do the same for the x86 specific function only, and can remove
>> the unneeded ioport__setup_arch().
>>
>> Signed-off-by: Andre Przywara 
>> ---
>>  arm/ioport.c |  5 -
>>  include/kvm/ioport.h |  1 -
>>  ioport.c | 28 
>>  mips/kvm.c   |  5 -
>>  powerpc/ioport.c |  6 --
>>  x86/ioport.c | 25 -
>>  6 files changed, 24 insertions(+), 46 deletions(-)
>>
>> diff --git a/arm/ioport.c b/arm/ioport.c
>> index 2f0feb9a..24092c9d 100644
>> --- a/arm/ioport.c
>> +++ b/arm/ioport.c
>> @@ -1,11 +1,6 @@
>>  #include "kvm/ioport.h"
>>  #include "kvm/irq.h"
>>  
>> -int ioport__setup_arch(struct kvm *kvm)
>> -{
>> -return 0;
>> -}
>> -
>>  void ioport__map_irq(u8 *irq)
>>  {
>>  *irq = irq__alloc_line();
>> diff --git a/include/kvm/ioport.h b/include/kvm/ioport.h
>> index 039633f7..d0213541 100644
>> --- a/include/kvm/ioport.h
>> +++ b/include/kvm/ioport.h
>> @@ -35,7 +35,6 @@ struct ioport_operations {
>>  enum 
>> irq_type));
>>  };
>>  
>> -int ioport__setup_arch(struct kvm *kvm);
>>  void ioport__map_irq(u8 *irq);
>>  
>>  int __must_check ioport__register(struct kvm *kvm, u16 port, struct 
>> ioport_operations *ops,
>> diff --git a/ioport.c b/ioport.c
>> index 844a832d..667e8386 100644
>> --- a/ioport.c
>> +++ b/ioport.c
>> @@ -158,21 +158,6 @@ int ioport__unregister(struct kvm *kvm, u16 port)
>>  return 0;
>>  }
>>  
>> -static void ioport__unregister_all(void)
>> -{
>> -struct ioport *entry;
>> -struct rb_node *rb;
>> -struct rb_int_node *rb_node;
>> -
>> -rb = rb_first(&ioport_tree);
>> -while (rb) {
>> -rb_node = rb_int(rb);
>> -entry = ioport_node(rb_node);
>> -ioport_unregister(&ioport_tree, entry);
>> -rb = rb_first(&ioport_tree);
>> -}
>> -}  
> I get the impression this is a rebasing artifact. The commit message doesn't
> mention anything about removing ioport__exit() -> ioport__unregister_all(),
> and as far as I can tell it's still needed because there are places other
> than ioport__setup_arch() from where ioport__register() is called.
 I agree that the commit message is a bit thin on this fact, but the
 functionality of ioport__unregister_all() is now in
 x86/ioport.c:ioport__remove_arch(). I think removing ioport__init()
 without removing ioport__exit() as well would look very weird, if not
 hackish.
>>> Not necessarily. ioport__unregister_all() removes the ioports added by
>>> x86/ioport.c::ioport__setup_arch(), *plus* ioports added by different
>>> devices, like serial, rtc, virtio-pci and vfio-pci (which are used by
>>> arm/arm64).
>> Right, indeed. Not that it really matters, since we are about to exit
>> anyway, but it looks indeed I need to move this to a generic teardown
>> method, or actually just keep that part here in this file.
>>
>> Will give this a try.
> Well, now having a closer look I needed to remove this from here,
> because this whole file will go away.
> To keep the current functionality, we would need to add it to mmio.c,
> and interestingly we don't do any kind of similar cleanup there for the
> MMIO regions (probably this is kvmtool exiting anyway, see above).

This is a very good point. If the MMIO emulation doesn't unregister each MMIO
region before exiting (and has never done that since it was implemented), then I
don't think there's a reason that we should add it now. After all, kvmtool will
terminate after calling dev_base_exit destructors, which will take care of
deallocating the entire process memory.

Thanks,

Alex

>
> I will see if I can introduce it there, for good measure.
>
> Cheers,
> Andre
>
>
>> Thanks!
>> Andre
>>
 I can amend the commit message to mention this, or is there anything
 else I missed?

 

Re: [PATCH v11 01/13] vfio: VFIO_IOMMU_SET_PASID_TABLE

2021-02-22 Thread Keqian Zhu
Hi Eric,

On 2021/2/22 18:53, Auger Eric wrote:
> Hi Keqian,
> 
> On 2/2/21 1:34 PM, Keqian Zhu wrote:
>> Hi Eric,
>>
>> On 2020/11/16 19:00, Eric Auger wrote:
>>> From: "Liu, Yi L" 
>>>
>>> This patch adds a VFIO_IOMMU_SET_PASID_TABLE ioctl
>>> which aims to pass the virtual iommu guest configuration
>>> to the host. The latter takes the form of the so-called
>>> PASID table.
>>>
>>> Signed-off-by: Jacob Pan 
>>> Signed-off-by: Liu, Yi L 
>>> Signed-off-by: Eric Auger 
>>>
>>> ---
>>> v11 -> v12:
>>> - use iommu_uapi_set_pasid_table
>>> - check SET and UNSET are not set simultaneously (Zenghui)
>>>
>>> v8 -> v9:
>>> - Merge VFIO_IOMMU_ATTACH/DETACH_PASID_TABLE into a single
>>>   VFIO_IOMMU_SET_PASID_TABLE ioctl.
>>>
>>> v6 -> v7:
>>> - add a comment related to VFIO_IOMMU_DETACH_PASID_TABLE
>>>
>>> v3 -> v4:
>>> - restore ATTACH/DETACH
>>> - add unwind on failure
>>>
>>> v2 -> v3:
>>> - s/BIND_PASID_TABLE/SET_PASID_TABLE
>>>
>>> v1 -> v2:
>>> - s/BIND_GUEST_STAGE/BIND_PASID_TABLE
>>> - remove the struct device arg
>>> ---
>>>  drivers/vfio/vfio_iommu_type1.c | 65 +
>>>  include/uapi/linux/vfio.h   | 19 ++
>>>  2 files changed, 84 insertions(+)
>>>
>>> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
>>> index 67e827638995..87ddd9e882dc 100644
>>> --- a/drivers/vfio/vfio_iommu_type1.c
>>> +++ b/drivers/vfio/vfio_iommu_type1.c
>>> @@ -2587,6 +2587,41 @@ static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
>>> return ret;
>>>  }
>>>  
>>> +static void
>>> +vfio_detach_pasid_table(struct vfio_iommu *iommu)
>>> +{
>>> +   struct vfio_domain *d;
>>> +
>>> +   mutex_lock(&iommu->lock);
>>> +   list_for_each_entry(d, &iommu->domain_list, next)
>>> +   iommu_detach_pasid_table(d->domain);
>>> +
>>> +   mutex_unlock(&iommu->lock);
>>> +}
>>> +
>>> +static int
>>> +vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
>>> +{
>>> +   struct vfio_domain *d;
>>> +   int ret = 0;
>>> +
>>> +   mutex_lock(&iommu->lock);
>>> +
>>> +   list_for_each_entry(d, &iommu->domain_list, next) {
>>> +   ret = iommu_uapi_attach_pasid_table(d->domain, (void __user *)arg);
>> This design is not very clear to me. This assumes all iommu_domains share
>> the same pasid table.
>>
>> As I understand, it's reasonable when there is only one group in the
>> domain, and only one domain in the vfio_iommu.
>> If more than one group is in the vfio_iommu, the guest may put them into
>> different guest iommu_domains, and then they have different pasid tables.
>>
>> Is this the use scenario?
> 
> the vfio_iommu is attached to a container. all the groups within a
> container share the same set of page tables (linux
> Documentation/driver-api/vfio.rst). So to me if you want to use
> different pasid tables, the groups need to be attached to different
> containers. Does that make sense to you?
OK, so this is what I understand about the design. A little question: when
we perform attach_pasid_table on a container, maybe we ought to do a sanity
check to make sure that only one group is in this container, instead of
iterating over all domains?

To be frank, my main concern is that if we put each group into a different
container under nested mode, then we give up the possibility that they can
share stage2 page tables, which saves host memory and reduces the time of
preparing the environment for a VM.

To me, I'd like to understand the "container shares page table" to be:
1) share stage2 page table under nested mode.
2) share stage1 page table under non-nested mode.

As when we perform "map" on a container:
1) under nested mode, we setup stage2 mapping.
2) under non-nested mode, we setup stage1 mapping.

Indeed, to realize stage2 mapping sharing, we should do much more work to
refactor SMMU_DOMAIN...

Hope you can consider this. :)

Thanks,
Keqian

> 
> Thanks
> 
> Eric
>>
>> Thanks,
>> Keqian
>>
>>> +   if (ret)
>>> +   goto unwind;
>>> +   }
>>> +   goto unlock;
>>> +unwind:
>>> +   list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
>>> +   iommu_detach_pasid_table(d->domain);
>>> +   }
>>> +unlock:
>>> +   mutex_unlock(&iommu->lock);
>>> +   return ret;
>>> +}
>>> +
>>>  static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
>>>struct vfio_info_cap *caps)
>>>  {
>>> @@ -2747,6 +2782,34 @@ static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
>>> -EFAULT : 0;
>>>  }
>>>  
>>> +static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu,
>>> +   unsigned long arg)
>>> +{
>>> +   struct vfio_iommu_type1_set_pasid_table spt;
>>> +   unsigned long minsz;
>>> +   int ret = -EINVAL;
>>> +
>>> +   minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table, flags);
>>> +
>>> +   if (copy_from_user(&spt, (void __user *)arg, minsz))
>>> +   return -EFAULT;
>>> +
>>> +   if (spt.argsz < minsz)
>>> +  

Re: [PATCH v7 23/23] [DO NOT MERGE] arm64: Cope with CPUs stuck in VHE mode

2021-02-22 Thread Jonathan Neuschäfer
Hi,

On Mon, Feb 08, 2021 at 09:57:32AM +, Marc Zyngier wrote:
> It seems that the CPU known as Apple M1 has the terrible habit
> of being stuck with HCR_EL2.E2H==1, in violation of the architecture.

Minor nitpick from the sideline: The M1 SoC has two kinds of CPU in it
(Icestorm and Firestorm), which makes "CPU known as Apple M1" a bit
imprecise.

In practice it seems unlikely, though, that Icestorm and Firestorm
act differently with regards to the code in this patch.


Best regards,
Jonathan Neuschäfer

> 
> Try and work around this deplorable state of affairs by detecting
> the stuck bit early and short-circuit the nVHE dance. It is still
> unknown whether there are many more such nuggets to be found...
> 
> Reported-by: Hector Martin 
> Signed-off-by: Marc Zyngier 
> ---
>  arch/arm64/kernel/head.S | 33 ++---
>  arch/arm64/kernel/hyp-stub.S | 28 
>  2 files changed, 54 insertions(+), 7 deletions(-)
[...]




[PATCH v4 2/2] KVM: x86/mmu: Consider the hva in mmu_notifier retry

2021-02-22 Thread David Stevens
From: David Stevens 

Track the range being invalidated by mmu_notifier and skip page fault
retries if the fault address is not affected by the in-progress
invalidation. Handle concurrent invalidations by finding the minimal
range which includes all ranges being invalidated. Although the combined
range may include unrelated addresses and cannot be shrunk as individual
invalidation operations complete, it is unlikely the marginal gains of
proper range tracking are worth the additional complexity.

The primary benefit of this change is the reduction in the likelihood of
extreme latency when handling a page fault due to another thread having
been preempted while modifying host virtual addresses.

Signed-off-by: David Stevens 
---
v3 -> v4:
 - Skip prefetch while invalidations are in progress

v2 -> v3:
 - Removed unnecessary ifdef
 - Style changes

v1 -> v2:
 - Improve handling of concurrent invalidation requests by unioning
   ranges, instead of just giving up and using [0, ULONG_MAX).
 - Add lockdep check
 - Code comments and formatting
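
For reference, the range unioning described above boils down to something
like the following in the invalidate_range_start path (a simplified sketch
of the kvm_main.c change, not the verbatim hunk):

	kvm->mmu_notifier_count++;
	if (likely(kvm->mmu_notifier_count == 1)) {
		/* First in-flight invalidation: track its range directly. */
		kvm->mmu_notifier_range_start = range->start;
		kvm->mmu_notifier_range_end = range->end;
	} else {
		/*
		 * Concurrent invalidations: widen to the union. The range may
		 * include unrelated addresses and is not shrunk as individual
		 * invalidations complete.
		 */
		kvm->mmu_notifier_range_start =
			min(kvm->mmu_notifier_range_start, range->start);
		kvm->mmu_notifier_range_end =
			max(kvm->mmu_notifier_range_end, range->end);
	}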

 arch/powerpc/kvm/book3s_64_mmu_hv.c|  2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
 arch/x86/kvm/mmu/mmu.c | 23 ++--
 arch/x86/kvm/mmu/paging_tmpl.h |  7 ---
 include/linux/kvm_host.h   | 25 +-
 virt/kvm/kvm_main.c| 29 ++
 6 files changed, 72 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 38ea396a23d6..8e06cd3f759c 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -590,7 +590,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_vcpu *vcpu,
} else {
/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-  writing, &write_ok);
+  writing, &write_ok, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index bb35490400e9..e603de7ade52 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -822,7 +822,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
 
/* Call KVM generic code to do the slow-path check */
pfn = __gfn_to_pfn_memslot(memslot, gfn, false, NULL,
-  writing, upgrade_p);
+  writing, upgrade_p, NULL);
if (is_error_noslot_pfn(pfn))
return -EFAULT;
page = NULL;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 9ac0a727015d..f6aaac729667 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2758,6 +2758,13 @@ static void direct_pte_prefetch(struct kvm_vcpu *vcpu, 
u64 *sptep)
if (sp->role.level > PG_LEVEL_4K)
return;
 
+   /*
+* If addresses are being invalidated, skip prefetching to avoid
+* accidentally prefetching those addresses.
+*/
+   if (unlikely(vcpu->kvm->mmu_notifier_count))
+   return;
+
__direct_pte_prefetch(vcpu, sp, sptep);
 }
 
@@ -3658,8 +3665,8 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu 
*vcpu, gpa_t cr2_or_gpa,
 }
 
 static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
-gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
-bool *writable)
+gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva,
+bool write, bool *writable)
 {
struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
bool async;
@@ -3672,7 +3679,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool 
prefault, gfn_t gfn,
}
 
async = false;
-   *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
+   *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async,
+   write, writable, hva);
if (!async)
return false; /* *pfn has correct page already */
 
@@ -3686,7 +3694,8 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool 
prefault, gfn_t gfn,
return true;
}
 
-   *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
+   *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL,
+   write, writable, hva);
return false;
 }
 
@@ -3699,6 +3708,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t 
gpa, u32 error_code,
gfn_t gfn = gpa >> PAGE_SHIFT;
unsigned long mmu_seq;
kvm_pfn_t pfn;
+   hva_t hva;
int r;
 
if 

[PATCH v4 1/2] KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault

2021-02-22 Thread David Stevens
From: Sean Christopherson 

Don't retry a page fault due to an mmu_notifier invalidation when
handling a page fault for a GPA that did not resolve to a memslot, i.e.
an MMIO page fault.  Invalidations from the mmu_notifier signal a change
in a host virtual address (HVA) mapping; without a memslot, there is no
HVA and thus no possibility that the invalidation is relevant to the
page fault being handled.

Note, the MMIO vs. memslot generation checks handle the case where a
pending memslot will create a memslot overlapping the faulting GPA.  The
mmu_notifier checks are orthogonal to memslot updates.
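
For context, the memslot generation safeguard works independently of the
mmu_notifier: an MMIO spte caches the memslot generation at creation
time and is treated as stale once a memslot update bumps the generation.
A loose sketch, with assumed helper and mask names rather than the exact
kernel identifiers:

	static bool mmio_spte_is_stale(struct kvm_vcpu *vcpu, u64 spte)
	{
		u64 spte_gen = get_mmio_spte_generation(spte);
		u64 kvm_gen  = kvm_vcpu_memslots(vcpu)->generation &
			       MMIO_SPTE_GEN_MASK;

		/* A stale generation means a memslot update happened. */
		return spte_gen != kvm_gen;
	}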

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/mmu/mmu.c | 2 +-
 arch/x86/kvm/mmu/paging_tmpl.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 6d16481aa29d..9ac0a727015d 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3725,7 +3725,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t 
gpa, u32 error_code,
 
r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
-   if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+   if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
r = make_mmu_pages_available(vcpu);
if (r)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 50e268eb8e1a..ab54263d857c 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -869,7 +869,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t 
addr, u32 error_code,
 
r = RET_PF_RETRY;
spin_lock(&vcpu->kvm->mmu_lock);
-   if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
+   if (!is_noslot_pfn(pfn) && mmu_notifier_retry(vcpu->kvm, mmu_seq))
goto out_unlock;
 
kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
-- 
2.30.0.617.g56c4b15f3c-goog



[PATCH v4 0/2] KVM: x86/mmu: Skip mmu_notifier changes when possible

2021-02-22 Thread David Stevens
These patches reduce how often mmu_notifier updates block guest page
faults. The primary benefit of this is the reduction in the likelihood
of extreme latency when handling a page fault due to another thread
having been preempted while modifying host virtual addresses.

v3 -> v4:
 - Fix bug by skipping prefetch during invalidation

v2 -> v3:
 - Added patch to skip check for MMIO page faults
 - Style changes

David Stevens (1):
  KVM: x86/mmu: Consider the hva in mmu_notifier retry

Sean Christopherson (1):
  KVM: x86/mmu: Skip mmu_notifier check when handling MMIO page fault

 arch/powerpc/kvm/book3s_64_mmu_hv.c|  2 +-
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  2 +-
 arch/x86/kvm/mmu/mmu.c | 23 ++--
 arch/x86/kvm/mmu/paging_tmpl.h |  7 ---
 include/linux/kvm_host.h   | 25 +-
 virt/kvm/kvm_main.c| 29 ++
 6 files changed, 72 insertions(+), 16 deletions(-)

-- 
2.30.0.617.g56c4b15f3c-goog



Re: [PATCH v11 01/13] vfio: VFIO_IOMMU_SET_PASID_TABLE

2021-02-22 Thread Auger Eric
Hi Keqian,

On 2/2/21 1:34 PM, Keqian Zhu wrote:
> Hi Eric,
> 
> On 2020/11/16 19:00, Eric Auger wrote:
>> From: "Liu, Yi L" 
>>
>> This patch adds a VFIO_IOMMU_SET_PASID_TABLE ioctl
>> which aims to pass the virtual iommu guest configuration
>> to the host. The latter takes the form of the so-called
>> PASID table.
>>
>> Signed-off-by: Jacob Pan 
>> Signed-off-by: Liu, Yi L 
>> Signed-off-by: Eric Auger 
>>
>> ---
>> v11 -> v12:
>> - use iommu_uapi_set_pasid_table
>> - check SET and UNSET are not set simultaneously (Zenghui)
>>
>> v8 -> v9:
>> - Merge VFIO_IOMMU_ATTACH/DETACH_PASID_TABLE into a single
>>   VFIO_IOMMU_SET_PASID_TABLE ioctl.
>>
>> v6 -> v7:
>> - add a comment related to VFIO_IOMMU_DETACH_PASID_TABLE
>>
>> v3 -> v4:
>> - restore ATTACH/DETACH
>> - add unwind on failure
>>
>> v2 -> v3:
>> - s/BIND_PASID_TABLE/SET_PASID_TABLE
>>
>> v1 -> v2:
>> - s/BIND_GUEST_STAGE/BIND_PASID_TABLE
>> - remove the struct device arg
>> ---
>>  drivers/vfio/vfio_iommu_type1.c | 65 +
>>  include/uapi/linux/vfio.h   | 19 ++
>>  2 files changed, 84 insertions(+)
>>
>> diff --git a/drivers/vfio/vfio_iommu_type1.c 
>> b/drivers/vfio/vfio_iommu_type1.c
>> index 67e827638995..87ddd9e882dc 100644
>> --- a/drivers/vfio/vfio_iommu_type1.c
>> +++ b/drivers/vfio/vfio_iommu_type1.c
>> @@ -2587,6 +2587,41 @@ static int vfio_iommu_iova_build_caps(struct 
>> vfio_iommu *iommu,
>>  return ret;
>>  }
>>  
>> +static void
>> +vfio_detach_pasid_table(struct vfio_iommu *iommu)
>> +{
>> +struct vfio_domain *d;
>> +
>> +mutex_lock(&iommu->lock);
>> +list_for_each_entry(d, &iommu->domain_list, next)
>> +iommu_detach_pasid_table(d->domain);
>> +
>> +mutex_unlock(&iommu->lock);
>> +}
>> +
>> +static int
>> +vfio_attach_pasid_table(struct vfio_iommu *iommu, unsigned long arg)
>> +{
>> +struct vfio_domain *d;
>> +int ret = 0;
>> +
>> +mutex_lock(&iommu->lock);
>> +
>> +list_for_each_entry(d, &iommu->domain_list, next) {
>> +ret = iommu_uapi_attach_pasid_table(d->domain, (void __user 
>> *)arg);
> This design is not very clear to me. It assumes that all iommu_domains share
> the same pasid table.
> 
> As I understand it, that is reasonable when there is only one group in the
> domain, and only one domain in the vfio_iommu.
> If there is more than one group in the vfio_iommu, the guest may put them into
> different guest iommu_domains, and those would then have different pasid tables.
> 
> Is this the intended use scenario?

The vfio_iommu is attached to a container, and all the groups within a
container share the same set of page tables (see
Documentation/driver-api/vfio.rst in the Linux tree). So, as I see it, if
you want to use different pasid tables, the groups need to be attached to
different containers. Does that make sense to you?
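
To make this concrete, a minimal userspace sketch with two groups that
must end up with distinct pasid tables. Group numbers and the iommu type
are illustrative, error handling is omitted, and only the per-container
scope of VFIO_IOMMU_SET_PASID_TABLE is taken from this series:

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/vfio.h>

	int main(void)
	{
		/* One container per pasid table. */
		int cont1 = open("/dev/vfio/vfio", O_RDWR);
		int cont2 = open("/dev/vfio/vfio", O_RDWR);
		int grp1  = open("/dev/vfio/26", O_RDWR);	/* illustrative */
		int grp2  = open("/dev/vfio/27", O_RDWR);	/* illustrative */

		/* Each group joins its own container... */
		ioctl(grp1, VFIO_GROUP_SET_CONTAINER, &cont1);
		ioctl(grp2, VFIO_GROUP_SET_CONTAINER, &cont2);

		/* ...and each container gets its own vfio_iommu. */
		ioctl(cont1, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);
		ioctl(cont2, VFIO_SET_IOMMU, VFIO_TYPE1v2_IOMMU);

		/*
		 * Each container can now be handed its own pasid table via
		 * VFIO_IOMMU_SET_PASID_TABLE (config payload not shown).
		 */
		return 0;
	}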

Thanks

Eric
> 
> Thanks,
> Keqian
> 
>> +if (ret)
>> +goto unwind;
>> +}
>> +goto unlock;
>> +unwind:
>> +list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
>> +iommu_detach_pasid_table(d->domain);
>> +}
>> +unlock:
>> +mutex_unlock(&iommu->lock);
>> +return ret;
>> +}
>> +
>>  static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
>> struct vfio_info_cap *caps)
>>  {
>> @@ -2747,6 +2782,34 @@ static int vfio_iommu_type1_unmap_dma(struct 
>> vfio_iommu *iommu,
>>  -EFAULT : 0;
>>  }
>>  
>> +static int vfio_iommu_type1_set_pasid_table(struct vfio_iommu *iommu,
>> +unsigned long arg)
>> +{
>> +struct vfio_iommu_type1_set_pasid_table spt;
>> +unsigned long minsz;
>> +int ret = -EINVAL;
>> +
>> +minsz = offsetofend(struct vfio_iommu_type1_set_pasid_table, flags);
>> +
>> +if (copy_from_user(&spt, (void __user *)arg, minsz))
>> +return -EFAULT;
>> +
>> +if (spt.argsz < minsz)
>> +return -EINVAL;
>> +
>> +if (spt.flags & VFIO_PASID_TABLE_FLAG_SET &&
>> +spt.flags & VFIO_PASID_TABLE_FLAG_UNSET)
>> +return -EINVAL;
>> +
>> +if (spt.flags & VFIO_PASID_TABLE_FLAG_SET)
>> +ret = vfio_attach_pasid_table(iommu, arg + minsz);
>> +else if (spt.flags & VFIO_PASID_TABLE_FLAG_UNSET) {
>> +vfio_detach_pasid_table(iommu);
>> +ret = 0;
>> +}
>> +return ret;
>> +}
>> +
>>  static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
>>  unsigned long arg)
>>  {
>> @@ -2867,6 +2930,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
>>  return vfio_iommu_type1_unmap_dma(iommu, arg);
>>  case VFIO_IOMMU_DIRTY_PAGES:
>>  return vfio_iommu_type1_dirty_pages(iommu, arg);
>> +case VFIO_IOMMU_SET_PASID_TABLE:
>> +return vfio_iommu_type1_set_pasid_table(iommu, arg);
>>  default:
>>  return -ENOTTY;
>>  }
>> diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
>> index 2f313a238a8f..78ce3ce6c331 100644
>> 

Re: [PATCH kvmtool 01/21] ioport: Remove ioport__setup_arch()

2021-02-22 Thread Andre Przywara
On Wed, 17 Feb 2021 16:46:47 +
Andre Przywara  wrote:

> On Thu, 11 Feb 2021 17:32:01 +
> Alexandru Elisei  wrote:
> 
> Hi,
> 
> > On 2/11/21 5:16 PM, Andre Przywara wrote:  
> > > On Wed, 10 Feb 2021 17:44:59 +
> > > Alexandru Elisei  wrote:
> > >
> > > Hi Alex,
> > >
> > >> On 12/10/20 2:28 PM, Andre Przywara wrote:
> > >>> Since x86 had a special need for registering tons of special I/O ports,
> > >>> we had an ioport__setup_arch() callback, to allow each architecture
> > >>> to do the same. As it turns out no one uses it beside x86, so we remove
> > >>> that unnecessary abstraction.
> > >>>
> > >>> The generic function was registered via a device_base_init() call, so
> > >>> we just do the same for the x86 specific function only, and can remove
> > >>> the unneeded ioport__setup_arch().
> > >>>
> > >>> Signed-off-by: Andre Przywara 
> > >>> ---
> > >>>  arm/ioport.c |  5 -
> > >>>  include/kvm/ioport.h |  1 -
> > >>>  ioport.c | 28 
> > >>>  mips/kvm.c   |  5 -
> > >>>  powerpc/ioport.c |  6 --
> > >>>  x86/ioport.c | 25 -
> > >>>  6 files changed, 24 insertions(+), 46 deletions(-)
> > >>>
> > >>> diff --git a/arm/ioport.c b/arm/ioport.c
> > >>> index 2f0feb9a..24092c9d 100644
> > >>> --- a/arm/ioport.c
> > >>> +++ b/arm/ioport.c
> > >>> @@ -1,11 +1,6 @@
> > >>>  #include "kvm/ioport.h"
> > >>>  #include "kvm/irq.h"
> > >>>  
> > >>> -int ioport__setup_arch(struct kvm *kvm)
> > >>> -{
> > >>> -   return 0;
> > >>> -}
> > >>> -
> > >>>  void ioport__map_irq(u8 *irq)
> > >>>  {
> > >>> *irq = irq__alloc_line();
> > >>> diff --git a/include/kvm/ioport.h b/include/kvm/ioport.h
> > >>> index 039633f7..d0213541 100644
> > >>> --- a/include/kvm/ioport.h
> > >>> +++ b/include/kvm/ioport.h
> > >>> @@ -35,7 +35,6 @@ struct ioport_operations {
> > >>> enum 
> > >>> irq_type));
> > >>>  };
> > >>>  
> > >>> -int ioport__setup_arch(struct kvm *kvm);
> > >>>  void ioport__map_irq(u8 *irq);
> > >>>  
> > >>>  int __must_check ioport__register(struct kvm *kvm, u16 port, struct 
> > >>> ioport_operations *ops,
> > >>> diff --git a/ioport.c b/ioport.c
> > >>> index 844a832d..667e8386 100644
> > >>> --- a/ioport.c
> > >>> +++ b/ioport.c
> > >>> @@ -158,21 +158,6 @@ int ioport__unregister(struct kvm *kvm, u16 port)
> > >>> return 0;
> > >>>  }
> > >>>  
> > >>> -static void ioport__unregister_all(void)
> > >>> -{
> > >>> -   struct ioport *entry;
> > >>> -   struct rb_node *rb;
> > >>> -   struct rb_int_node *rb_node;
> > >>> -
> > >>> -   rb = rb_first(_tree);
> > >>> -   while (rb) {
> > >>> -   rb_node = rb_int(rb);
> > >>> -   entry = ioport_node(rb_node);
> > >>> -   ioport_unregister(_tree, entry);
> > >>> -   rb = rb_first(_tree);
> > >>> -   }
> > >>> -}  
> > >> I get the impression this is a rebasing artifact. The commit message 
> > >> doesn't
> > >> mention anything about removing ioport__exit() -> 
> > >> ioport__unregister_all(), and as
> > >> far as I can tell it's still needed because there are places other than
> > >> ioport__setup_arch() from where ioport__register() is called.
> > > I agree that the commit message is a bit thin on this fact, but the
> > > functionality of ioport__unregister_all() is now in
> > > x86/ioport.c:ioport__remove_arch(). I think removing ioport__init()
> > > without removing ioport__exit() as well would look very weird, if not
> > > hackish.
> > 
> > Not necessarily. ioport__unregister_all() removes the ioports added by
> > x86/ioport.c::ioport__setup_arch(), *plus* ioports added by different 
> > devices,
> > like serial, rtc, virtio-pci and vfio-pci (which are used by arm/arm64).  
> 
> Right, indeed. Not that it really matters, since we are about to exit
> anyway, but it looks indeed I need to move this to a generic teardown
> method, or actually just keep that part here in this file.
> 
> Will give this a try.

Well, having a closer look now, I needed to remove this from here,
because this whole file will go away.
To keep the current functionality, we would need to add it to mmio.c,
and interestingly we don't do any similar cleanup there for the
MMIO regions (probably because kvmtool is exiting anyway, see above).

I will see if I can introduce it there, for good measure.
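
Something along these lines, mirroring the removed
ioport__unregister_all() above (mmio_tree, mmio_node() and
mmio_deregister() are assumed to match mmio.c's internals and may need
adjusting):

	static void mmio__unregister_all(void)
	{
		struct mmio_mapping *entry;
		struct rb_node *rb;
		struct rb_int_node *rb_node;

		/* Pop the first node until the tree is empty. */
		rb = rb_first(&mmio_tree);
		while (rb) {
			rb_node = rb_int(rb);
			entry = mmio_node(rb_node);
			mmio_deregister(NULL, &mmio_tree, entry);
			rb = rb_first(&mmio_tree);
		}
	}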

Cheers,
Andre


> 
> Thanks!
> Andre
> 
> > >
> > > I can amend the commit message to mention this, or is there anything
> > > else I missed?
> > >
> > > Cheers,
> > > Andre
> > >
> > >>> -
> > >>>  static const char *to_direction(int direction)
> > >>>  {
> > >>> if (direction == KVM_EXIT_IO_IN)
> > >>> @@ -220,16 +205,3 @@ out:
> > >>>  
> > >>> return !kvm->cfg.ioport_debug;
> > >>>  }
> > >>> -
> > >>> -int ioport__init(struct kvm *kvm)
> > >>> -{
> > >>> -   return 

Re: [PATCH v7 23/23] [DO NOT MERGE] arm64: Cope with CPUs stuck in VHE mode

2021-02-22 Thread Marc Zyngier

Hi Jonathan,

On 2021-02-22 09:35, Jonathan Neuschäfer wrote:

> Hi,
>
> On Mon, Feb 08, 2021 at 09:57:32AM +, Marc Zyngier wrote:
>> It seems that the CPU known as Apple M1 has the terrible habit
>> of being stuck with HCR_EL2.E2H==1, in violation of the architecture.
>
> Minor nitpick from the sideline: The M1 SoC has two kinds of CPU in it
> (Icestorm and Firestorm), which makes "CPU known as Apple M1" a bit
> imprecise.

Fair enough. How about something along the lines of:
"At least some of the CPUs integrated in the Apple M1 SoC have
 the terrible habit..."

> In practice, though, it seems unlikely that Icestorm and Firestorm
> behave differently with regard to the code in this patch.


This is my hunch as well. And if they did, it shouldn't be a big deal:
the "architecture compliant" CPUs would simply transition via EL1
as expected, and join their buggy friends running at EL2 slightly later.

Thanks,

M.
--
Jazz is not dead. It just smells funny...


RE: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)

2021-02-22 Thread Shameerali Kolothum Thodi


> -Original Message-
> From: Auger Eric [mailto:eric.au...@redhat.com]
> Sent: 21 February 2021 18:21
> To: Shameerali Kolothum Thodi ;
> eric.auger@gmail.com; io...@lists.linux-foundation.org;
> linux-ker...@vger.kernel.org; k...@vger.kernel.org;
> kvmarm@lists.cs.columbia.edu; w...@kernel.org; j...@8bytes.org;
> m...@kernel.org; robin.mur...@arm.com; alex.william...@redhat.com
> Cc: jean-phili...@linaro.org; zhangfei@linaro.org;
> zhangfei@gmail.com; vivek.gau...@arm.com;
> jacob.jun@linux.intel.com; yi.l@intel.com; t...@semihalf.com;
> nicoleots...@gmail.com; yuzenghui ; Zengtao (B)
> ; linux...@openeuler.org
> Subject: Re: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)
> 
> Hi Shameer,
> On 1/8/21 6:05 PM, Shameerali Kolothum Thodi wrote:
> > Hi Eric,
> >
> >> -Original Message-
> >> From: Eric Auger [mailto:eric.au...@redhat.com]
> >> Sent: 18 November 2020 11:22
> >> To: eric.auger@gmail.com; eric.au...@redhat.com;
> >> io...@lists.linux-foundation.org; linux-ker...@vger.kernel.org;
> >> k...@vger.kernel.org; kvmarm@lists.cs.columbia.edu; w...@kernel.org;
> >> j...@8bytes.org; m...@kernel.org; robin.mur...@arm.com;
> >> alex.william...@redhat.com
> >> Cc: jean-phili...@linaro.org; zhangfei@linaro.org;
> >> zhangfei@gmail.com; vivek.gau...@arm.com; Shameerali Kolothum
> >> Thodi ;
> >> jacob.jun@linux.intel.com; yi.l@intel.com; t...@semihalf.com;
> >> nicoleots...@gmail.com; yuzenghui 
> >> Subject: [PATCH v13 00/15] SMMUv3 Nested Stage Setup (IOMMU part)
> >>
> >> This series brings the IOMMU part of HW nested paging support
> >> in the SMMUv3. The VFIO part is submitted separately.
> >>
> >> The IOMMU API is extended to support 2 new API functionalities:
> >> 1) pass the guest stage 1 configuration
> >> 2) pass stage 1 MSI bindings
> >>
> >> Then those capabilities gets implemented in the SMMUv3 driver.
> >>
> >> The virtualizer passes information through the VFIO user API
> >> which cascades them to the iommu subsystem. This allows the guest
> >> to own stage 1 tables and context descriptors (so-called PASID
> >> table) while the host owns stage 2 tables and main configuration
> >> structures (STE).
> >
> > I am seeing an issue with a guest testpmd run with this series.
> > I have two different setups and testpmd works fine with the
> > first one but not with the second.
> >
> > 1). Guest doesn't have kernel driver built-in for pass-through dev.
> >
> > root@ubuntu:/# lspci -v
> > ...
> > 00:02.0 Ethernet controller: Huawei Technologies Co., Ltd. Device a22e (rev
> 21)
> > Subsystem: Huawei Technologies Co., Ltd. Device 
> > Flags: fast devsel
> > Memory at 800010 (64-bit, prefetchable) [disabled] [size=64K]
> > Memory at 80 (64-bit, prefetchable) [disabled] [size=1M]
> > Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
> > Capabilities: [a0] MSI-X: Enable- Count=67 Masked-
> > Capabilities: [b0] Power Management version 3
> > Capabilities: [100] Access Control Services
> > Capabilities: [300] Transaction Processing Hints
> >
> > root@ubuntu:/# echo vfio-pci >
> /sys/bus/pci/devices/:00:02.0/driver_override
> > root@ubuntu:/# echo :00:02.0 > /sys/bus/pci/drivers_probe
> >
> > root@ubuntu:/mnt/dpdk/build/app# ./testpmd -w :00:02.0 --file-prefix
> socket0  -l 0-1 -n 2 -- -i
> > EAL: Detected 8 lcore(s)
> > EAL: Detected 1 NUMA nodes
> > EAL: Multi-process socket /var/run/dpdk/socket0/mp_socket
> > EAL: Selected IOVA mode 'VA'
> > EAL: No available hugepages reported in hugepages-32768kB
> > EAL: No available hugepages reported in hugepages-64kB
> > EAL: No available hugepages reported in hugepages-1048576kB
> > EAL: Probing VFIO support...
> > EAL: VFIO support initialized
> > EAL:   Invalid NUMA socket, default to 0
> > EAL:   using IOMMU type 1 (Type 1)
> > EAL: Probe PCI driver: net_hns3_vf (19e5:a22e) device: :00:02.0 (socket
> 0)
> > EAL: No legacy callbacks, legacy socket not created
> > Interactive-mode selected
> > testpmd: create a new mbuf pool : n=155456,
> size=2176, socket=0
> > testpmd: preferred mempool ops selected: ring_mp_mc
> >
> > Warning! port-topology=paired and odd forward ports number, the last port
> will pair with itself.
> >
> > Configuring Port 0 (socket 0)
> > Port 0: 8E:A6:8C:43:43:45
> > Checking link statuses...
> > Done
> > testpmd>
> >
> > 2). Guest have kernel driver built-in for pass-through dev.
> >
> > root@ubuntu:/# lspci -v
> > ...
> > 00:02.0 Ethernet controller: Huawei Technologies Co., Ltd. Device a22e (rev
> 21)
> > Subsystem: Huawei Technologies Co., Ltd. Device 
> > Flags: bus master, fast devsel, latency 0
> > Memory at 800010 (64-bit, prefetchable) [size=64K]
> > Memory at 80 (64-bit, prefetchable) [size=1M]
> > Capabilities: [40] Express Root Complex Integrated Endpoint, MSI 00
> > Capabilities: [a0] MSI-X: Enable+ Count=67 Masked-
> > Capabilities: [b0] Power Management version 3
> > Capabilities: [100] Access Control