Re: [RFC v3 02/21] iommu: Introduce cache_invalidate API

2019-01-29 Thread Alex Williamson
On Fri, 25 Jan 2019 17:49:20 +0100
Auger Eric  wrote:

> Hi Alex,
> 
> On 1/11/19 10:30 PM, Alex Williamson wrote:
> > On Tue,  8 Jan 2019 11:26:14 +0100
> > Eric Auger  wrote:
> >   
> >> From: "Liu, Yi L" 
> >>
> >> In any virtualization use case, when the first translation stage
> >> is "owned" by the guest OS, the host IOMMU driver has no knowledge
> >> of caching structure updates unless the guest invalidation activities
> >> are trapped by the virtualizer and passed down to the host.
> >>
> >> Since the invalidation data are obtained from user space and will be
> >> written into physical IOMMU, we must allow security check at various
> >> layers. Therefore, generic invalidation data format are proposed here,
> >> model specific IOMMU drivers need to convert them into their own format.
> >>
> >> Signed-off-by: Liu, Yi L 
> >> Signed-off-by: Jean-Philippe Brucker 
> >> Signed-off-by: Jacob Pan 
> >> Signed-off-by: Ashok Raj 
> >> Signed-off-by: Eric Auger 
> >>
> >> ---
> >> v1 -> v2:
> >> - add arch_id field
> >> - renamed tlb_invalidate into cache_invalidate as this API allows
> >>   to invalidate context caches on top of IOTLBs
> >>
> >> v1:
> >> renamed sva_invalidate into tlb_invalidate and add iommu_ prefix in
> >> header. Commit message reworded.
> >> ---
> >>  drivers/iommu/iommu.c  | 14 ++
> >>  include/linux/iommu.h  | 14 ++
> >>  include/uapi/linux/iommu.h | 95 ++
> >>  3 files changed, 123 insertions(+)
> >>
> >> diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> >> index 0f2b7f1fc7c8..b2e248770508 100644
> >> --- a/drivers/iommu/iommu.c
> >> +++ b/drivers/iommu/iommu.c
> >> @@ -1403,6 +1403,20 @@ int iommu_set_pasid_table(struct iommu_domain 
> >> *domain,
> >>  }
> >>  EXPORT_SYMBOL_GPL(iommu_set_pasid_table);
> >>  
> >> +int iommu_cache_invalidate(struct iommu_domain *domain, struct device 
> >> *dev,
> >> + struct iommu_cache_invalidate_info *inv_info)
> >> +{
> >> +  int ret = 0;
> >> +
> >> +  if (unlikely(!domain->ops->cache_invalidate))
> >> +  return -ENODEV;
> >> +
> >> +  ret = domain->ops->cache_invalidate(domain, dev, inv_info);
> >> +
> >> +  return ret;
> >> +}
> >> +EXPORT_SYMBOL_GPL(iommu_cache_invalidate);
> >> +
> >>  static void __iommu_detach_device(struct iommu_domain *domain,
> >>  struct device *dev)
> >>  {
> >> diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> >> index 1da2a2357ea4..96d59886f230 100644
> >> --- a/include/linux/iommu.h
> >> +++ b/include/linux/iommu.h
> >> @@ -186,6 +186,7 @@ struct iommu_resv_region {
> >>   * @of_xlate: add OF master IDs to iommu grouping
> >>   * @pgsize_bitmap: bitmap of all possible supported page sizes
> >>   * @set_pasid_table: set pasid table
> >> + * @cache_invalidate: invalidate translation caches
> >>   */
> >>  struct iommu_ops {
> >>bool (*capable)(enum iommu_cap);
> >> @@ -231,6 +232,9 @@ struct iommu_ops {
> >>int (*set_pasid_table)(struct iommu_domain *domain,
> >>   struct iommu_pasid_table_config *cfg);
> >>  
> >> +  int (*cache_invalidate)(struct iommu_domain *domain, struct device *dev,
> >> +  struct iommu_cache_invalidate_info *inv_info);
> >> +
> >>unsigned long pgsize_bitmap;
> >>  };
> >>  
> >> @@ -294,6 +298,9 @@ extern void iommu_detach_device(struct iommu_domain 
> >> *domain,
> >>struct device *dev);
> >>  extern int iommu_set_pasid_table(struct iommu_domain *domain,
> >> struct iommu_pasid_table_config *cfg);
> >> +extern int iommu_cache_invalidate(struct iommu_domain *domain,
> >> +  struct device *dev,
> >> +  struct iommu_cache_invalidate_info *inv_info);
> >>  extern struct iommu_domain *iommu_get_domain_for_dev(struct device *dev);
> >>  extern struct iommu_domain *iommu_get_dma_domain(struct device *dev);
> >>  extern int iommu_map(struct iommu_domain *domain, unsigned long iova,
> >> @@ -709,6 +716,13 @@ int iommu_set_pasid_table(struct iommu_domain *domain,
> >>  {
> >>return -ENODEV;
> >>  }
> >> +static inline int
> >> +iommu_cache_invalidate(struct iommu_domain *domain,
> >> + struct device *dev,
> >> + struct iommu_cache_invalidate_info *inv_info)
> >> +{
> >> +  return -ENODEV;
> >> +}
> >>  
> >>  #endif /* CONFIG_IOMMU_API */
> >>  
> >> diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
> >> index 7a7cf7a3de7c..4605f5cfac84 100644
> >> --- a/include/uapi/linux/iommu.h
> >> +++ b/include/uapi/linux/iommu.h
> >> @@ -47,4 +47,99 @@ struct iommu_pasid_table_config {
> >>};
> >>  };
> >>  
> >> +/**
> >> + * enum iommu_inv_granularity - Generic invalidation granularity
> >> + * @IOMMU_INV_GRANU_DOMAIN_ALL_PASID: TLB entries or PASID caches of 
> >> all
> >> + *PASIDs associated with a domain 
> >> ID

Re: [PATCH 1/2] KVM: arm/arm64: Add save/restore support for firmware workaround state

2019-01-29 Thread Dave Martin
On Fri, Jan 25, 2019 at 02:46:57PM +, Andre Przywara wrote:
> On Tue, 22 Jan 2019 15:17:14 +
> Dave Martin  wrote:
> 
> Hi Dave,
> 
> thanks for having a look!
> 
> > On Mon, Jan 07, 2019 at 12:05:36PM +, Andre Przywara wrote:
> > > KVM implements the firmware interface for mitigating cache
> > > speculation vulnerabilities. Guests may use this interface to
> > > ensure mitigation is active.
> > > If we want to migrate such a guest to a host with a different
> > > support level for those workarounds, migration might need to fail,
> > > to ensure that critical guests don't lose their protection.
> > > 
> > > Introduce a way for userland to save and restore the workarounds
> > > state. On restoring we do checks that make sure we don't downgrade
> > > our mitigation level.
> > > 
> > > Signed-off-by: Andre Przywara 
> > > ---
> > >  arch/arm/include/asm/kvm_emulate.h   |  10 ++
> > >  arch/arm/include/uapi/asm/kvm.h  |   9 ++
> > >  arch/arm64/include/asm/kvm_emulate.h |  14 +++
> > >  arch/arm64/include/uapi/asm/kvm.h|   9 ++
> > >  virt/kvm/arm/psci.c  | 138
> > > ++- 5 files changed, 178 insertions(+), 2
> > > deletions(-)
> > > 
> > > diff --git a/arch/arm/include/asm/kvm_emulate.h
> > > b/arch/arm/include/asm/kvm_emulate.h index
> > > 77121b713bef..2255c50debab 100644 ---
> > > a/arch/arm/include/asm/kvm_emulate.h +++
> > > b/arch/arm/include/asm/kvm_emulate.h @@ -275,6 +275,16 @@ static
> > > inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
> > > return vcpu_cp15(vcpu, c0_MPIDR) & MPIDR_HWID_BITMASK; }
> > >  
> > > +static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct
> > > kvm_vcpu *vcpu) +{
> > > + return false;
> > > +}
> > > +
> > > +static inline void kvm_arm_set_vcpu_workaround_2_flag(struct
> > > kvm_vcpu *vcpu,
> > > +   bool flag)
> > > +{
> > > +}
> > > +
> > >  static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
> > >  {
> > >   *vcpu_cpsr(vcpu) |= PSR_E_BIT;
> > > diff --git a/arch/arm/include/uapi/asm/kvm.h
> > > b/arch/arm/include/uapi/asm/kvm.h index 4602464ebdfb..02c93b1d8f6d
> > > 100644 --- a/arch/arm/include/uapi/asm/kvm.h
> > > +++ b/arch/arm/include/uapi/asm/kvm.h
> > > @@ -214,6 +214,15 @@ struct kvm_vcpu_events {
> > >  #define KVM_REG_ARM_FW_REG(r)(KVM_REG_ARM |
> > > KVM_REG_SIZE_U64 | \ KVM_REG_ARM_FW | ((r) & 0x))
> > >  #define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0)
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1
> > > KVM_REG_ARM_FW_REG(1) +#define
> > > KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 +#define
> > > KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 +#define
> > > KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2   KVM_REG_ARM_FW_REG(2)
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_MASK 0x3
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL0
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL1
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNAFFECTED   2
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED  4 
> > >  /* Device Control API: ARM VGIC */
> > >  #define KVM_DEV_ARM_VGIC_GRP_ADDR0
> > > diff --git a/arch/arm64/include/asm/kvm_emulate.h
> > > b/arch/arm64/include/asm/kvm_emulate.h index
> > > 506386a3edde..a44f07f68da4 100644 ---
> > > a/arch/arm64/include/asm/kvm_emulate.h +++
> > > b/arch/arm64/include/asm/kvm_emulate.h @@ -336,6 +336,20 @@ static
> > > inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
> > > return vcpu_read_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; }
> > >  
> > > +static inline bool kvm_arm_get_vcpu_workaround_2_flag(struct
> > > kvm_vcpu *vcpu) +{
> > > + return vcpu->arch.workaround_flags &
> > > VCPU_WORKAROUND_2_FLAG; +}
> > > +
> > > +static inline void kvm_arm_set_vcpu_workaround_2_flag(struct
> > > kvm_vcpu *vcpu,
> > > +   bool flag)
> > > +{
> > > + if (flag)
> > > + vcpu->arch.workaround_flags |=
> > > VCPU_WORKAROUND_2_FLAG;
> > > + else
> > > + vcpu->arch.workaround_flags &=
> > > ~VCPU_WORKAROUND_2_FLAG; +}
> > > +
> > >  static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)
> > >  {
> > >   if (vcpu_mode_is_32bit(vcpu)) {
> > > diff --git a/arch/arm64/include/uapi/asm/kvm.h
> > > b/arch/arm64/include/uapi/asm/kvm.h index
> > > 97c3478ee6e7..4a19ef199a99 100644 ---
> > > a/arch/arm64/include/uapi/asm/kvm.h +++
> > > b/arch/arm64/include/uapi/asm/kvm.h @@ -225,6 +225,15 @@ struct
> > > kvm_vcpu_events { #define KVM_REG_ARM_FW_REG(r)
> > > (KVM_REG_ARM64 | KVM_REG_SIZE_U64 | \ KVM_REG_ARM_FW | ((r) &
> > > 0x)) #define KVM_REG_ARM_PSCI_VERSION
> > > KVM_REG_ARM_FW_REG(0) +#define
> > > KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1   KVM_REG_ARM_FW_REG(1)
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL0
> > > +#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL1
> > > +#define 

[PATCH] kvm: arm64: Relax the restriction on using stage2 PUD huge mapping

2019-01-29 Thread Suzuki K Poulose
We restrict mapping the PUD huge pages in stage2 to only when the
stage2 has 4 level page table, leaving the feature unused with
the default IPA size. But we could use it even with a 3
level page table, i.e, when the PUD level is folded into PGD,
just like the stage1. Relax the condition to allow using the
PUD huge page mappings at stage2 when it is possible.

Cc: Christoffer Dall 
Cc: Marc Zyngier 
Signed-off-by: Suzuki K Poulose 
---
 virt/kvm/arm/mmu.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fbdf3ac..30251e2 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -1695,11 +1695,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, 
phys_addr_t fault_ipa,
 
vma_pagesize = vma_kernel_pagesize(vma);
/*
-* PUD level may not exist for a VM but PMD is guaranteed to
-* exist.
+* The stage2 has a minimum of 2 level table (For arm64 see
+* kvm_arm_setup_stage2()). Hence, we are guaranteed that we can
+* use PMD_SIZE huge mappings (even when the PMD is folded into PGD).
+* As for PUD huge maps, we must make sure that we have at least
+* 3 levels, i.e, PMD is not folded.
 */
if ((vma_pagesize == PMD_SIZE ||
-(vma_pagesize == PUD_SIZE && kvm_stage2_has_pud(kvm))) &&
+(vma_pagesize == PUD_SIZE && kvm_stage2_has_pmd(kvm))) &&
!force_pte) {
gfn = (fault_ipa & huge_page_mask(hstate_vma(vma))) >> 
PAGE_SHIFT;
}
-- 
2.7.4

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v7 0/7] Add virtio-iommu driver

2019-01-29 Thread Michael S. Tsirkin
On Mon, Jan 21, 2019 at 11:29:05AM +, Jean-Philippe Brucker wrote:
> Hi,
> 
> On 18/01/2019 15:51, Michael S. Tsirkin wrote:
> > 
> > On Tue, Jan 15, 2019 at 12:19:52PM +, Jean-Philippe Brucker wrote:
> >> Implement the virtio-iommu driver, following specification v0.9 [1].
> >>
> >> This is a simple rebase onto Linux v5.0-rc2. We now use the
> >> dev_iommu_fwspec_get() helper introduced in v5.0 instead of accessing
> >> dev->iommu_fwspec, but there aren't any functional change from v6 [2].
> >>
> >> Our current goal for virtio-iommu is to get a paravirtual IOMMU working
> >> on Arm, and enable device assignment to guest userspace. In this
> >> use-case the mappings are static, and don't require optimal performance,
> >> so this series tries to keep things simple. However there is plenty more
> >> to do for features and optimizations, and having this base in v5.1 would
> >> be good. Given that most of the changes are to drivers/iommu, I believe
> >> the driver and future changes should go via the IOMMU tree.
> >>
> >> You can find Linux driver and kvmtool device on v0.9.2 branches [3],
> >> module and x86 support on virtio-iommu/devel. Also tested with Eric's
> >> QEMU device [4]. Please note that the series depends on Robin's
> >> probe-deferral fix [5], which will hopefully land in v5.0.
> >>
> >> [1] Virtio-iommu specification v0.9, sources and pdf
> >> git://linux-arm.org/virtio-iommu.git virtio-iommu/v0.9
> >> http://jpbrucker.net/virtio-iommu/spec/v0.9/virtio-iommu-v0.9.pdf
> >>
> >> [2] [PATCH v6 0/7] Add virtio-iommu driver
> >> 
> >> https://lists.linuxfoundation.org/pipermail/iommu/2018-December/032127.html
> >>
> >> [3] git://linux-arm.org/linux-jpb.git virtio-iommu/v0.9.2
> >> git://linux-arm.org/kvmtool-jpb.git virtio-iommu/v0.9.2
> >>
> >> [4] [RFC v9 00/17] VIRTIO-IOMMU device
> >> https://www.mail-archive.com/qemu-devel@nongnu.org/msg575578.html
> >>
> >> [5] [PATCH] iommu/of: Fix probe-deferral
> >> https://www.spinics.net/lists/arm-kernel/msg698371.html
> > 
> > Thanks for the work!
> > So really my only issue with this is that there's no
> > way for the IOMMU to describe the devices that it
> > covers.
> > 
> > As a result that is then done in a platform-specific way.
> > 
> > And this means that for example it does not solve the problem that e.g.
> > some power people have in that their platform simply does not have a way
> > to specify which devices are covered by the IOMMU.
> 
> Isn't power using device tree? I haven't looked much at power because I
> was told a while ago that they already paravirtualize their IOMMU and
> don't need virtio-iommu, except perhaps for some legacy platforms. Or
> something along those lines. But I would certainly be interested in
> enabling the IOMMU for more architectures.

I have CC'd the relevant ppc developers, let's see what do they think.


> As for the enumeration problem, I still don't think we can get much
> better than DT and ACPI as solutions (and IMO they are necessary to make
> this device portable). But I believe that getting DT and ACPI support is
> just a one-off inconvenience. That is, once the required bindings are
> accepted, any future extension can then be done at the virtio level with
> feature bits and probe requests, without having to update ACPI or DT.
> 
> Thanks,
> Jean
> 
> > Solving that problem would make me much more excited about
> > this device.
> > 
> > On the other hand I can see that while there have been some
> > developments most of the code has been stable for quite a while now.
> > 
> > So what I am trying to do right about now, is making a small module that
> > loads early and pokes at the IOMMU sufficiently to get the data about
> > which devices use the IOMMU out of it using standard virtio config
> > space.  IIUC it's claimed to be impossible without messy changes to the
> > boot sequence.
> > 
> > If I succeed at least on some platforms I'll ask that this design is
> > worked into this device, minimizing info that goes through DT/ACPI.  If
> > I see I can't make it in time to meet the next merge window, I plan
> > merging the existing patches using DT (barring surprises).
> > 
> > As I only have a very small amount of time to spend on this attempt, If
> > someone else wants to try doing that in parallel, that would be great!
> > 
> > 
> >> Jean-Philippe Brucker (7):
> >>   dt-bindings: virtio-mmio: Add IOMMU description
> >>   dt-bindings: virtio: Add virtio-pci-iommu node
> >>   of: Allow the iommu-map property to omit untranslated devices
> >>   PCI: OF: Initialize dev->fwnode appropriately
> >>   iommu: Add virtio-iommu driver
> >>   iommu/virtio: Add probe request
> >>   iommu/virtio: Add event queue
> >>
> >>  .../devicetree/bindings/virtio/iommu.txt  |   66 +
> >>  .../devicetree/bindings/virtio/mmio.txt   |   30 +
> >>  MAINTAINERS   |7 +
> >>  drivers/iommu/Kconfig |   11 +
> >>  drivers/iommu/Makefile   

[PATCH v8 19/26] ACPI / APEI: Split ghes_read_estatus() to allow a peek at the CPER length

2019-01-29 Thread James Morse
ghes_read_estatus() reads the record address, then the record's
header, then performs some sanity checks before reading the
records into the provided estatus buffer.

To provide this estatus buffer the caller must know the size of the
records in advance, or always provide a worst-case sized buffer as
happens today for the non-NMI notifications.

Add a function to peek at the record's header to find the size. This
will let the NMI path allocate the right amount of memory before reading
the records, instead of using the worst-case size, and having to copy
the records.

Split ghes_read_estatus() to create __ghes_peek_estatus() which
returns the address and size of the CPER records.

Signed-off-by: James Morse 

Changes since v7:
 * Grammar
 * consistent argument ordering

Changes since v6:
 * Additional buf_addr = 0 error handling
 * Moved checking out of peek-estatus
 * Reworded an error message so we can tell them apart
---
 drivers/acpi/apei/ghes.c | 40 +---
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 9391fff71344..12375a82fa03 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -317,12 +317,12 @@ int __ghes_check_estatus(struct ghes *ghes,
return 0;
 }
 
-static int ghes_read_estatus(struct ghes *ghes,
-struct acpi_hest_generic_status *estatus,
-u64 *buf_paddr, enum fixed_addresses fixmap_idx)
+/* Read the CPER block, returning its address, and header in estatus. */
+static int __ghes_peek_estatus(struct ghes *ghes,
+  struct acpi_hest_generic_status *estatus,
+  u64 *buf_paddr, enum fixed_addresses fixmap_idx)
 {
struct acpi_hest_generic *g = ghes->generic;
-   u32 len;
int rc;
 
	rc = apei_read(buf_paddr, &g->error_status_address);
@@ -343,14 +343,14 @@ static int ghes_read_estatus(struct ghes *ghes,
return -ENOENT;
}
 
-   rc = __ghes_check_estatus(ghes, estatus);
-   if (rc)
-   return rc;
+   return __ghes_check_estatus(ghes, estatus);
+}
 
-   len = cper_estatus_len(estatus);
-   ghes_copy_tofrom_phys(estatus + 1,
- *buf_paddr + sizeof(*estatus),
- len - sizeof(*estatus), 1, fixmap_idx);
+static int __ghes_read_estatus(struct acpi_hest_generic_status *estatus,
+  u64 buf_paddr, enum fixed_addresses fixmap_idx,
+  size_t buf_len)
+{
+   ghes_copy_tofrom_phys(estatus, buf_paddr, buf_len, 1, fixmap_idx);
if (cper_estatus_check(estatus)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Failed to read error status block!\n");
@@ -360,6 +360,24 @@ static int ghes_read_estatus(struct ghes *ghes,
return 0;
 }
 
+static int ghes_read_estatus(struct ghes *ghes,
+struct acpi_hest_generic_status *estatus,
+u64 *buf_paddr, enum fixed_addresses fixmap_idx)
+{
+   int rc;
+
+   rc = __ghes_peek_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+   if (rc)
+   return rc;
+
+   rc = __ghes_check_estatus(ghes, estatus);
+   if (rc)
+   return rc;
+
+   return __ghes_read_estatus(estatus, *buf_paddr, fixmap_idx,
+  cper_estatus_len(estatus));
+}
+
 static void ghes_clear_estatus(struct ghes *ghes,
   struct acpi_hest_generic_status *estatus,
   u64 buf_paddr, enum fixed_addresses fixmap_idx)
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 23/26] ACPI / APEI: Kick the memory_failure() queue for synchronous errors

2019-01-29 Thread James Morse
memory_failure() offlines or repairs pages of memory that have been
discovered to be corrupt. These may be detected by an external
component, (e.g. the memory controller), and notified via an IRQ.
In this case the work is queued as not all of memory_failure()s work
can happen in IRQ context.

If the error was detected as a result of user-space accessing a
corrupt memory location the CPU may take an abort instead. On arm64
this is a 'synchronous external abort', and on a firmware first
system it is replayed using NOTIFY_SEA.

This notification has NMI like properties, (it can interrupt
IRQ-masked code), so the memory_failure() work is queued. If we
return to user-space before the queued memory_failure() work is
processed, we will take the fault again. This loop may cause platform
firmware to exceed some threshold and reboot when Linux could have
recovered from this error.

For NMI-like notifications, keep track of whether memory_failure() work
was queued, and make task_work pending to flush out the queue.
To save memory allocations, the task_work is allocated as part of
the ghes_estatus_node, and free()ing it back to the pool is deferred.

Signed-off-by: James Morse 

---
current->mm == &init_mm ? I couldn't find a helper for this.
The intent is not to set TIF flags on kernel threads. What happens
if a kernel-thread takes one of these? It's just one of the many
not-handled-very-well cases we have already, as memory_failure()
puts it: "try to be lucky".

I assume that if NOTIFY_NMI is coming from SMM it must suffer from
this problem too.

Changes since v7:
 * Don't allocate memory, stuff it in estatus_node. This means passing
   back a 'queued' flag to ghes_proc_in_irq() which holds the estatus_node.
---
 drivers/acpi/apei/ghes.c | 68 +---
 include/acpi/ghes.h  |  3 ++
 2 files changed, 60 insertions(+), 11 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index e6f0d176b245..dfa8f155f964 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -47,6 +47,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -399,23 +400,46 @@ static void ghes_clear_estatus(struct ghes *ghes,
ghes_ack_error(ghes->generic_v2);
 }
 
-static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, 
int sev)
+/*
+ * Called as task_work before returning to user-space.
+ * Ensure any queued work has been done before we return to the context that
+ * triggered the notification.
+ */
+static void ghes_kick_task_work(struct callback_head *head)
+{
+   struct acpi_hest_generic_status *estatus;
+   struct ghes_estatus_node *estatus_node;
+   u32 node_len;
+
+   estatus_node = container_of(head, struct ghes_estatus_node, task_work);
+   memory_failure_queue_kick(estatus_node->task_work_cpu);
+
+   estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
+   node_len = GHES_ESTATUS_NODE_LEN(cper_estatus_len(estatus));
+   gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
+}
+
+static bool ghes_handle_memory_failure(struct ghes *ghes,
+  struct acpi_hest_generic_data *gdata,
+  int sev)
 {
-#ifdef CONFIG_ACPI_APEI_MEMORY_FAILURE
unsigned long pfn;
int flags = -1;
int sec_sev = ghes_severity(gdata->error_severity);
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
 
+   if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
+   return false;
+
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
-   return;
+   return false;
 
pfn = mem_err->physical_addr >> PAGE_SHIFT;
if (!pfn_valid(pfn)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Invalid address in generic error data: %#llx\n",
mem_err->physical_addr);
-   return;
+   return false;
}
 
/* iff following two events can be handled properly by now */
@@ -425,9 +449,12 @@ static void ghes_handle_memory_failure(struct 
acpi_hest_generic_data *gdata, int
if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
flags = 0;
 
-   if (flags != -1)
+   if (flags != -1) {
memory_failure_queue(pfn, flags);
-#endif
+   return true;
+   }
+
+   return false;
 }
 
 /*
@@ -475,11 +502,12 @@ static void ghes_handle_aer(struct acpi_hest_generic_data 
*gdata)
 #endif
 }
 
-static void ghes_do_proc(struct ghes *ghes,
+static bool ghes_do_proc(struct ghes *ghes,
 const struct acpi_hest_generic_status *estatus)
 {
int sev, sec_sev;
struct acpi_hest_generic_data *gdata;
+   bool work_queued = false;
guid_t *sec_type;
	guid_t *fru_id = &NULL_UUID_LE;
char *fru_text = "";
@@ -500,7 +528,8 @@ static void ghes_do_proc(struct ghes *ghes,
  

[PATCH v8 18/26] ACPI / APEI: Make GHES estatus header validation more user friendly

2019-01-29 Thread James Morse
ghes_read_estatus() checks various lengths in the top-level header to
ensure the CPER records to be read aren't obviously corrupt.

Take the opportunity to make this more user-friendly, printing a
(ratelimited) message about the nature of the header format error.

Suggested-by: Borislav Petkov 
Signed-off-by: James Morse 
---
 drivers/acpi/apei/ghes.c | 46 
 1 file changed, 32 insertions(+), 14 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f95db2398dd5..9391fff71344 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -293,6 +293,30 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, 
u32 len,
}
 }
 
+/* Check the top-level record header has an appropriate size. */
+int __ghes_check_estatus(struct ghes *ghes,
+struct acpi_hest_generic_status *estatus)
+{
+   u32 len = cper_estatus_len(estatus);
+
+   if (len < sizeof(*estatus)) {
+   pr_warn_ratelimited(FW_WARN GHES_PFX "Truncated error status 
block!\n");
+   return -EIO;
+   }
+
+   if (len > ghes->generic->error_block_length) {
+   pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid error status 
block length!\n");
+   return -EIO;
+   }
+
+   if (cper_estatus_check_header(estatus)) {
+   pr_warn_ratelimited(FW_WARN GHES_PFX "Invalid CPER header!\n");
+   return -EIO;
+   }
+
+   return 0;
+}
+
 static int ghes_read_estatus(struct ghes *ghes,
 struct acpi_hest_generic_status *estatus,
 u64 *buf_paddr, enum fixed_addresses fixmap_idx)
@@ -319,27 +343,21 @@ static int ghes_read_estatus(struct ghes *ghes,
return -ENOENT;
}
 
-   rc = -EIO;
+   rc = __ghes_check_estatus(ghes, estatus);
+   if (rc)
+   return rc;
+
len = cper_estatus_len(estatus);
-   if (len < sizeof(*estatus))
-   goto err_read_block;
-   if (len > ghes->generic->error_block_length)
-   goto err_read_block;
-   if (cper_estatus_check_header(estatus))
-   goto err_read_block;
ghes_copy_tofrom_phys(estatus + 1,
  *buf_paddr + sizeof(*estatus),
  len - sizeof(*estatus), 1, fixmap_idx);
-   if (cper_estatus_check(estatus))
-   goto err_read_block;
-   rc = 0;
-
-err_read_block:
-   if (rc)
+   if (cper_estatus_check(estatus)) {
pr_warn_ratelimited(FW_WARN GHES_PFX
"Failed to read error status block!\n");
+   return -EIO;
+   }
 
-   return rc;
+   return 0;
 }
 
 static void ghes_clear_estatus(struct ghes *ghes,
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 25/26] firmware: arm_sdei: Add ACPI GHES registration helper

2019-01-29 Thread James Morse
APEI's Generic Hardware Error Source structures do not describe
whether the SDEI event is shared or private, as this information is
discoverable via the API.

GHES needs to know whether an event is normal or critical to avoid
sharing locks or fixmap entries, but GHES shouldn't have to know about
the SDEI API.

Add a helper to register the GHES using the appropriate normal or
critical callback.

Signed-off-by: James Morse 
Acked-by: Catalin Marinas 

---
Changes since v4:
 * Moved normal/critical callbacks into the helper, as APEI needs to know.
 * Tinkered with the commit message.
 * Dropped Punit's Reviewed-by.

Changes since v3:
 * Removed acpi_disabled() checks that aren't necessary after v2s #ifdef
   change.

Changes since v2:
 * Added header file, thanks kbuild-robot!
 * changed ifdef to the GHES version to match the fixmap definition

Changes since v1:
 * ghes->fixmap_idx variable rename
---
 arch/arm64/include/asm/fixmap.h |  4 ++
 drivers/firmware/arm_sdei.c | 68 +
 include/linux/arm_sdei.h|  6 +++
 3 files changed, 78 insertions(+)

diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index 966dd4bb23f2..f987b8a8f325 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -56,6 +56,10 @@ enum fixed_addresses {
/* Used for GHES mapping from assorted contexts */
FIX_APEI_GHES_IRQ,
FIX_APEI_GHES_SEA,
+#ifdef CONFIG_ARM_SDE_INTERFACE
+   FIX_APEI_GHES_SDEI_NORMAL,
+   FIX_APEI_GHES_SDEI_CRITICAL,
+#endif
 #endif /* CONFIG_ACPI_APEI_GHES */
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c
index c64c7da73829..e6376f985ef7 100644
--- a/drivers/firmware/arm_sdei.c
+++ b/drivers/firmware/arm_sdei.c
@@ -2,6 +2,7 @@
 // Copyright (C) 2017 Arm Ltd.
 #define pr_fmt(fmt) "sdei: " fmt
 
+#include 
 #include 
 #include 
 #include 
@@ -887,6 +888,73 @@ static void sdei_smccc_hvc(unsigned long function_id,
arm_smccc_hvc(function_id, arg0, arg1, arg2, arg3, arg4, 0, 0, res);
 }
 
+int sdei_register_ghes(struct ghes *ghes, sdei_event_callback *normal_cb,
+  sdei_event_callback *critical_cb)
+{
+   int err;
+   u64 result;
+   u32 event_num;
+   sdei_event_callback *cb;
+
+   if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+   return -EOPNOTSUPP;
+
+   event_num = ghes->generic->notify.vector;
+   if (event_num == 0) {
+   /*
+* Event 0 is reserved by the specification for
+* SDEI_EVENT_SIGNAL.
+*/
+   return -EINVAL;
+   }
+
+   err = sdei_api_event_get_info(event_num, SDEI_EVENT_INFO_EV_PRIORITY,
+ &result);
+   if (err)
+   return err;
+
+   if (result == SDEI_EVENT_PRIORITY_CRITICAL)
+   cb = critical_cb;
+   else
+   cb = normal_cb;
+
+   err = sdei_event_register(event_num, cb, ghes);
+   if (!err)
+   err = sdei_event_enable(event_num);
+
+   return err;
+}
+
+int sdei_unregister_ghes(struct ghes *ghes)
+{
+   int i;
+   int err;
+   u32 event_num = ghes->generic->notify.vector;
+
+   might_sleep();
+
+   if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+   return -EOPNOTSUPP;
+
+   /*
+* The event may be running on another CPU. Disable it
+* to stop new events, then try to unregister a few times.
+*/
+   err = sdei_event_disable(event_num);
+   if (err)
+   return err;
+
+   for (i = 0; i < 3; i++) {
+   err = sdei_event_unregister(event_num);
+   if (err != -EINPROGRESS)
+   break;
+
+   schedule();
+   }
+
+   return err;
+}
+
 static int sdei_get_conduit(struct platform_device *pdev)
 {
const char *method;
diff --git a/include/linux/arm_sdei.h b/include/linux/arm_sdei.h
index 942afbd544b7..393899192906 100644
--- a/include/linux/arm_sdei.h
+++ b/include/linux/arm_sdei.h
@@ -11,6 +11,7 @@ enum sdei_conduit_types {
CONDUIT_HVC,
 };
 
+#include 
 #include 
 
 /* Arch code should override this to set the entry point from firmware... */
@@ -39,6 +40,11 @@ int sdei_event_unregister(u32 event_num);
 int sdei_event_enable(u32 event_num);
 int sdei_event_disable(u32 event_num);
 
+/* GHES register/unregister helpers */
+int sdei_register_ghes(struct ghes *ghes, sdei_event_callback *normal_cb,
+  sdei_event_callback *critical_cb);
+int sdei_unregister_ghes(struct ghes *ghes);
+
 #ifdef CONFIG_ARM_SDE_INTERFACE
 /* For use by arch code when CPU hotplug notifiers are not appropriate. */
 int sdei_mask_local_cpu(void);
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 26/26] ACPI / APEI: Add support for the SDEI GHES Notification type

2019-01-29 Thread James Morse
If the GHES notification type is SDEI, register the provided event
using the SDEI-GHES helper.

SDEI may be one of two types of event, normal and critical. Critical
events can interrupt normal events, so these must have separate
fixmap slots and locks in case both event types are in use.

Signed-off-by: James Morse 

--
Changes since v7:
 * Use __end_of_fixed_addresses as an arch-agnostic invalid fixmap entry
 * Move the locks definition into the function that uses them to make it
   clear these are NMI only.

Changes since v6:
 * Tinkering due to the absence of #ifdef
 * Added SDEI to the new ghes_is_synchronous() helper.

---
 drivers/acpi/apei/ghes.c | 85 
 include/linux/arm_sdei.h |  3 ++
 2 files changed, 88 insertions(+)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index dfa8f155f964..d0f0a219bf8f 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -25,6 +25,7 @@
  * GNU General Public License for more details.
  */
 
+#include 
 #include 
 #include 
 #include 
@@ -86,6 +87,15 @@
((struct acpi_hest_generic_status *)\
 ((struct ghes_estatus_node *)(estatus_node) + 1))
 
+/*
+ *  NMI-like notifications vary by architecture, before the compiler can prune
+ *  unused static functions it needs a value for these enums.
+ */
+#ifndef CONFIG_ARM_SDE_INTERFACE
+#define FIX_APEI_GHES_SDEI_NORMAL  __end_of_fixed_addresses
+#define FIX_APEI_GHES_SDEI_CRITICAL__end_of_fixed_addresses
+#endif
+
 static inline bool is_hest_type_generic_v2(struct ghes *ghes)
 {
return ghes->generic->header.type == ACPI_HEST_TYPE_GENERIC_ERROR_V2;
@@ -1088,6 +1098,63 @@ static void ghes_nmi_init_cxt(void)
init_irq_work(&ghes_proc_irq_work, ghes_proc_in_irq);
 }
 
+static int __ghes_sdei_callback(struct ghes *ghes,
+   enum fixed_addresses fixmap_idx)
+{
+   if (!ghes_in_nmi_queue_one_entry(ghes, fixmap_idx)) {
+   irq_work_queue(&ghes_proc_irq_work);
+
+   return 0;
+   }
+
+   return -ENOENT;
+}
+
+static int ghes_sdei_normal_callback(u32 event_num, struct pt_regs *regs,
+ void *arg)
+{
+   static DEFINE_RAW_SPINLOCK(ghes_notify_lock_sdei_normal);
+   struct ghes *ghes = arg;
+   int err;
+
+   raw_spin_lock(&ghes_notify_lock_sdei_normal);
+   err = __ghes_sdei_callback(ghes, FIX_APEI_GHES_SDEI_NORMAL);
+   raw_spin_unlock(&ghes_notify_lock_sdei_normal);
+
+   return err;
+}
+
+static int ghes_sdei_critical_callback(u32 event_num, struct pt_regs *regs,
+  void *arg)
+{
+   static DEFINE_RAW_SPINLOCK(ghes_notify_lock_sdei_critical);
+   struct ghes *ghes = arg;
+   int err;
+
+   raw_spin_lock(&ghes_notify_lock_sdei_critical);
+   err = __ghes_sdei_callback(ghes, FIX_APEI_GHES_SDEI_CRITICAL);
+   raw_spin_unlock(&ghes_notify_lock_sdei_critical);
+
+   return err;
+}
+
+static int apei_sdei_register_ghes(struct ghes *ghes)
+{
+   if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE))
+   return -EOPNOTSUPP;
+
+   return sdei_register_ghes(ghes, ghes_sdei_normal_callback,
+ghes_sdei_critical_callback);
+}
+
+static int apei_sdei_unregister_ghes(struct ghes *ghes)
+{
+   if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE))
+   return -EOPNOTSUPP;
+
+   return sdei_unregister_ghes(ghes);
+}
+
 static int ghes_probe(struct platform_device *ghes_dev)
 {
struct acpi_hest_generic *generic;
@@ -1123,6 +1190,13 @@ static int ghes_probe(struct platform_device *ghes_dev)
goto err;
}
break;
+   case ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED:
+   if (!IS_ENABLED(CONFIG_ARM_SDE_INTERFACE)) {
+   pr_warn(GHES_PFX "Generic hardware error source: %d 
notified via SDE Interface is not supported!\n",
+   generic->header.source_id);
+   goto err;
+   }
+   break;
case ACPI_HEST_NOTIFY_LOCAL:
pr_warning(GHES_PFX "Generic hardware error source: %d notified 
via local interrupt is not supported!\n",
   generic->header.source_id);
@@ -1186,6 +1260,11 @@ static int ghes_probe(struct platform_device *ghes_dev)
case ACPI_HEST_NOTIFY_NMI:
ghes_nmi_add(ghes);
break;
+   case ACPI_HEST_NOTIFY_SOFTWARE_DELEGATED:
+   rc = apei_sdei_register_ghes(ghes);
+   if (rc)
+   goto err;
+   break;
default:
BUG();
}
@@ -1211,6 +1290,7 @@ static int ghes_probe(struct platform_device *ghes_dev)
 
 static int ghes_remove(struct platform_device *ghes_dev)
 {
+   int rc;
struct ghes *ghes;
struct acpi_hest_generic *generic;
 
@@ -1243,6 +1323,11 @@ static int 

[PATCH v8 24/26] arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work

2019-01-29 Thread James Morse
APEI is unable to do all of its error handling work in nmi-context, so
it defers non-fatal work onto the irq_work queue. arch_irq_work_raise()
sends an IPI to the calling cpu, but this is not guaranteed to be taken
before returning to user-space.

Unless the exception interrupted a context with irqs-masked,
irq_work_run() can run immediately. Otherwise return -EINPROGRESS to
indicate ghes_notify_sea() found some work to do, but it hasn't
finished yet.

With this apei_claim_sea() returning '0' means this external-abort was
also notification of a firmware-first RAS error, and that APEI has
processed the CPER records.

Signed-off-by: James Morse 
Reviewed-by: Punit Agrawal 
Tested-by: Tyler Baicar 
Acked-by: Catalin Marinas 
CC: Xie XiuQi 
CC: gengdongjiu 

---
Changes since v7:
 * Added Catalin's ack, then:
 * Added __irq_enter()/exit() calls so if we interrupted preemptible code, the
   preempt count matches what other irq-work expects.
 * Changed the 'if (!arch_irqs_disabled_flags(interrupted_flags))' test to be
   safe before/after Julien's PMR series.

Changes since v6:
 * Added pr_warn() for the EINPROGRESS case so panic-tracebacks show
   'APEI was here'.
 * Tinkered with the commit message

Changes since v2:
 * Removed IS_ENABLED() check, done by the caller unless we have a dummy
   definition.
---
 arch/arm64/kernel/acpi.c | 23 +++
 arch/arm64/mm/fault.c|  9 -
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 803f0494dd3e..8288ae0c8f3b 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -268,12 +269,17 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
 int apei_claim_sea(struct pt_regs *regs)
 {
int err = -ENOENT;
+   bool return_to_irqs_enabled;
unsigned long current_flags;
 
if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
return err;
 
current_flags = arch_local_save_flags();
+   return_to_irqs_enabled = !irqs_disabled_flags(current_flags);
+
+   if (regs)
+   return_to_irqs_enabled = interrupts_enabled(regs);
 
/*
 * SEA can interrupt SError, mask it and describe this as an NMI so
@@ -283,6 +289,23 @@ int apei_claim_sea(struct pt_regs *regs)
nmi_enter();
err = ghes_notify_sea();
nmi_exit();
+
+   /*
+* APEI NMI-like notifications are deferred to irq_work. Unless
+* we interrupted irqs-masked code, we can do that now.
+*/
+   if (!err) {
+   if (return_to_irqs_enabled) {
+   local_daif_restore(DAIF_PROCCTX_NOIRQ);
+   __irq_enter();
+   irq_work_run();
+   __irq_exit();
+   } else {
+   pr_warn("APEI work queued but not completed");
+   err = -EINPROGRESS;
+   }
+   }
+
local_daif_restore(current_flags);
 
return err;
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index e1c84c2e1cab..1611714f8333 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -642,11 +642,10 @@ static int do_sea(unsigned long addr, unsigned int esr, 
struct pt_regs *regs)
 
inf = esr_to_fault_info(esr);
 
-   /*
-* Return value ignored as we rely on signal merging.
-* Future patches will make this more robust.
-*/
-   apei_claim_sea(regs);
+   if (apei_claim_sea(regs) == 0) {
+   /* APEI claimed this as a firmware-first notification */
+   return 0;
+   }
 
if (esr & ESR_ELx_FnV)
siaddr = NULL;
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 21/26] ACPI / APEI: Use separate fixmap pages for arm64 NMI-like notifications

2019-01-29 Thread James Morse
Now that ghes notification helpers provide the fixmap slots and
take the lock themselves, multiple NMI-like notifications can
be used on arm64.

These should be named after their notification method as they can't
all be called 'NMI'. x86's NOTIFY_NMI already is, change the SEA
fixmap entry to be called FIX_APEI_GHES_SEA.

Future patches can add support for FIX_APEI_GHES_SEI and
FIX_APEI_GHES_SDEI_{NORMAL,CRITICAL}.

Because all of ghes.c builds on both architectures, provide a
constant for each fixmap entry that the architecture will never
use.

Signed-off-by: James Morse 

---
Changes since v7:
 * Removed v6's #ifdefs, these aren't needed now that SEA/NMI can't be
   turned off on their respective architectures.

Changes since v6:
 * Added #ifdef definitions of each missing fixmap entry.

Changes since v3:
 * idx/lock are now in a separate struct.
 * Add to the comment above ghes_fixmap_lock_irq so that it makes more
   sense in isolation.
---
 arch/arm64/include/asm/fixmap.h | 2 +-
 drivers/acpi/apei/ghes.c| 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/include/asm/fixmap.h b/arch/arm64/include/asm/fixmap.h
index ec1e6d6fa14c..966dd4bb23f2 100644
--- a/arch/arm64/include/asm/fixmap.h
+++ b/arch/arm64/include/asm/fixmap.h
@@ -55,7 +55,7 @@ enum fixed_addresses {
 #ifdef CONFIG_ACPI_APEI_GHES
/* Used for GHES mapping from assorted contexts */
FIX_APEI_GHES_IRQ,
-   FIX_APEI_GHES_NMI,
+   FIX_APEI_GHES_SEA,
 #endif /* CONFIG_ACPI_APEI_GHES */
 
 #ifdef CONFIG_UNMAP_KERNEL_AT_EL0
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 957c1559ebf5..e6f0d176b245 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -958,7 +958,7 @@ int ghes_notify_sea(void)
int rv;
 
raw_spin_lock(&ghes_notify_lock_sea);
-   rv = ghes_in_nmi_spool_from_list(&ghes_sea, FIX_APEI_GHES_NMI);
+   rv = ghes_in_nmi_spool_from_list(&ghes_sea, FIX_APEI_GHES_SEA);
raw_spin_unlock(&ghes_notify_lock_sea);
 
return rv;
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 20/26] ACPI / APEI: Only use queued estatus entry during in_nmi_queue_one_entry()

2019-01-29 Thread James Morse
Each struct ghes has an worst-case sized buffer for storing the
estatus. If an error is being processed by ghes_proc() in process
context this buffer will be in use. If the error source then triggers
an NMI-like notification, the same buffer will be used by
in_nmi_queue_one_entry() to stage the estatus data, before
__process_error() copys it into a queued estatus entry.

Merge __process_error()s work into in_nmi_queue_one_entry() so that
the queued estatus entry is used from the beginning. Use the new
ghes_peek_estatus() to know how much memory to allocate from
the ghes_estatus_pool before reading the records.

Reported-by: Borislav Petkov 
Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 

Change since v6:
 * Added a comment explaining the 'ack-error, then goto no_work'.
 * Added missing esatus-clearing, which is necessary after reading the GAS,
---
 drivers/acpi/apei/ghes.c | 64 +++-
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 12375a82fa03..957c1559ebf5 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -862,57 +862,67 @@ static void ghes_print_queued_estatus(void)
}
 }
 
-/* Save estatus for further processing in IRQ context */
-static void __process_error(struct ghes *ghes,
-   struct acpi_hest_generic_status *src_estatus)
+static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,
+  enum fixed_addresses fixmap_idx)
 {
-   u32 len, node_len;
+   struct acpi_hest_generic_status *estatus, tmp_header;
struct ghes_estatus_node *estatus_node;
-   struct acpi_hest_generic_status *estatus;
+   u32 len, node_len;
+   u64 buf_paddr;
+   int sev, rc;
 
if (!IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG))
-   return;
+   return -EOPNOTSUPP;
 
-   if (ghes_estatus_cached(src_estatus))
-   return;
+   rc = __ghes_peek_estatus(ghes, &tmp_header, &buf_paddr, fixmap_idx);
+   if (rc) {
+   ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+   return rc;
+   }
 
-   len = cper_estatus_len(src_estatus);
-   node_len = GHES_ESTATUS_NODE_LEN(len);
+   rc = __ghes_check_estatus(ghes, &tmp_header);
+   if (rc) {
+   ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
+   return rc;
+   }
 
+   len = cper_estatus_len(_header);
+   node_len = GHES_ESTATUS_NODE_LEN(len);
estatus_node = (void *)gen_pool_alloc(ghes_estatus_pool, node_len);
if (!estatus_node)
-   return;
+   return -ENOMEM;
 
estatus_node->ghes = ghes;
estatus_node->generic = ghes->generic;
estatus = GHES_ESTATUS_FROM_NODE(estatus_node);
-   memcpy(estatus, src_estatus, len);
-   llist_add(_node->llnode, _estatus_llist);
-}
 
-static int ghes_in_nmi_queue_one_entry(struct ghes *ghes,
-  enum fixed_addresses fixmap_idx)
-{
-   struct acpi_hest_generic_status *estatus = ghes->estatus;
-   u64 buf_paddr;
-   int sev;
-
-   if (ghes_read_estatus(ghes, estatus, _paddr, fixmap_idx)) {
+   if (__ghes_read_estatus(estatus, buf_paddr, fixmap_idx, len)) {
ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
-   return -ENOENT;
+   rc = -ENOENT;
+   goto no_work;
}
 
sev = ghes_severity(estatus->error_severity);
if (sev >= GHES_SEV_PANIC) {
ghes_print_queued_estatus();
__ghes_panic(ghes, estatus, buf_paddr, fixmap_idx);
-
}
 
-   __process_error(ghes, estatus);
-   ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
+   ghes_clear_estatus(ghes, &tmp_header, buf_paddr, fixmap_idx);
 
-   return 0;
+   /* This error has been reported before, don't process it again. */
+   if (ghes_estatus_cached(estatus))
+   goto no_work;
+
+   llist_add(&estatus_node->llnode, &ghes_estatus_llist);
+
+   return rc;
+
+no_work:
+   gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node,
+ node_len);
+
+   return rc;
 }
 
 static int ghes_in_nmi_spool_from_list(struct list_head *rcu_list,
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 22/26] mm/memory-failure: Add memory_failure_queue_kick()

2019-01-29 Thread James Morse
The GHES code calls memory_failure_queue() from IRQ context to schedule
work on the current CPU so that memory_failure() can sleep.

For synchronous memory errors the arch code needs to know any signals
that memory_failure() will trigger are pending before it returns to
user-space, possibly when exiting from the IRQ.

Add a helper to kick the memory failure queue, to ensure the scheduled
work has happened. This has to be called from process context, so may
have been migrated from the original cpu. Pass the cpu the work was
queued on.

Change memory_failure_work_func() to permit being called on the 'wrong'
cpu.

Signed-off-by: James Morse 
---
 include/linux/mm.h  |  1 +
 mm/memory-failure.c | 15 ++-
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 80bb6408fe73..b33bededc69d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2743,6 +2743,7 @@ enum mf_flags {
 };
 extern int memory_failure(unsigned long pfn, int flags);
 extern void memory_failure_queue(unsigned long pfn, int flags);
+extern void memory_failure_queue_kick(int cpu);
 extern int unpoison_memory(unsigned long pfn);
 extern int get_hwpoison_page(struct page *page);
 #define put_hwpoison_page(page)put_page(page)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6379fff1a5ff..9b4705a53fed 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1494,7 +1494,7 @@ static void memory_failure_work_func(struct work_struct 
*work)
unsigned long proc_flags;
int gotten;
 
-   mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+   mf_cpu = container_of(work, struct memory_failure_cpu, work);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1508,6 +1508,19 @@ static void memory_failure_work_func(struct work_struct 
*work)
}
 }
 
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+   struct memory_failure_cpu *mf_cpu;
+
+   mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+   cancel_work_sync(&mf_cpu->work);
+   memory_failure_work_func(&mf_cpu->work);
+}
+
 static int __init memory_failure_init(void)
 {
struct memory_failure_cpu *mf_cpu;
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 09/26] ACPI / APEI: Generalise the estatus queue's notify code

2019-01-29 Thread James Morse
Refactor the estatus queue's pool notification routine from
NOTIFY_NMI's handlers. This will allow another notification
method to use the estatus queue without duplicating this code.

Add rcu_read_lock()/rcu_read_unlock() around the list
list_for_each_entry_rcu() walker. These aren't strictly necessary as
the whole nmi_enter/nmi_exit() window is a spooky RCU read-side
critical section.

in_nmi_queue_one_entry() is separate from the rcu-list walker for a
later caller that doesn't need to walk a list.

Signed-off-by: James Morse 
Reviewed-by: Punit Agrawal 
Tested-by: Tyler Baicar 

---
Changes since v7:
 * Moved err= onto a separate line to make this more readable
 * Dropped ghes_ prefix on new static functions
 * Renamed stuff, 'notify' has an overloaded meaning,

Changes since v6:
 * Removed pool grow/remove code as this is no longer necessary.

Changes since v3:
 * Removed duplicate or redundant paragraphs in commit message.
 * Fixed the style of a zero check.
Changes since v1:
   * Tidied up _in_nmi_notify_one().
---
 drivers/acpi/apei/ghes.c | 65 ++--
 1 file changed, 43 insertions(+), 22 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index af3c10f47f20..cb3d88de711f 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -912,37 +912,58 @@ static void __process_error(struct ghes *ghes)
 #endif
 }
 
-static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
+static int ghes_in_nmi_queue_one_entry(struct ghes *ghes)
 {
u64 buf_paddr;
-   struct ghes *ghes;
-   int sev, ret = NMI_DONE;
+   int sev;
 
-   if (!atomic_add_unless(_in_nmi, 1, 1))
-   return ret;
+   if (ghes_read_estatus(ghes, _paddr)) {
+   ghes_clear_estatus(ghes, buf_paddr);
+   return -ENOENT;
+   }
 
-   list_for_each_entry_rcu(ghes, _nmi, list) {
-   if (ghes_read_estatus(ghes, _paddr)) {
-   ghes_clear_estatus(ghes, buf_paddr);
-   continue;
-   } else {
-   ret = NMI_HANDLED;
-   }
+   sev = ghes_severity(ghes->estatus->error_severity);
+   if (sev >= GHES_SEV_PANIC) {
+   ghes_print_queued_estatus();
+   __ghes_panic(ghes, buf_paddr);
+   }
 
-   sev = ghes_severity(ghes->estatus->error_severity);
-   if (sev >= GHES_SEV_PANIC) {
-   ghes_print_queued_estatus();
-   __ghes_panic(ghes, buf_paddr);
-   }
+   __process_error(ghes);
+   ghes_clear_estatus(ghes, buf_paddr);
 
-   __process_error(ghes);
-   ghes_clear_estatus(ghes, buf_paddr);
+   return 0;
+}
+
+static int ghes_in_nmi_spool_from_list(struct list_head *rcu_list)
+{
+   int err, ret = -ENOENT;
+   struct ghes *ghes;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(ghes, rcu_list, list) {
+   err = ghes_in_nmi_queue_one_entry(ghes);
+   if (!err)
+   ret = 0;
}
+   rcu_read_unlock();
 
-#ifdef CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG
-   if (ret == NMI_HANDLED)
+   if (IS_ENABLED(CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG) && !ret)
irq_work_queue(&ghes_proc_irq_work);
-#endif
+
+   return ret;
+}
+
+static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
+{
+   int err, ret = NMI_DONE;
+
+   if (!atomic_add_unless(&ghes_in_nmi, 1, 1))
+   return ret;
+
+   err = ghes_in_nmi_spool_from_list(&ghes_nmi);
+   if (!err)
+   ret = NMI_HANDLED;
+
atomic_dec(&ghes_in_nmi);
return ret;
 }
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 15/26] ACPI / APEI: Move locking to the notification helper

2019-01-29 Thread James Morse
ghes_copy_tofrom_phys() takes different locks depending on in_nmi().
This doesn't work if there are multiple NMI-like notifications, that
can interrupt each other.

Now that NOTIFY_SEA is always called in the same context, move the
lock-taking to the notification helper. The helper will always know
which lock to take. This avoids ghes_copy_tofrom_phys() taking a guess
based on in_nmi().

This splits NOTIFY_NMI and NOTIFY_SEA to use different locks. All
the other notifications use ghes_proc(), and are called in process
or IRQ context. Move the spin_lock_irqsave() around their ghes_proc()
calls.

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 
---

Changes since v7:
 * Moved the locks into the function that uses them to make it clearer this
   is only use in_nmi().

Changes since v6:
 * Tinkered with the commit message
 * Lock definitions have moved due to the #ifdefs
---
 drivers/acpi/apei/ghes.c | 34 +-
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ab794ab29554..c6bc73281d6a 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -114,11 +114,10 @@ static DEFINE_MUTEX(ghes_list_mutex);
  * handler, but general ioremap can not be used in atomic context, so
  * the fixmap is used instead.
  *
- * These 2 spinlocks are used to prevent the fixmap entries from being used
+ * This spinlock is used to prevent the fixmap entry from being used
  * simultaneously.
  */
-static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi);
-static DEFINE_SPINLOCK(ghes_ioremap_lock_irq);
+static DEFINE_SPINLOCK(ghes_notify_lock_irq);
 
 static struct gen_pool *ghes_estatus_pool;
 static unsigned long ghes_estatus_pool_size_request;
@@ -287,7 +286,6 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, 
u32 len,
  int from_phys)
 {
void __iomem *vaddr;
-   unsigned long flags = 0;
int in_nmi = in_nmi();
u64 offset;
u32 trunk;
@@ -295,10 +293,8 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, 
u32 len,
while (len > 0) {
offset = paddr - (paddr & PAGE_MASK);
if (in_nmi) {
-   raw_spin_lock(_ioremap_lock_nmi);
vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
} else {
-   spin_lock_irqsave(_ioremap_lock_irq, flags);
vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
}
trunk = PAGE_SIZE - offset;
@@ -312,10 +308,8 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, 
u32 len,
buffer += trunk;
if (in_nmi) {
ghes_iounmap_nmi();
-   raw_spin_unlock(_ioremap_lock_nmi);
} else {
ghes_iounmap_irq();
-   spin_unlock_irqrestore(_ioremap_lock_irq, flags);
}
}
 }
@@ -729,8 +723,11 @@ static void ghes_add_timer(struct ghes *ghes)
 static void ghes_poll_func(struct timer_list *t)
 {
struct ghes *ghes = from_timer(ghes, t, timer);
+   unsigned long flags;
 
+   spin_lock_irqsave(&ghes_notify_lock_irq, flags);
ghes_proc(ghes);
+   spin_unlock_irqrestore(&ghes_notify_lock_irq, flags);
if (!(ghes->flags & GHES_EXITING))
ghes_add_timer(ghes);
 }
@@ -738,9 +735,12 @@ static void ghes_poll_func(struct timer_list *t)
 static irqreturn_t ghes_irq_func(int irq, void *data)
 {
struct ghes *ghes = data;
+   unsigned long flags;
int rc;
 
+   spin_lock_irqsave(&ghes_notify_lock_irq, flags);
rc = ghes_proc(ghes);
+   spin_unlock_irqrestore(&ghes_notify_lock_irq, flags);
if (rc)
return IRQ_NONE;
 
@@ -751,14 +751,17 @@ static int ghes_notify_hed(struct notifier_block *this, 
unsigned long event,
   void *data)
 {
struct ghes *ghes;
+   unsigned long flags;
int ret = NOTIFY_DONE;
 
+   spin_lock_irqsave(&ghes_notify_lock_irq, flags);
rcu_read_lock();
list_for_each_entry_rcu(ghes, &ghes_hed, list) {
if (!ghes_proc(ghes))
ret = NOTIFY_OK;
}
rcu_read_unlock();
+   spin_unlock_irqrestore(&ghes_notify_lock_irq, flags);
 
return ret;
 }
@@ -913,7 +916,14 @@ static LIST_HEAD(ghes_sea);
  */
 int ghes_notify_sea(void)
 {
-   return ghes_in_nmi_spool_from_list(_sea);
+   static DEFINE_RAW_SPINLOCK(ghes_notify_lock_sea);
+   int rv;
+
+   raw_spin_lock(&ghes_notify_lock_sea);
+   rv = ghes_in_nmi_spool_from_list(&ghes_sea);
+   raw_spin_unlock(&ghes_notify_lock_sea);
+
+   return rv;
 }
 
 static void ghes_sea_add(struct ghes *ghes)
@@ -946,14 +956,17 @@ static LIST_HEAD(ghes_nmi);
 
 static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
+   static 

[PATCH v8 12/26] ACPI / APEI: Switch NOTIFY_SEA to use the estatus queue

2019-01-29 Thread James Morse
Now that the estatus queue can be used by more than one notification
method, we can move notifications that have NMI-like behaviour over.

Switch NOTIFY_SEA over to use the estatus queue. This makes it behave
in the same way as x86's NOTIFY_NMI.

Remove Kconfig's ability to turn ACPI_APEI_SEA off if ACPI_APEI_GHES
is selected. This roughly matches the x86 NOTIFY_NMI behaviour, and means
each architecture has at least one user of the estatus-queue, meaning it
doesn't need guarding with ifdef.

Signed-off-by: James Morse 
---
Changes since v6:
 * Lost all the pool grow/shrink stuff,
 * Changed Kconfig so this can't be turned off to avoid kconfig complexity:
 * Dropped Tyler's tested-by.
 * For now we need #ifdef around the SEA code as the arch code assumes its 
there.
 * Removed Punit's reviewed-by due to the swirling #ifdeffery
---
 drivers/acpi/apei/Kconfig | 12 +---
 drivers/acpi/apei/ghes.c  | 22 +-
 2 files changed, 6 insertions(+), 28 deletions(-)

diff --git a/drivers/acpi/apei/Kconfig b/drivers/acpi/apei/Kconfig
index 52ae5438edeb..6b18f8bc7be3 100644
--- a/drivers/acpi/apei/Kconfig
+++ b/drivers/acpi/apei/Kconfig
@@ -41,19 +41,9 @@ config ACPI_APEI_PCIEAER
  Turn on this option to enable the corresponding support.
 
 config ACPI_APEI_SEA
-   bool "APEI Synchronous External Abort logging/recovering support"
+   bool
depends on ARM64 && ACPI_APEI_GHES
default y
-   help
- This option should be enabled if the system supports
- firmware first handling of SEA (Synchronous External Abort).
- SEA happens with certain faults of data abort or instruction
- abort synchronous exceptions on ARMv8 systems. If a system
- supports firmware first handling of SEA, the platform analyzes
- and handles hardware error notifications from SEA, and it may then
- form a HW error record for the OS to parse and handle. This
- option allows the OS to look for such hardware error record, and
- take appropriate action.
 
 config ACPI_APEI_MEMORY_FAILURE
bool "APEI memory error recovering support"
diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 576dce29159d..ab794ab29554 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -767,7 +767,6 @@ static struct notifier_block ghes_notifier_hed = {
.notifier_call = ghes_notify_hed,
 };
 
-#ifdef CONFIG_HAVE_ACPI_APEI_NMI
 /*
  * Handlers for CPER records may not be NMI safe. For example,
  * memory_failure_queue() takes spinlocks and calls schedule_work_on().
@@ -904,7 +903,6 @@ static int ghes_in_nmi_spool_from_list(struct list_head 
*rcu_list)
 
return ret;
 }
-#endif /* CONFIG_HAVE_ACPI_APEI_NMI */
 
 #ifdef CONFIG_ACPI_APEI_SEA
 static LIST_HEAD(ghes_sea);
@@ -915,16 +913,7 @@ static LIST_HEAD(ghes_sea);
  */
 int ghes_notify_sea(void)
 {
-   struct ghes *ghes;
-   int ret = -ENOENT;
-
-   rcu_read_lock();
-   list_for_each_entry_rcu(ghes, _sea, list) {
-   if (!ghes_proc(ghes))
-   ret = 0;
-   }
-   rcu_read_unlock();
-   return ret;
+   return ghes_in_nmi_spool_from_list(_sea);
 }
 
 static void ghes_sea_add(struct ghes *ghes)
@@ -992,16 +981,15 @@ static void ghes_nmi_remove(struct ghes *ghes)
 */
synchronize_rcu();
 }
+#else /* CONFIG_HAVE_ACPI_APEI_NMI */
+static inline void ghes_nmi_add(struct ghes *ghes) { }
+static inline void ghes_nmi_remove(struct ghes *ghes) { }
+#endif /* CONFIG_HAVE_ACPI_APEI_NMI */
 
 static void ghes_nmi_init_cxt(void)
 {
init_irq_work(_proc_irq_work, ghes_proc_in_irq);
 }
-#else /* CONFIG_HAVE_ACPI_APEI_NMI */
-static inline void ghes_nmi_add(struct ghes *ghes) { }
-static inline void ghes_nmi_remove(struct ghes *ghes) { }
-static inline void ghes_nmi_init_cxt(void) { }
-#endif /* CONFIG_HAVE_ACPI_APEI_NMI */
 
 static int ghes_probe(struct platform_device *ghes_dev)
 {
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 16/26] ACPI / APEI: Let the notification helper specify the fixmap slot

2019-01-29 Thread James Morse
ghes_copy_tofrom_phys() uses a different fixmap slot depending on in_nmi().
This doesn't work when there are multiple NMI-like notifications, that
could interrupt each other.

As with the locking, move the chosen fixmap_idx to the notification helper.
This only matters for NMI-like notifications, anything calling
ghes_proc() can use the IRQ fixmap slot as its already holding an irqsave
spinlock.

This lets us collapse the ghes_ioremap_pfn_*() helpers.

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 
---

The fixmap-idx and vaddr are passed back to ghes_unmap()
to allow ioremap() to be used in process context in the
future. This will let us send tlbi-ipi for notifications
that don't mask irqs.

Changes since v7:
 * Wwitched unmap arg order for concistency, p/v addr is always first
 * use the enum name for the fixmap_idx, in the hope the compiler validates it.
---
 drivers/acpi/apei/ghes.c | 92 +---
 1 file changed, 39 insertions(+), 53 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index c6bc73281d6a..ccad57468ab7 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -41,6 +41,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -127,38 +128,24 @@ static atomic_t ghes_estatus_cache_alloced;
 
 static int ghes_panic_timeout __read_mostly = 30;
 
-static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn)
+static void __iomem *ghes_map(u64 pfn, enum fixed_addresses fixmap_idx)
 {
phys_addr_t paddr;
pgprot_t prot;
 
-   paddr = pfn << PAGE_SHIFT;
+   paddr = PFN_PHYS(pfn);
prot = arch_apei_get_mem_attribute(paddr);
-   __set_fixmap(FIX_APEI_GHES_NMI, paddr, prot);
+   __set_fixmap(fixmap_idx, paddr, prot);
 
-   return (void __iomem *) fix_to_virt(FIX_APEI_GHES_NMI);
+   return (void __iomem *) __fix_to_virt(fixmap_idx);
 }
 
-static void __iomem *ghes_ioremap_pfn_irq(u64 pfn)
+static void ghes_unmap(void __iomem *vaddr, enum fixed_addresses fixmap_idx)
 {
-   phys_addr_t paddr;
-   pgprot_t prot;
-
-   paddr = pfn << PAGE_SHIFT;
-   prot = arch_apei_get_mem_attribute(paddr);
-   __set_fixmap(FIX_APEI_GHES_IRQ, paddr, prot);
+   int _idx = virt_to_fix((unsigned long)vaddr);
 
-   return (void __iomem *) fix_to_virt(FIX_APEI_GHES_IRQ);
-}
-
-static void ghes_iounmap_nmi(void)
-{
-   clear_fixmap(FIX_APEI_GHES_NMI);
-}
-
-static void ghes_iounmap_irq(void)
-{
-   clear_fixmap(FIX_APEI_GHES_IRQ);
+   WARN_ON_ONCE(fixmap_idx != _idx);
+   clear_fixmap(fixmap_idx);
 }
 
 int ghes_estatus_pool_init(int num_ghes)
@@ -283,20 +270,16 @@ static inline int ghes_severity(int severity)
 }
 
 static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len,
- int from_phys)
+ int from_phys,
+ enum fixed_addresses fixmap_idx)
 {
void __iomem *vaddr;
-   int in_nmi = in_nmi();
u64 offset;
u32 trunk;
 
while (len > 0) {
offset = paddr - (paddr & PAGE_MASK);
-   if (in_nmi) {
-   vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT);
-   } else {
-   vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT);
-   }
+   vaddr = ghes_map(PHYS_PFN(paddr), fixmap_idx);
trunk = PAGE_SIZE - offset;
trunk = min(trunk, len);
if (from_phys)
@@ -306,15 +289,13 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 
paddr, u32 len,
len -= trunk;
paddr += trunk;
buffer += trunk;
-   if (in_nmi) {
-   ghes_iounmap_nmi();
-   } else {
-   ghes_iounmap_irq();
-   }
+   ghes_unmap(vaddr, fixmap_idx);
}
 }
 
-static int ghes_read_estatus(struct ghes *ghes, u64 *buf_paddr)
+static int ghes_read_estatus(struct ghes *ghes, u64 *buf_paddr,
+enum fixed_addresses fixmap_idx)
+
 {
struct acpi_hest_generic *g = ghes->generic;
u32 len;
@@ -332,7 +313,7 @@ static int ghes_read_estatus(struct ghes *ghes, u64 
*buf_paddr)
return -ENOENT;
 
ghes_copy_tofrom_phys(ghes->estatus, *buf_paddr,
- sizeof(*ghes->estatus), 1);
+ sizeof(*ghes->estatus), 1, fixmap_idx);
if (!ghes->estatus->block_status) {
*buf_paddr = 0;
return -ENOENT;
@@ -348,7 +329,7 @@ static int ghes_read_estatus(struct ghes *ghes, u64 
*buf_paddr)
goto err_read_block;
ghes_copy_tofrom_phys(ghes->estatus + 1,
  *buf_paddr + sizeof(*ghes->estatus),
- len - sizeof(*ghes->estatus), 1);
+ len - 

[PATCH v8 17/26] ACPI / APEI: Pass ghes and estatus separately to avoid a later copy

2019-01-29 Thread James Morse
The NMI-like notifications scribble over ghes->estatus, before
copying it somewhere else. If this interrupts the ghes_probe() code
calling ghes_proc() on each struct ghes, the data is corrupted.

All the NMI-like notifications should use a queued estatus entry
from the beginning, instead of the ghes version, then copying it.
To do this, break up any use of "ghes->estatus" so that all
functions take the estatus as an argument.

This patch just moves these ghes->estatus dereferences into separate
arguments, no change in behaviour. struct ghes becomes unused in
ghes_clear_estatus() as it only wanted ghes->estatus, which we now
pass directly. This is removed.

Signed-off-by: James Morse 

---
Changes since v6:
 * Changed subject
 * Renamed ghes_estatus to src_estatus, which is a little clearer
 * Removed struct ghes from ghes_clear_estatus() now that this becomes
   unused in this patch.
 * Mangled the commit message to be different
---
 drivers/acpi/apei/ghes.c | 92 +---
 1 file changed, 49 insertions(+), 43 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ccad57468ab7..f95db2398dd5 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -293,9 +293,9 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, 
u32 len,
}
 }
 
-static int ghes_read_estatus(struct ghes *ghes, u64 *buf_paddr,
-enum fixed_addresses fixmap_idx)
-
+static int ghes_read_estatus(struct ghes *ghes,
+struct acpi_hest_generic_status *estatus,
+u64 *buf_paddr, enum fixed_addresses fixmap_idx)
 {
struct acpi_hest_generic *g = ghes->generic;
u32 len;
@@ -312,25 +312,25 @@ static int ghes_read_estatus(struct ghes *ghes, u64 
*buf_paddr,
if (!*buf_paddr)
return -ENOENT;
 
-   ghes_copy_tofrom_phys(ghes->estatus, *buf_paddr,
- sizeof(*ghes->estatus), 1, fixmap_idx);
-   if (!ghes->estatus->block_status) {
+   ghes_copy_tofrom_phys(estatus, *buf_paddr, sizeof(*estatus), 1,
+ fixmap_idx);
+   if (!estatus->block_status) {
*buf_paddr = 0;
return -ENOENT;
}
 
rc = -EIO;
-   len = cper_estatus_len(ghes->estatus);
-   if (len < sizeof(*ghes->estatus))
+   len = cper_estatus_len(estatus);
+   if (len < sizeof(*estatus))
goto err_read_block;
if (len > ghes->generic->error_block_length)
goto err_read_block;
-   if (cper_estatus_check_header(ghes->estatus))
+   if (cper_estatus_check_header(estatus))
goto err_read_block;
-   ghes_copy_tofrom_phys(ghes->estatus + 1,
- *buf_paddr + sizeof(*ghes->estatus),
- len - sizeof(*ghes->estatus), 1, fixmap_idx);
-   if (cper_estatus_check(ghes->estatus))
+   ghes_copy_tofrom_phys(estatus + 1,
+ *buf_paddr + sizeof(*estatus),
+ len - sizeof(*estatus), 1, fixmap_idx);
+   if (cper_estatus_check(estatus))
goto err_read_block;
rc = 0;
 
@@ -342,16 +342,17 @@ static int ghes_read_estatus(struct ghes *ghes, u64 
*buf_paddr,
return rc;
 }
 
-static void ghes_clear_estatus(struct ghes *ghes, u64 buf_paddr,
-  enum fixed_addresses fixmap_idx)
+static void ghes_clear_estatus(struct ghes *ghes,
+  struct acpi_hest_generic_status *estatus,
+  u64 buf_paddr, enum fixed_addresses fixmap_idx)
 {
-   ghes->estatus->block_status = 0;
+   estatus->block_status = 0;
 
if (!buf_paddr)
return;
 
-   ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
- sizeof(ghes->estatus->block_status), 0,
+   ghes_copy_tofrom_phys(estatus, buf_paddr,
+ sizeof(estatus->block_status), 0,
  fixmap_idx);
 
/*
@@ -651,12 +652,13 @@ static void ghes_estatus_cache_add(
rcu_read_unlock();
 }
 
-static void __ghes_panic(struct ghes *ghes, u64 buf_paddr,
-enum fixed_addresses fixmap_idx)
+static void __ghes_panic(struct ghes *ghes,
+struct acpi_hest_generic_status *estatus,
+u64 buf_paddr, enum fixed_addresses fixmap_idx)
 {
-   __ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
+   __ghes_print_estatus(KERN_EMERG, ghes->generic, estatus);
 
-   ghes_clear_estatus(ghes, buf_paddr, fixmap_idx);
+   ghes_clear_estatus(ghes, estatus, buf_paddr, fixmap_idx);
 
/* reboot to log the error! */
if (!panic_timeout)
@@ -666,25 +668,25 @@ static void __ghes_panic(struct ghes *ghes, u64 buf_paddr,
 
 static int ghes_proc(struct ghes *ghes)
 {
+   struct 

[PATCH v8 13/26] KVM: arm/arm64: Add kvm_ras.h to collect kvm specific RAS plumbing

2019-01-29 Thread James Morse
To split up APEIs in_nmi() path, the caller needs to always be
in_nmi(). KVM shouldn't have to know about this, pull the RAS plumbing
out into a header file.

Currently guest synchronous external aborts are claimed as RAS
notifications by handle_guest_sea(), which is hidden in the arch codes
mm/fault.c. 32bit gets a dummy declaration in system_misc.h.

There is going to be more of this in the future if/when the kernel
supports the SError-based firmware-first notification mechanism and/or
kernel-first notifications for both synchronous external abort and
SError. Each of these will come with some Kconfig symbols and a
handful of header files.

Create a header file for all this.

This patch gives handle_guest_sea() a 'kvm_' prefix, and moves the
declarations to kvm_ras.h as preparation for a future patch that moves
the ACPI-specific RAS code out of mm/fault.c.

Signed-off-by: James Morse 
Reviewed-by: Punit Agrawal 
Acked-by: Marc Zyngier 
Tested-by: Tyler Baicar 
Acked-by: Catalin Marinas 

---
Changes since v6:
 * Tinkered with the commit message
---
 arch/arm/include/asm/kvm_ras.h   | 14 ++
 arch/arm/include/asm/system_misc.h   |  5 -
 arch/arm64/include/asm/kvm_ras.h | 11 +++
 arch/arm64/include/asm/system_misc.h |  2 --
 arch/arm64/mm/fault.c|  2 +-
 virt/kvm/arm/mmu.c   |  4 ++--
 6 files changed, 28 insertions(+), 10 deletions(-)
 create mode 100644 arch/arm/include/asm/kvm_ras.h
 create mode 100644 arch/arm64/include/asm/kvm_ras.h

diff --git a/arch/arm/include/asm/kvm_ras.h b/arch/arm/include/asm/kvm_ras.h
new file mode 100644
index ..e9577292dfe4
--- /dev/null
+++ b/arch/arm/include/asm/kvm_ras.h
@@ -0,0 +1,14 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018 - Arm Ltd */
+
+#ifndef __ARM_KVM_RAS_H__
+#define __ARM_KVM_RAS_H__
+
+#include 
+
+static inline int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr)
+{
+   return -1;
+}
+
+#endif /* __ARM_KVM_RAS_H__ */
diff --git a/arch/arm/include/asm/system_misc.h 
b/arch/arm/include/asm/system_misc.h
index 8e76db83c498..66f6a3ae68d2 100644
--- a/arch/arm/include/asm/system_misc.h
+++ b/arch/arm/include/asm/system_misc.h
@@ -38,11 +38,6 @@ static inline void harden_branch_predictor(void)
 
 extern unsigned int user_debug;
 
-static inline int handle_guest_sea(phys_addr_t addr, unsigned int esr)
-{
-   return -1;
-}
-
 #endif /* !__ASSEMBLY__ */
 
 #endif /* __ASM_ARM_SYSTEM_MISC_H */
diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h
new file mode 100644
index ..6096f0251812
--- /dev/null
+++ b/arch/arm64/include/asm/kvm_ras.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright (C) 2018 - Arm Ltd */
+
+#ifndef __ARM64_KVM_RAS_H__
+#define __ARM64_KVM_RAS_H__
+
+#include 
+
+int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr);
+
+#endif /* __ARM64_KVM_RAS_H__ */
diff --git a/arch/arm64/include/asm/system_misc.h 
b/arch/arm64/include/asm/system_misc.h
index 0e2a0ecaf484..32693f34f431 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -46,8 +46,6 @@ extern void __show_regs(struct pt_regs *);
 
 extern void (*arm_pm_restart)(enum reboot_mode reboot_mode, const char *cmd);
 
-int handle_guest_sea(phys_addr_t addr, unsigned int esr);
-
 #endif /* __ASSEMBLY__ */
 
 #endif /* __ASM_SYSTEM_MISC_H */
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index efb7b2cbead5..c76dc981e3fc 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -733,7 +733,7 @@ static const struct fault_info fault_info[] = {
{ do_bad,   SIGKILL, SI_KERNEL, "unknown 63"
},
 };
 
-int handle_guest_sea(phys_addr_t addr, unsigned int esr)
+int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr)
 {
return ghes_notify_sea();
 }
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index fbdf3ac2f001..600e0cc74ea4 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -27,10 +27,10 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
-#include 
 
 #include "trace.h"
 
@@ -1903,7 +1903,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
 * For RAS the host kernel may handle this abort.
 * There is no need to pass the error into the guest.
 */
-   if (!handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
+   if (!kvm_handle_guest_sea(fault_ipa, kvm_vcpu_get_hsr(vcpu)))
return 1;
 
if (unlikely(!is_iabt)) {
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 10/26] ACPI / APEI: Don't allow ghes_ack_error() to mask earlier errors

2019-01-29 Thread James Morse
During ghes_proc() we use ghes_ack_error() to tell an external agent
we are done with these records and it can re-use the memory.

rc may hold an error returned by ghes_read_estatus(), ENOENT causes
us to skip ghes_ack_error() (as there is nothing to ack), but rc may
also be EIO, which gets suppressed.

ghes_clear_estatus() is where we mark the records as processed for
non GHESv2 error sources, and already spots the ENOENT case as
buf_paddr is set to 0 by ghes_read_estatus().

Move the ghes_ack_error() call in here to avoid extra logic with
the return code in ghes_proc().

This enables GHESv2 acking for NMI-like error sources. This is safe
as the buffer is pre-mapped by map_gen_v2() before the GHES is added
to any NMI handler lists.

This same pre-mapping step means we can't receive an error from
apei_read()/write() here as apei_check_gar() succeeded when it
was mapped, and the mapping was cached, so the address can't be
rejected at runtime. Remove the error-returns as this is now
called from a function with no return.

Signed-off-by: James Morse 
---
 drivers/acpi/apei/ghes.c | 47 +++-
 1 file changed, 22 insertions(+), 25 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index cb3d88de711f..bd58749d31bb 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -197,6 +197,21 @@ static void unmap_gen_v2(struct ghes *ghes)
	apei_unmap_generic_address(&ghes->generic_v2->read_ack_register);
 }
 
+static void ghes_ack_error(struct acpi_hest_generic_v2 *gv2)
+{
+   int rc;
+   u64 val = 0;
+
+   rc = apei_read(&val, &gv2->read_ack_register);
+   if (rc)
+   return;
+
+   val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset;
+   val |= gv2->read_ack_write << gv2->read_ack_register.bit_offset;
+
+   apei_write(val, &gv2->read_ack_register);
+}
+
 static struct ghes *ghes_new(struct acpi_hest_generic *generic)
 {
struct ghes *ghes;
@@ -361,6 +376,13 @@ static void ghes_clear_estatus(struct ghes *ghes, u64 
buf_paddr)
 
ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
  sizeof(ghes->estatus->block_status), 0);
+
+   /*
+* GHESv2 type HEST entries introduce support for error acknowledgment,
+* so only acknowledge the error if this support is present.
+*/
+   if (is_hest_type_generic_v2(ghes))
+   ghes_ack_error(ghes->generic_v2);
 }
 
 static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, 
int sev)
@@ -652,21 +674,6 @@ static void ghes_estatus_cache_add(
rcu_read_unlock();
 }
 
-static int ghes_ack_error(struct acpi_hest_generic_v2 *gv2)
-{
-   int rc;
-   u64 val = 0;
-
-   rc = apei_read(&val, &gv2->read_ack_register);
-   if (rc)
-   return rc;
-
-   val &= gv2->read_ack_preserve << gv2->read_ack_register.bit_offset;
-   val |= gv2->read_ack_write << gv2->read_ack_register.bit_offset;
-
-   return apei_write(val, &gv2->read_ack_register);
-}
-
 static void __ghes_panic(struct ghes *ghes, u64 buf_paddr)
 {
__ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
@@ -701,16 +708,6 @@ static int ghes_proc(struct ghes *ghes)
 out:
ghes_clear_estatus(ghes, buf_paddr);
 
-   if (rc == -ENOENT)
-   return rc;
-
-   /*
-* GHESv2 type HEST entries introduce support for error acknowledgment,
-* so only acknowledge the error if this support is present.
-*/
-   if (is_hest_type_generic_v2(ghes))
-   return ghes_ack_error(ghes->generic_v2);
-
return rc;
 }
 
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 04/26] ACPI / APEI: Make hest.c manage the estatus memory pool

2019-01-29 Thread James Morse
ghes.c has a memory pool it uses for the estatus cache and the estatus
queue. The cache is initialised when registering the platform driver.
For the queue, an NMI-like notification has to grow/shrink the pool
as it is registered and unregistered.

This is all pretty noisy when adding new NMI-like notifications, it
would be better to replace this with a static pool size based on the
number of users.

As a precursor, move the call that creates the pool from ghes_init(),
into hest.c. Later this will take the number of ghes entries and
consolidate the queue allocations.
Remove ghes_estatus_pool_exit() as hest.c doesn't have anywhere to put
this.

The pool is now initialised as part of ACPI's subsys_initcall():
(acpi_init(), acpi_scan_init(), acpi_pci_root_init(), acpi_hest_init())
Before this patch it happened later as a GHES specific device_initcall().

Signed-off-by: James Morse 
---
Changes since v7:
* Moved the pool init later, the driver isn't probed until device_init.
---
 drivers/acpi/apei/ghes.c | 33 ++---
 drivers/acpi/apei/hest.c | 10 +-
 include/acpi/ghes.h  |  2 ++
 3 files changed, 17 insertions(+), 28 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index ee9206d5e119..4150c72c78cb 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -162,26 +162,16 @@ static void ghes_iounmap_irq(void)
clear_fixmap(FIX_APEI_GHES_IRQ);
 }
 
-static int ghes_estatus_pool_init(void)
+static int ghes_estatus_pool_expand(unsigned long len); //temporary
+
+int ghes_estatus_pool_init(void)
 {
ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, 
-1);
if (!ghes_estatus_pool)
return -ENOMEM;
-   return 0;
-}
 
-static void ghes_estatus_pool_free_chunk(struct gen_pool *pool,
- struct gen_pool_chunk *chunk,
- void *data)
-{
-   vfree((void *)chunk->start_addr);
-}
-
-static void ghes_estatus_pool_exit(void)
-{
-   gen_pool_for_each_chunk(ghes_estatus_pool,
-   ghes_estatus_pool_free_chunk, NULL);
-   gen_pool_destroy(ghes_estatus_pool);
+   return ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
+   GHES_ESTATUS_CACHE_ALLOCED_MAX);
 }
 
 static int ghes_estatus_pool_expand(unsigned long len)
@@ -1227,18 +1217,9 @@ static int __init ghes_init(void)
 
ghes_nmi_init_cxt();
 
-   rc = ghes_estatus_pool_init();
-   if (rc)
-   goto err;
-
-   rc = ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
- GHES_ESTATUS_CACHE_ALLOCED_MAX);
-   if (rc)
-   goto err_pool_exit;
-
	rc = platform_driver_register(&ghes_platform_driver);
if (rc)
-   goto err_pool_exit;
+   goto err;
 
rc = apei_osc_setup();
if (rc == 0 && osc_sb_apei_support_acked)
@@ -1251,8 +1232,6 @@ static int __init ghes_init(void)
pr_info(GHES_PFX "Failed to enable APEI firmware first 
mode.\n");
 
return 0;
-err_pool_exit:
-   ghes_estatus_pool_exit();
 err:
return rc;
 }
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index b1e9f81ebeea..097ba07a657b 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "apei-internal.h"
 
@@ -203,6 +204,11 @@ static int __init hest_ghes_dev_register(unsigned int 
ghes_count)
	rc = apei_hest_parse(hest_parse_ghes, &ghes_arr);
if (rc)
goto err;
+
+   rc = ghes_estatus_pool_init();
+   if (rc)
+   goto err;
+
 out:
kfree(ghes_arr.ghes_devs);
return rc;
@@ -251,7 +257,9 @@ void __init acpi_hest_init(void)
	rc = apei_hest_parse(hest_parse_ghes_count, &ghes_count);
if (rc)
goto err;
-   rc = hest_ghes_dev_register(ghes_count);
+
+   if (ghes_count)
+   rc = hest_ghes_dev_register(ghes_count);
if (rc)
goto err;
}
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 82cb4eb225a4..46ef5566e052 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -52,6 +52,8 @@ enum {
GHES_SEV_PANIC = 0x3,
 };
 
+int ghes_estatus_pool_init(void);
+
 /* From drivers/edac/ghes_edac.c */
 
 #ifdef CONFIG_EDAC_GHES
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 11/26] ACPI / APEI: Move NOTIFY_SEA between the estatus-queue and NOTIFY_NMI

2019-01-29 Thread James Morse
The estatus-queue code is currently hidden by the NOTIFY_NMI #ifdefs.
Once NOTIFY_SEA starts using the estatus-queue we can stop hiding
it as each architecture has a user that can't be turned off.

Split the existing CONFIG_HAVE_ACPI_APEI_NMI block in two, and move
the SEA code into the gap.

Move the code around ... and change the stale comment describing
why the status queue is necessary: printk() is no longer the issue,
it's the helpers like memory_failure_queue() that aren't NMI safe.

Signed-off-by: James Morse 
---
 drivers/acpi/apei/ghes.c | 113 ---
 1 file changed, 59 insertions(+), 54 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index bd58749d31bb..576dce29159d 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -767,66 +767,21 @@ static struct notifier_block ghes_notifier_hed = {
.notifier_call = ghes_notify_hed,
 };
 
-#ifdef CONFIG_ACPI_APEI_SEA
-static LIST_HEAD(ghes_sea);
-
-/*
- * Return 0 only if one of the SEA error sources successfully reported an error
- * record sent from the firmware.
- */
-int ghes_notify_sea(void)
-{
-   struct ghes *ghes;
-   int ret = -ENOENT;
-
-   rcu_read_lock();
-   list_for_each_entry_rcu(ghes, &ghes_sea, list) {
-   if (!ghes_proc(ghes))
-   ret = 0;
-   }
-   rcu_read_unlock();
-   return ret;
-}
-
-static void ghes_sea_add(struct ghes *ghes)
-{
-   mutex_lock(&ghes_list_mutex);
-   list_add_rcu(&ghes->list, &ghes_sea);
-   mutex_unlock(&ghes_list_mutex);
-}
-
-static void ghes_sea_remove(struct ghes *ghes)
-{
-   mutex_lock(&ghes_list_mutex);
-   list_del_rcu(&ghes->list);
-   mutex_unlock(&ghes_list_mutex);
-   synchronize_rcu();
-}
-#else /* CONFIG_ACPI_APEI_SEA */
-static inline void ghes_sea_add(struct ghes *ghes) { }
-static inline void ghes_sea_remove(struct ghes *ghes) { }
-#endif /* CONFIG_ACPI_APEI_SEA */
-
 #ifdef CONFIG_HAVE_ACPI_APEI_NMI
 /*
- * printk is not safe in NMI context.  So in NMI handler, we allocate
- * required memory from lock-less memory allocator
- * (ghes_estatus_pool), save estatus into it, put them into lock-less
- * list (ghes_estatus_llist), then delay printk into IRQ context via
- * irq_work (ghes_proc_irq_work).  ghes_estatus_size_request record
- * required pool size by all NMI error source.
+ * Handlers for CPER records may not be NMI safe. For example,
+ * memory_failure_queue() takes spinlocks and calls schedule_work_on().
+ * In any NMI-like handler, memory from ghes_estatus_pool is used to save
+ * estatus, and added to the ghes_estatus_llist. irq_work_queue() causes
+ * ghes_proc_in_irq() to run in IRQ context where each estatus in
+ * ghes_estatus_llist is processed.
+ *
+ * Memory from the ghes_estatus_pool is also used with the ghes_estatus_cache
+ * to suppress frequent messages.
  */
 static struct llist_head ghes_estatus_llist;
 static struct irq_work ghes_proc_irq_work;
 
-/*
- * NMI may be triggered on any CPU, so ghes_in_nmi is used for
- * having only one concurrent reader.
- */
-static atomic_t ghes_in_nmi = ATOMIC_INIT(0);
-
-static LIST_HEAD(ghes_nmi);
-
 static void ghes_proc_in_irq(struct irq_work *irq_work)
 {
struct llist_node *llnode, *next;
@@ -949,6 +904,56 @@ static int ghes_in_nmi_spool_from_list(struct list_head 
*rcu_list)
 
return ret;
 }
+#endif /* CONFIG_HAVE_ACPI_APEI_NMI */
+
+#ifdef CONFIG_ACPI_APEI_SEA
+static LIST_HEAD(ghes_sea);
+
+/*
+ * Return 0 only if one of the SEA error sources successfully reported an error
+ * record sent from the firmware.
+ */
+int ghes_notify_sea(void)
+{
+   struct ghes *ghes;
+   int ret = -ENOENT;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(ghes, &ghes_sea, list) {
+   if (!ghes_proc(ghes))
+   ret = 0;
+   }
+   rcu_read_unlock();
+   return ret;
+}
+
+static void ghes_sea_add(struct ghes *ghes)
+{
+   mutex_lock(&ghes_list_mutex);
+   list_add_rcu(&ghes->list, &ghes_sea);
+   mutex_unlock(&ghes_list_mutex);
+}
+
+static void ghes_sea_remove(struct ghes *ghes)
+{
+   mutex_lock(&ghes_list_mutex);
+   list_del_rcu(&ghes->list);
+   mutex_unlock(&ghes_list_mutex);
+   synchronize_rcu();
+}
+#else /* CONFIG_ACPI_APEI_SEA */
+static inline void ghes_sea_add(struct ghes *ghes) { }
+static inline void ghes_sea_remove(struct ghes *ghes) { }
+#endif /* CONFIG_ACPI_APEI_SEA */
+
+#ifdef CONFIG_HAVE_ACPI_APEI_NMI
+/*
+ * NMI may be triggered on any CPU, so ghes_in_nmi is used for
+ * having only one concurrent reader.
+ */
+static atomic_t ghes_in_nmi = ATOMIC_INIT(0);
+
+static LIST_HEAD(ghes_nmi);
 
 static int ghes_notify_nmi(unsigned int cmd, struct pt_regs *regs)
 {
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 08/26] ACPI / APEI: Don't update struct ghes' flags in read/clear estatus

2019-01-29 Thread James Morse
ghes_read_estatus() sets a flag in struct ghes if the buffer of
CPER records needs to be cleared once the records have been
processed. This flag value is a problem if a struct ghes can be
processed concurrently, as happens at probe time if an NMI arrives
for the same error source. The NMI clears the flag, meaning the
interrupted handler may never do the ghes_estatus_clear() work.

The GHES_TO_CLEAR flags is only set at the same time as
buffer_paddr, which is now owned by the caller and passed to
ghes_clear_estatus(). Use this value as the flag.

A non-zero buf_paddr returned by ghes_read_estatus() means
ghes_clear_estatus() should clear this address. ghes_read_estatus()
already checks for a read of error_status_address being zero,
so CPER records cannot be written here.

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 

--
Changes since v6:
 * Added Boris' RB, then:
 * Moved earlier in the series,
 * Tinkered with the commit message,
 * Always cleared buf_paddr on errors in the previous patch, which was
   previously in here.
---
 drivers/acpi/apei/ghes.c | 5 -
 include/acpi/ghes.h  | 1 -
 2 files changed, 6 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index c20e1d0947b1..af3c10f47f20 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -329,8 +329,6 @@ static int ghes_read_estatus(struct ghes *ghes, u64 
*buf_paddr)
return -ENOENT;
}
 
-   ghes->flags |= GHES_TO_CLEAR;
-
rc = -EIO;
len = cper_estatus_len(ghes->estatus);
if (len < sizeof(*ghes->estatus))
@@ -357,15 +355,12 @@ static int ghes_read_estatus(struct ghes *ghes, u64 
*buf_paddr)
 static void ghes_clear_estatus(struct ghes *ghes, u64 buf_paddr)
 {
ghes->estatus->block_status = 0;
-   if (!(ghes->flags & GHES_TO_CLEAR))
-   return;
 
if (!buf_paddr)
return;
 
ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
  sizeof(ghes->estatus->block_status), 0);
-   ghes->flags &= ~GHES_TO_CLEAR;
 }
 
 static void ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata, 
int sev)
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index f82f4a7ddd90..e3f1cddb4ac8 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -13,7 +13,6 @@
  * estatus: memory buffer for error status block, allocated during
  * HEST parsing.
  */
-#define GHES_TO_CLEAR  0x0001
 #define GHES_EXITING   0x0002
 
 struct ghes {
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 01/26] ACPI / APEI: Don't wait to serialise with oops messages when panic()ing

2019-01-29 Thread James Morse
oops_begin() exists to group printk() messages with the oops message
printed by die(). To reach this caller we know that platform firmware
took this error first, then notified the OS via NMI with a 'panic'
severity.

Don't wait for another CPU to release the die-lock before panic()ing,
our only goal is to print this fatal error and panic().

This code is always called in_nmi(), and since commit 42a0bb3f7138
("printk/nmi: generic solution for safe printk in NMI"), it has been
safe to call printk() from this context. Messages are batched in a
per-cpu buffer and printed via irq-work, or a call back from panic().

Link: https://patchwork.kernel.org/patch/10313555/
Acked-by: Borislav Petkov 
Signed-off-by: James Morse 

---
Changes since v6:
 * Capitals in patch subject
 * Tinkered with the commit message.
---
 drivers/acpi/apei/ghes.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f008ba7c9ced..0c46b79e31b1 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -33,7 +33,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -949,7 +948,6 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs 
*regs)
 
sev = ghes_severity(ghes->estatus->error_severity);
if (sev >= GHES_SEV_PANIC) {
-   oops_begin();
ghes_print_queued_estatus();
__ghes_panic(ghes);
}
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 03/26] ACPI / APEI: Switch estatus pool to use vmalloc memory

2019-01-29 Thread James Morse
The ghes code is careful to parse and round firmware's advertised
memory requirements for CPER records, up to a maximum of 64K.
However when ghes_estatus_pool_expand() does its work, it splits
the requested size into PAGE_SIZE granules.

This means if firmware generates 5K of CPER records, and correctly
describes this in the table, __process_error() will silently fail as it
is unable to allocate more than PAGE_SIZE.

Switch the estatus pool to vmalloc() memory. On x86 vmalloc() memory
may fault and be fixed up by vmalloc_fault(). To prevent this call
vmalloc_sync_all() before an NMI handler could discover the memory.

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 
---
 drivers/acpi/apei/ghes.c | 30 +++---
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index f0a704aed040..ee9206d5e119 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -170,40 +170,40 @@ static int ghes_estatus_pool_init(void)
return 0;
 }
 
-static void ghes_estatus_pool_free_chunk_page(struct gen_pool *pool,
+static void ghes_estatus_pool_free_chunk(struct gen_pool *pool,
  struct gen_pool_chunk *chunk,
  void *data)
 {
-   free_page(chunk->start_addr);
+   vfree((void *)chunk->start_addr);
 }
 
 static void ghes_estatus_pool_exit(void)
 {
gen_pool_for_each_chunk(ghes_estatus_pool,
-   ghes_estatus_pool_free_chunk_page, NULL);
+   ghes_estatus_pool_free_chunk, NULL);
gen_pool_destroy(ghes_estatus_pool);
 }
 
 static int ghes_estatus_pool_expand(unsigned long len)
 {
-   unsigned long i, pages, size, addr;
-   int ret;
+   unsigned long size, addr;
 
ghes_estatus_pool_size_request += PAGE_ALIGN(len);
size = gen_pool_size(ghes_estatus_pool);
if (size >= ghes_estatus_pool_size_request)
return 0;
-   pages = (ghes_estatus_pool_size_request - size) / PAGE_SIZE;
-   for (i = 0; i < pages; i++) {
-   addr = __get_free_page(GFP_KERNEL);
-   if (!addr)
-   return -ENOMEM;
-   ret = gen_pool_add(ghes_estatus_pool, addr, PAGE_SIZE, -1);
-   if (ret)
-   return ret;
-   }
 
-   return 0;
+   addr = (unsigned long)vmalloc(PAGE_ALIGN(len));
+   if (!addr)
+   return -ENOMEM;
+
+   /*
+* New allocation must be visible in all pgd before it can be found by
+* an NMI allocating from the pool.
+*/
+   vmalloc_sync_all();
+
+   return gen_pool_add(ghes_estatus_pool, addr, PAGE_ALIGN(len), -1);
 }
 
 static int map_gen_v2(struct ghes *ghes)
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 07/26] ACPI / APEI: Remove spurious GHES_TO_CLEAR check

2019-01-29 Thread James Morse
ghes_notify_nmi() checks ghes->flags for GHES_TO_CLEAR before going
on to __process_error(). This is pointless as ghes_read_estatus()
will always set this flag if it returns success, which was checked
earlier in the loop. Remove it.

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 
---
 drivers/acpi/apei/ghes.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index a34f79153b1a..c20e1d0947b1 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -940,9 +940,6 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs 
*regs)
__ghes_panic(ghes, buf_paddr);
}
 
-   if (!(ghes->flags & GHES_TO_CLEAR))
-   continue;
-
__process_error(ghes);
ghes_clear_estatus(ghes, buf_paddr);
}
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 14/26] arm64: KVM/mm: Move SEA handling behind a single 'claim' interface

2019-01-29 Thread James Morse
To split up APEIs in_nmi() path, the caller needs to always be
in_nmi(). Add a helper to do the work and claim the notification.

When KVM or the arch code takes an exception that might be a RAS
notification, it asks the APEI firmware-first code whether it wants
to claim the exception. A future kernel-first mechanism may be queried
afterwards, and claim the notification, otherwise we fall through
to the existing default behaviour.

The NOTIFY_SEA code was merged before considering multiple, possibly
interacting, NMI-like notifications and the need to consider kernel
first in the future. Make the 'claiming' behaviour explicit.

Restructuring the APEI code to allow multiple NMI-like notifications
means any notification that might interrupt interrupts-masked
code must always be wrapped in nmi_enter()/nmi_exit(). This will
allow APEI to use in_nmi() to use the right fixmap entries.

Mask SError over this window to prevent an asynchronous RAS error
arriving and tripping 'nmi_enter()'s BUG_ON(in_nmi()).

Signed-off-by: James Morse 
Acked-by: Marc Zyngier 
Tested-by: Tyler Baicar 
Acked-by: Catalin Marinas 

---
Why does apei_claim_sea() take a pt_regs? This gets used later to take
APEI by the hand through NMI->IRQ context, depending on what we
interrupted.

Changes since v6:
 * Moved the voice of the commit message.
 * moved arch_local_save_flags() below the !IS_ENABLED drop-out
 * Moved the dummy declaration into the if-ACPI part of the header instead
   of if-APEI.

Changes since v4:
 * Made irqs-unmasked comment a lockdep assert.

Changes since v3:
 * Removed spurious whitespace change
 * Updated comment in acpi.c to cover SError masking

Changes since v2:
 * Added dummy definition for !ACPI and culled IS_ENABLED() checks.
---
 arch/arm64/include/asm/acpi.h  |  4 +++-
 arch/arm64/include/asm/daifflags.h |  1 +
 arch/arm64/include/asm/kvm_ras.h   | 16 ++-
 arch/arm64/kernel/acpi.c   | 31 ++
 arch/arm64/mm/fault.c  | 24 +--
 5 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h
index 2def77ec14be..7628efbe6c12 100644
--- a/arch/arm64/include/asm/acpi.h
+++ b/arch/arm64/include/asm/acpi.h
@@ -18,6 +18,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -110,9 +111,10 @@ static inline u32 get_acpi_id_for_cpu(unsigned int cpu)
 
 static inline void arch_fix_phys_package_id(int num, u32 slot) { }
 void __init acpi_init_cpus(void);
-
+int apei_claim_sea(struct pt_regs *regs);
 #else
 static inline void acpi_init_cpus(void) { }
+static inline int apei_claim_sea(struct pt_regs *regs) { return -ENOENT; }
 #endif /* CONFIG_ACPI */
 
 #ifdef CONFIG_ARM64_ACPI_PARKING_PROTOCOL
diff --git a/arch/arm64/include/asm/daifflags.h 
b/arch/arm64/include/asm/daifflags.h
index 8d91f2233135..fa90779fc752 100644
--- a/arch/arm64/include/asm/daifflags.h
+++ b/arch/arm64/include/asm/daifflags.h
@@ -20,6 +20,7 @@
 
 #define DAIF_PROCCTX   0
 #define DAIF_PROCCTX_NOIRQ PSR_I_BIT
+#define DAIF_ERRCTX(PSR_I_BIT | PSR_A_BIT)
 
 /* mask/save/unmask/restore all exceptions, including interrupts. */
 static inline void local_daif_mask(void)
diff --git a/arch/arm64/include/asm/kvm_ras.h b/arch/arm64/include/asm/kvm_ras.h
index 6096f0251812..8ac6ee77437c 100644
--- a/arch/arm64/include/asm/kvm_ras.h
+++ b/arch/arm64/include/asm/kvm_ras.h
@@ -4,8 +4,22 @@
 #ifndef __ARM64_KVM_RAS_H__
 #define __ARM64_KVM_RAS_H__
 
+#include 
+#include 
 #include 
 
-int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr);
+#include 
+
+/*
+ * Was this synchronous external abort a RAS notification?
+ * Returns '0' for errors handled by some RAS subsystem, or -ENOENT.
+ */
+static inline int kvm_handle_guest_sea(phys_addr_t addr, unsigned int esr)
+{
+   /* apei_claim_sea(NULL) expects to mask interrupts itself */
+   lockdep_assert_irqs_enabled();
+
+   return apei_claim_sea(NULL);
+}
 
 #endif /* __ARM64_KVM_RAS_H__ */
diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index 44e3c351e1ea..803f0494dd3e 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -27,8 +27,10 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -256,3 +258,32 @@ pgprot_t __acpi_get_mem_attribute(phys_addr_t addr)
return __pgprot(PROT_NORMAL_NC);
return __pgprot(PROT_DEVICE_nGnRnE);
 }
+
+/*
+ * Claim Synchronous External Aborts as a firmware first notification.
+ *
+ * Used by KVM and the arch do_sea handler.
+ * @regs may be NULL when called from process context.
+ */
+int apei_claim_sea(struct pt_regs *regs)
+{
+   int err = -ENOENT;
+   unsigned long current_flags;
+
+   if (!IS_ENABLED(CONFIG_ACPI_APEI_GHES))
+   return err;
+
+   current_flags = arch_local_save_flags();
+
+   /*
+* SEA can interrupt SError, mask it and describe this as 

[PATCH v8 05/26] ACPI / APEI: Make estatus pool allocation a static size

2019-01-29 Thread James Morse
Adding new NMI-like notifications duplicates the calls that grow
and shrink the estatus pool. This is all pretty pointless, as the
size is capped to 64K. Allocate this for each ghes and drop
the code that grows and shrinks the pool.

Suggested-by: Borislav Petkov 
Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 
---
 drivers/acpi/apei/ghes.c | 49 +---
 drivers/acpi/apei/hest.c |  2 +-
 include/acpi/ghes.h  |  2 +-
 3 files changed, 8 insertions(+), 45 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 4150c72c78cb..33144ab0661a 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -162,27 +162,18 @@ static void ghes_iounmap_irq(void)
clear_fixmap(FIX_APEI_GHES_IRQ);
 }
 
-static int ghes_estatus_pool_expand(unsigned long len); //temporary
-
-int ghes_estatus_pool_init(void)
+int ghes_estatus_pool_init(int num_ghes)
 {
+   unsigned long addr, len;
+
ghes_estatus_pool = gen_pool_create(GHES_ESTATUS_POOL_MIN_ALLOC_ORDER, 
-1);
if (!ghes_estatus_pool)
return -ENOMEM;
 
-   return ghes_estatus_pool_expand(GHES_ESTATUS_CACHE_AVG_SIZE *
-   GHES_ESTATUS_CACHE_ALLOCED_MAX);
-}
-
-static int ghes_estatus_pool_expand(unsigned long len)
-{
-   unsigned long size, addr;
-
-   ghes_estatus_pool_size_request += PAGE_ALIGN(len);
-   size = gen_pool_size(ghes_estatus_pool);
-   if (size >= ghes_estatus_pool_size_request)
-   return 0;
+   len = GHES_ESTATUS_CACHE_AVG_SIZE * GHES_ESTATUS_CACHE_ALLOCED_MAX;
+   len += (num_ghes * GHES_ESOURCE_PREALLOC_MAX_SIZE);
 
+   ghes_estatus_pool_size_request = PAGE_ALIGN(len);
addr = (unsigned long)vmalloc(PAGE_ALIGN(len));
if (!addr)
return -ENOMEM;
@@ -956,32 +947,8 @@ static int ghes_notify_nmi(unsigned int cmd, struct 
pt_regs *regs)
return ret;
 }
 
-static unsigned long ghes_esource_prealloc_size(
-   const struct acpi_hest_generic *generic)
-{
-   unsigned long block_length, prealloc_records, prealloc_size;
-
-   block_length = min_t(unsigned long, generic->error_block_length,
-GHES_ESTATUS_MAX_SIZE);
-   prealloc_records = max_t(unsigned long,
-generic->records_to_preallocate, 1);
-   prealloc_size = min_t(unsigned long, block_length * prealloc_records,
- GHES_ESOURCE_PREALLOC_MAX_SIZE);
-
-   return prealloc_size;
-}
-
-static void ghes_estatus_pool_shrink(unsigned long len)
-{
-   ghes_estatus_pool_size_request -= PAGE_ALIGN(len);
-}
-
 static void ghes_nmi_add(struct ghes *ghes)
 {
-   unsigned long len;
-
-   len = ghes_esource_prealloc_size(ghes->generic);
-   ghes_estatus_pool_expand(len);
mutex_lock(_list_mutex);
if (list_empty(_nmi))
register_nmi_handler(NMI_LOCAL, ghes_notify_nmi, 0, "ghes");
@@ -991,8 +958,6 @@ static void ghes_nmi_add(struct ghes *ghes)
 
 static void ghes_nmi_remove(struct ghes *ghes)
 {
-   unsigned long len;
-
mutex_lock(_list_mutex);
list_del_rcu(>list);
if (list_empty(_nmi))
@@ -1003,8 +968,6 @@ static void ghes_nmi_remove(struct ghes *ghes)
 * freed after NMI handler finishes.
 */
synchronize_rcu();
-   len = ghes_esource_prealloc_size(ghes->generic);
-   ghes_estatus_pool_shrink(len);
 }
 
 static void ghes_nmi_init_cxt(void)
diff --git a/drivers/acpi/apei/hest.c b/drivers/acpi/apei/hest.c
index 097ba07a657b..fcc8cc1e4f3d 100644
--- a/drivers/acpi/apei/hest.c
+++ b/drivers/acpi/apei/hest.c
@@ -205,7 +205,7 @@ static int __init hest_ghes_dev_register(unsigned int 
ghes_count)
if (rc)
goto err;
 
-   rc = ghes_estatus_pool_init();
+   rc = ghes_estatus_pool_init(ghes_count);
if (rc)
goto err;
 
diff --git a/include/acpi/ghes.h b/include/acpi/ghes.h
index 46ef5566e052..cd9ee507d860 100644
--- a/include/acpi/ghes.h
+++ b/include/acpi/ghes.h
@@ -52,7 +52,7 @@ enum {
GHES_SEV_PANIC = 0x3,
 };
 
-int ghes_estatus_pool_init(void);
+int ghes_estatus_pool_init(int num_ghes);
 
 /* From drivers/edac/ghes_edac.c */
 
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 02/26] ACPI / APEI: Remove silent flag from ghes_read_estatus()

2019-01-29 Thread James Morse
Subsequent patches will split up ghes_read_estatus(), at which
point passing around the 'silent' flag gets annoying. This is to
suppress printk() messages, which prior to commit 42a0bb3f7138
("printk/nmi: generic solution for safe printk in NMI"), were
unsafe in NMI context.

This is no longer necessary, remove the flag. printk() messages
are batched in a per-cpu buffer and printed via irq-work, or a call
back from panic().

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 

---
Changes since v6:
 * Moved earlier in the series,
 * Tinkered with the commit message.
 * switched to pr_warn_ratelimited() to shut checkpatch up
---
 drivers/acpi/apei/ghes.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 0c46b79e31b1..f0a704aed040 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -324,7 +324,7 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, 
u32 len,
}
 }
 
-static int ghes_read_estatus(struct ghes *ghes, int silent)
+static int ghes_read_estatus(struct ghes *ghes)
 {
struct acpi_hest_generic *g = ghes->generic;
u64 buf_paddr;
@@ -333,8 +333,7 @@ static int ghes_read_estatus(struct ghes *ghes, int silent)
 
rc = apei_read(_paddr, >error_status_address);
if (rc) {
-   if (!silent && printk_ratelimit())
-   pr_warning(FW_WARN GHES_PFX
+   pr_warn_ratelimited(FW_WARN GHES_PFX
 "Failed to read error status block address for hardware error source: %d.\n",
   g->header.source_id);
return -EIO;
@@ -366,9 +365,9 @@ static int ghes_read_estatus(struct ghes *ghes, int silent)
rc = 0;
 
 err_read_block:
-   if (rc && !silent && printk_ratelimit())
-   pr_warning(FW_WARN GHES_PFX
-  "Failed to read error status block!\n");
+   if (rc)
+   pr_warn_ratelimited(FW_WARN GHES_PFX
+   "Failed to read error status block!\n");
return rc;
 }
 
@@ -702,7 +701,7 @@ static int ghes_proc(struct ghes *ghes)
 {
int rc;
 
-   rc = ghes_read_estatus(ghes, 0);
+   rc = ghes_read_estatus(ghes);
if (rc)
goto out;
 
@@ -939,7 +938,7 @@ static int ghes_notify_nmi(unsigned int cmd, struct pt_regs 
*regs)
return ret;
 
list_for_each_entry_rcu(ghes, _nmi, list) {
-   if (ghes_read_estatus(ghes, 1)) {
+   if (ghes_read_estatus(ghes)) {
ghes_clear_estatus(ghes);
continue;
} else {
-- 
2.20.1

___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


[PATCH v8 06/26] ACPI / APEI: Don't store CPER records physical address in struct ghes

2019-01-29 Thread James Morse
When CPER records are found the address of the records is stashed
in the struct ghes. Once the records have been processed, this
address is overwritten with zero so that it won't be processed
again without being re-populated by firmware.

This goes wrong if a struct ghes can be processed concurrently,
as can happen at probe time when an NMI occurs. If the NMI arrives
on another CPU, the probing CPU may call ghes_clear_estatus() on the
records before the handler had finished with them.
Even on the same CPU, once the interrupted handler is resumed, it
will call ghes_clear_estatus() on the NMIs records, this memory may
have already been re-used by firmware.

Avoid this stashing by letting the caller hold the address. A
later patch will do away with the use of ghes->flags in the
read/clear code too.

Signed-off-by: James Morse 
Reviewed-by: Borislav Petkov 
---
Changes since v7:
 * Added buf_paddr to ghes_panic, as it wants to print the estatus

Changes since v6:
 * Moved earlier in the series
 * Added buf_paddr = 0 on all the error paths, and test for it in
   ghes_clear_estatus() for extra sanity.
---
 drivers/acpi/apei/ghes.c | 46 +++-
 include/acpi/ghes.h  |  1 -
 2 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c
index 33144ab0661a..a34f79153b1a 100644
--- a/drivers/acpi/apei/ghes.c
+++ b/drivers/acpi/apei/ghes.c
@@ -305,29 +305,30 @@ static void ghes_copy_tofrom_phys(void *buffer, u64 
paddr, u32 len,
}
 }
 
-static int ghes_read_estatus(struct ghes *ghes)
+static int ghes_read_estatus(struct ghes *ghes, u64 *buf_paddr)
 {
struct acpi_hest_generic *g = ghes->generic;
-   u64 buf_paddr;
u32 len;
int rc;
 
-   rc = apei_read(_paddr, >error_status_address);
+   rc = apei_read(buf_paddr, >error_status_address);
if (rc) {
+   *buf_paddr = 0;
pr_warn_ratelimited(FW_WARN GHES_PFX
 "Failed to read error status block address for hardware error source: %d.\n",
   g->header.source_id);
return -EIO;
}
-   if (!buf_paddr)
+   if (!*buf_paddr)
return -ENOENT;
 
-   ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
+   ghes_copy_tofrom_phys(ghes->estatus, *buf_paddr,
  sizeof(*ghes->estatus), 1);
-   if (!ghes->estatus->block_status)
+   if (!ghes->estatus->block_status) {
+   *buf_paddr = 0;
return -ENOENT;
+   }
 
-   ghes->buffer_paddr = buf_paddr;
ghes->flags |= GHES_TO_CLEAR;
 
rc = -EIO;
@@ -339,7 +340,7 @@ static int ghes_read_estatus(struct ghes *ghes)
if (cper_estatus_check_header(ghes->estatus))
goto err_read_block;
ghes_copy_tofrom_phys(ghes->estatus + 1,
- buf_paddr + sizeof(*ghes->estatus),
+ *buf_paddr + sizeof(*ghes->estatus),
  len - sizeof(*ghes->estatus), 1);
if (cper_estatus_check(ghes->estatus))
goto err_read_block;
@@ -349,15 +350,20 @@ static int ghes_read_estatus(struct ghes *ghes)
if (rc)
pr_warn_ratelimited(FW_WARN GHES_PFX
"Failed to read error status block!\n");
+
return rc;
 }
 
-static void ghes_clear_estatus(struct ghes *ghes)
+static void ghes_clear_estatus(struct ghes *ghes, u64 buf_paddr)
 {
ghes->estatus->block_status = 0;
if (!(ghes->flags & GHES_TO_CLEAR))
return;
-   ghes_copy_tofrom_phys(ghes->estatus, ghes->buffer_paddr,
+
+   if (!buf_paddr)
+   return;
+
+   ghes_copy_tofrom_phys(ghes->estatus, buf_paddr,
  sizeof(ghes->estatus->block_status), 0);
ghes->flags &= ~GHES_TO_CLEAR;
 }
@@ -666,11 +672,11 @@ static int ghes_ack_error(struct acpi_hest_generic_v2 
*gv2)
return apei_write(val, >read_ack_register);
 }
 
-static void __ghes_panic(struct ghes *ghes)
+static void __ghes_panic(struct ghes *ghes, u64 buf_paddr)
 {
__ghes_print_estatus(KERN_EMERG, ghes->generic, ghes->estatus);
 
-   ghes_clear_estatus(ghes);
+   ghes_clear_estatus(ghes, buf_paddr);
 
/* reboot to log the error! */
if (!panic_timeout)
@@ -680,14 +686,15 @@ static void __ghes_panic(struct ghes *ghes)
 
 static int ghes_proc(struct ghes *ghes)
 {
+   u64 buf_paddr;
int rc;
 
-   rc = ghes_read_estatus(ghes);
+   rc = ghes_read_estatus(ghes, _paddr);
if (rc)
goto out;
 
if (ghes_severity(ghes->estatus->error_severity) >= GHES_SEV_PANIC) {
-   __ghes_panic(ghes);
+   __ghes_panic(ghes, buf_paddr);
}
 
if (!ghes_estatus_cached(ghes->estatus)) {
@@ -697,7 +704,7 @@ static int ghes_proc(struct ghes *ghes)
ghes_do_proc(ghes, 

[PATCH v8 00/26] APEI in_nmi() rework and SDEI wire-up

2019-01-29 Thread James Morse
Changes since v7?
 * Removed the memory allocation in the task_work stuff.
 * More user-friendly and easier on the eye,
 * Switched the irq-mask testing in the arch code to be safe before
   Julien's GIC PMR series.
Specific changes are noted in each patch.


This series aims to wire-up arm64's fancy new software-NMI notifications
for firmware-first RAS. These need to use the estatus-queue, which is
also needed for notifications via emulated-SError. All of these
things take the 'in_nmi()' path through ghes_copy_tofrom_phys(), and
so will deadlock if they can interact, which they might.

To that end, this series removes the in_nmi() stuff from ghes.c.
Locks are pushed out to the notification helpers, and fixmap entries
are passed in to the code that needs them. This means the estatus-queue
users can interrupt each other however they like.

While doing this there is a fair amount of cleanup, which is (now) at the
beginning of the series. NMI-like notifications interrupting
ghes_probe() can go wrong for three different reasons. CPER record
blocks greater than PAGE_SIZE don't work.
The estatus-pool allocation is simplified and the silent-flag/oops-begin
is removed.

Nothing in this series is intended as fixes, as its all cleanup or
never-worked.

--%<--
The earlier boiler-plate:

What's SDEI? Its ARM's "Software Delegated Exception Interface" [0]. It's
used by firmware to tell the OS about firmware-first RAS events.

These Software exceptions can interrupt anything, so I describe them as
NMI-like. They aren't the only NMI-like way to notify the OS about
firmware-first RAS events, the ACPI spec also defines 'NOTIFY_SEA' and
'NOTIFY_SEI'.

(Acronyms: SEA, Synchronous External Abort. The CPU requested some memory,
but the owner of that memory said no. These are always synchronous with the
instruction that caused them. SEI, System-Error Interrupt, commonly called
SError. This is an asynchronous external abort, the memory-owner didn't say no
at the right point. Collectively these things are called external-aborts.
How is firmware involved? It traps these and re-injects them into the kernel
once it's written the CPER records).

APEI's GHES code only expects one source of NMI. If a platform implements
more than one of these mechanisms, APEI needs to handle the interaction.
'SEA' and 'SEI' can interact as 'SEI' is asynchronous. SDEI can interact
with itself: its exceptions can be 'normal' or 'critical', and firmware
could use both types for RAS. (errors using normal, 'panic-now' using
critical).
--%<--

This series is base on v5.0-rc1, and can be retrieved from:
git://linux-arm.org/linux-jm.git -b apei_ioremap_rework/v8


Known issues:
 * ghes_copy_tofrom_phys() already takes a lock in NMI context, this
   series moves that around, and makes sure we never try to take the
   same lock from different NMI-like notifications. Since the switch to
   queued spinlocks it looks like the kernel can only be 4 context's
   deep in spinlock, which arm64 could exceed as it doesn't have a
   single architected NMI. This would be fixed by dropping back to
   test-and-set when the nesting gets too deep:
 lore.kernel.org/r/1548215351-18896-1-git-send-email-long...@redhat.com

* Taking an NMI from a KVM guest on arm64 with VHE leaves HCR_EL2.TGE
  clear, meaning AT and TLBI point at the guest, and PAN/UAO are squiffy.
  Only TLBI matters for APEI, and this is fixed by Julien's patch:
 
http://lore.kernel.org/r/1548084825-8803-2-git-send-email-julien.thie...@arm.com

* Linux ignores the physical address mask, meaning it doesn't call
  memory_failure() on all the affected pages if firmware or hypervisor
  believe in a different page size. Easy to hit on arm64, (easy to fix too,
  it just conflicts with this series)


[v7] 
https://lore.kernel.org/linux-arm-kernel/20181203180613.228133-1-james.mo...@arm.com/
[v6] https://www.spinics.net/lists/linux-acpi/msg84228.html
[v5] https://www.spinics.net/lists/linux-acpi/msg82993.html
[v4] https://www.spinics.net/lists/arm-kernel/msg653078.html
[v3] https://www.spinics.net/lists/arm-kernel/msg649230.html

[0] 
https://static.docs.arm.com/den0054/a/ARM_DEN0054A_Software_Delegated_Exception_Interface.pdf


James Morse (26):
  ACPI / APEI: Don't wait to serialise with oops messages when
panic()ing
  ACPI / APEI: Remove silent flag from ghes_read_estatus()
  ACPI / APEI: Switch estatus pool to use vmalloc memory
  ACPI / APEI: Make hest.c manage the estatus memory pool
  ACPI / APEI: Make estatus pool allocation a static size
  ACPI / APEI: Don't store CPER records physical address in struct ghes
  ACPI / APEI: Remove spurious GHES_TO_CLEAR check
  ACPI / APEI: Don't update struct ghes' flags in read/clear estatus
  ACPI / APEI: Generalise the estatus queue's notify code
  ACPI / APEI: Don't allow ghes_ack_error() to mask earlier errors
  ACPI / APEI: Move NOTIFY_SEA between the estatus-queue and NOTIFY_NMI
  ACPI / APEI: Switch NOTIFY_SEA to use the estatus queue
  

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-29 Thread James Morse
Hi Boris,

On 29/01/2019 11:49, Borislav Petkov wrote:
> On Wed, Jan 23, 2019 at 06:36:38PM +, James Morse wrote:
>> Do you consider ENOENT an error? We don't ack in that case as the
>> memory wasn't in use.
> 
> So let's see:
> 
> if (!*buf_paddr)
> return -ENOENT;
> 
> can happen when apei_read() has returned 0 but it has managed to do
> 
>   *val = 0;

> Now, that function returns error values which we should be checking
> but we're checking the buf_paddr pointed to value for being 0. Are
> we fearing that even if acpi_os_read_memory() or acpi_os_read_port()
> succeed, *buf_paddr could still be 0 ?

That's what this code is doing, checking for a successful read, of zero.
The g->error_status_address has to point somewhere as its location is advertised
in the tables.

What is the value of g->error_status_address 'out of reset' or before any error
has occurred? This code expects it to be zero, or to point to a CPER block with
an empty block_status.

(the acpi spec is unclear on when *(g->error_status_address) is written)


> Because if not, we should be checking whether rc == -EINVAL and then
> convert it to -ENOENT.

EINVAL implies the reg->space_id wasn't one of the two "System IO or System
Memory". (I thought the spec required this, but it only says this for EINJ:
'This constraint is an attempt to ensure that the registers are accessible in
the presence of hardware error conditions'.)

apei_check_gar() checks for these two in apei_map_generic_address(), so if this
is the case we would have failed at ghes_new() time.


> But ghes_read_estatus() handles the error case first *and* *then* checks
> buf_paddr too, to make really really sure we won't be reading from
> address 0.

I think this is the distinction between 'failed to read', (because
g->error_status_address has bad alignment or an unsupported address-space
id/access-size), and successfully read 0, which is treated as ENOENT.


>> For the other cases its because the records are bogus, but we still
>> unconditionally tell firmware we're done with them.
> 
> ... to free the memory, yes, ok.
> 
 I think it is. 18.3.2.8 of ACPI v6.2 (search for Generic Hardware Error 
 Source
 version 2", then below the table):
 * OSPM detects error (via interrupt/exception or polling the block status)
 * OSPM copies the error status block
 * OSPM clears the block status field of the error status block
 * OSPM acknowledges the error via Read Ack register

 The ENOENT case is excluded by 'polling the block status'.
>>>
>>> Ok, so we signal the absence of an error record with ENOENT.
>>>
>>> if (!buf_paddr)
>>> return -ENOENT;
>>>
>>> Can that even happen?
>>
>> Yes, for NOTIFY_POLLED its the norm. For the IRQ flavours that walk a list of
>> GHES, all but one of them will return ENOENT.


> Lemme get this straight: when we do
> 
>   apei_read(_paddr, >error_status_address);
> 
> in the polled case, buf_paddr can be 0?

If firmware has never generated CPER records, so it has never written to void
*error_status_address, yes.

There seem to be two ways of doing this. This zero check implies an example
system could be:
| g->error_status_address == 0xf00d
| *(u64 *)0xf00d == 0
Firmware populates CPER records, then updates 0xf00d.
(0xf00d would have been pre-mapped by apei_map_generic_address() in ghes_new())
Reads of 0xf00d before CPER records are generated get 0.

Once an error occurs, this system now looks like this:
| g->error_status_address == 0xf00d
| *(u64 *)0xf00d == 0xbeef
| *(u64 *)0xbeef == 0

For new errors, firmware populates CPER records, then updates 0xf00d.
Alternatively firmware could re-use the memory at 0xbeef, generating the CPER
records backwards, so that once 0xbeef is updated, the rest of the record is
visible. (firmware knows not to race with another CPU right?)

Firmware could equally point 0xf00d at 0xbeef at startup, so it has one fewer
values to write when an error occurs. I have an arm64 system with a HEST that
does this. (I'm pretty sure its ACPI support is a copy-and-paste from x86, it
even describes NOTIFY_NMI, who knows what that means on arm!)

When linux processes an error, ghes_clear_estatus() NULLs the
estatus->block_status, (which in this example is at 0xbeef). This is the
documented sequence for GHESv2.
Elsewhere the spec talks of checking the block status which is part of the
records, (not the error_status_address, which is the pointer to the records).

Linux can't NULL 0xf00d, because it doesn't know if firmware will write it again
next time it updates the records.
I can't find where in the spec it says the error status address is written to.
Linux works with both 'at boot' and 'on each error'.
If it were know to have a static value, ghes_copy_tofrom_phys() would not have
been necessary, but its been there since d334a49113a4.

In the worst case, if there is a value at the error_status_address, we have to
map/unmap it every time we poll in 

Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-29 Thread James Morse
Hi Tyler,

On 11/01/2019 20:53, Tyler Baicar wrote:
> On Fri, Jan 11, 2019 at 1:09 PM James Morse  wrote:
>> We can return on ENOENT out earlier, as nothing needs doing in that case. Its
>> what the GHES_TO_CLEAR spaghetti is for, we can probably move the ack thing 
>> into
>> ghes_clear_estatus(), that way that thing means 'I'm done with this memory'.
>>
>> Something like:
>> -
>> rc = ghes_read_estatus();
>> if (rc == -ENOENT)
>> return 0;
> 
> We still should be returning at least the -ENOENT from ghes_read_estatus().
> That is being used by the SEA handling to determine if an SEA was properly
> reported/handled by the host kernel in the KVM SEA case.

Sorry, my terrible example code. You'll be glad to know I would have caught this
when testing it!


Thanks,

James
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [RFC v3 02/21] iommu: Introduce cache_invalidate API

2019-01-29 Thread Auger Eric
Hi Jean-Philippe,

On 1/28/19 6:32 PM, Jean-Philippe Brucker wrote:
> Hi Eric,
> 
> On 25/01/2019 16:49, Auger Eric wrote:
> [...]
 diff --git a/include/uapi/linux/iommu.h b/include/uapi/linux/iommu.h
 index 7a7cf7a3de7c..4605f5cfac84 100644
 --- a/include/uapi/linux/iommu.h
 +++ b/include/uapi/linux/iommu.h
 @@ -47,4 +47,99 @@ struct iommu_pasid_table_config {
};
  };
  
 +/**
 + * enum iommu_inv_granularity - Generic invalidation granularity
 + * @IOMMU_INV_GRANU_DOMAIN_ALL_PASID: TLB entries or PASID caches of 
 all
 + *PASIDs associated with a domain 
 ID
 + * @IOMMU_INV_GRANU_PASID_SEL:TLB entries or PASID cache 
 associated
 + *with a PASID and a domain
 + * @IOMMU_INV_GRANU_PAGE_PASID:   TLB entries of selected page 
 range
 + *within a PASID
 + *
 + * When an invalidation request is passed down to IOMMU to flush 
 translation
 + * caches, it may carry different granularity levels, which can be 
 specific
 + * to certain types of translation caches.
 + * This enum is a collection of granularities for all types of translation
 + * caches. The idea is to make it easy for IOMMU model specific driver to
 + * convert from generic to model specific value. Each IOMMU driver
 + * can enforce check based on its own conversion table. The conversion is
 + * based on 2D look-up with inputs as follows:
 + * - translation cache types
 + * - granularity
 + *
 + * type |   DTLB|TLB|   PASID   |
 + *  granule |   |   |   cache   |
 + * -+---+---+---+
 + *  DN_ALL_PASID|   Y   |   Y   |   Y   |
 + *  PASID_SEL   |   Y   |   Y   |   Y   |
 + *  PAGE_PASID  |   Y   |   Y   |   N/A |
 + *
 + */
 +enum iommu_inv_granularity {
 +  IOMMU_INV_GRANU_DOMAIN_ALL_PASID,
 +  IOMMU_INV_GRANU_PASID_SEL,
 +  IOMMU_INV_GRANU_PAGE_PASID,
 +  IOMMU_INV_NR_GRANU,
 +};
 +
 +/**
 + * enum iommu_inv_type - Generic translation cache types for invalidation
 + *
 + * @IOMMU_INV_TYPE_DTLB:  device IOTLB
 + * @IOMMU_INV_TYPE_TLB:   IOMMU paging structure cache
 + * @IOMMU_INV_TYPE_PASID: PASID cache
 + * Invalidation requests sent to IOMMU for a given device need to indicate
 + * which type of translation cache to be operated on. Combined with enum
 + * iommu_inv_granularity, model specific driver can do a simple lookup to
 + * convert from generic to model specific value.
 + */
 +enum iommu_inv_type {
 +  IOMMU_INV_TYPE_DTLB,
 +  IOMMU_INV_TYPE_TLB,
 +  IOMMU_INV_TYPE_PASID,
 +  IOMMU_INV_NR_TYPE
 +};
 +
 +/**
 + * Translation cache invalidation header that contains mandatory meta 
 data.
 + * @version:  info format version, expecting future extensions
 + * @type: type of translation cache to be invalidated
 + */
 +struct iommu_cache_invalidate_hdr {
 +  __u32 version;
 +#define TLB_INV_HDR_VERSION_1 1
 +  enum iommu_inv_type type;
 +};
 +
 +/**
 + * Translation cache invalidation information, contains generic IOMMU
 + * data which can be parsed based on model ID by model specific drivers.
 + * Since the invalidation of second level page tables are included in the
 + * unmap operation, this info is only applicable to the first level
 + * translation caches, i.e. DMA request with PASID.
 + *
 + * @granularity:  requested invalidation granularity, type dependent
 + * @size: 2^size of 4K pages, 0 for 4k, 9 for 2MB, etc.
>>>
>>> Why is this a 4K page centric interface?
>> This matches the vt-d Address Mask (AM) field of the IOTLB Invalidate
>> Descriptor. We can pass a log2size instead.
>>>
 + * @nr_pages: number of pages to invalidate
 + * @pasid:processor address space ID value per PCI spec.
 + * @arch_id:  architecture dependent id characterizing a 
 context
 + *and tagging the caches, ie. domain Identfier on 
 VTD,
 + *asid on ARM SMMU
 + * @addr: page address to be invalidated
 + * @flags IOMMU_INVALIDATE_ADDR_LEAF: leaf paging entries
 + *IOMMU_INVALIDATE_GLOBAL_PAGE: global pages
>>>
>>> Shouldn't some of these be tied the the granularity of the
>>> invalidation?  It seems like this should be more similar to
>>> iommu_pasid_table_config where the granularity of the invalidation
>>> defines which entry within a union at the end of the structure is valid
>>> and populated.  Otherwise we have fields that don't make sense for
>>> 

Re: [PATCH 5/5] arm/arm64: KVM: Don't panic on failure to properly reset system registers

2019-01-29 Thread Andrew Jones
On Fri, Jan 25, 2019 at 10:46:56AM +0100, Christoffer Dall wrote:
> From: Marc Zyngier 
> 
> Failing to properly reset system registers is pretty bad. But not
> quite as bad as bringing the whole machine down... So warn loudly,
> but slightly more gracefully.
> 
> Signed-off-by: Marc Zyngier 
> Acked-by: Christoffer Dall 
> ---
>  arch/arm/kvm/coproc.c | 4 ++--
>  arch/arm64/kvm/sys_regs.c | 4 ++--
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c
> index 222c1635bc7a..e8bd288fd5be 100644
> --- a/arch/arm/kvm/coproc.c
> +++ b/arch/arm/kvm/coproc.c
> @@ -1450,6 +1450,6 @@ void kvm_reset_coprocs(struct kvm_vcpu *vcpu)
>   reset_coproc_regs(vcpu, table, num);
>  
>   for (num = 1; num < NR_CP15_REGS; num++)
> - if (vcpu_cp15(vcpu, num) == 0x42424242)
> - panic("Didn't reset vcpu_cp15(vcpu, %zi)", num);
> + WARN(vcpu_cp15(vcpu, num) == 0x42424242,
> +  "Didn't reset vcpu_cp15(vcpu, %zi)", num);
>  }
> diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> index 86096774abcd..4f067545c7d2 100644
> --- a/arch/arm64/kvm/sys_regs.c
> +++ b/arch/arm64/kvm/sys_regs.c
> @@ -2609,6 +2609,6 @@ void kvm_reset_sys_regs(struct kvm_vcpu *vcpu)
>   reset_sys_reg_descs(vcpu, table, num);
>  
>   for (num = 1; num < NR_SYS_REGS; num++)
> - if (__vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
> - panic("Didn't reset __vcpu_sys_reg(%zi)", num);
> + WARN(__vcpu_sys_reg(vcpu, num) == 0x4242424242424242,
> +  "Didn't reset __vcpu_sys_reg(%zi)\n", num);
>  }
> -- 
> 2.18.0
>

If we only get halfway through resetting, then we'll get a warn splat,
complete with a backtrace, for each register. Should we do something
like the following instead?

  for (num = 1; num < NR_SYS_REGS; num++)
 if (__vcpu_sys_reg(vcpu, num) == 0x4242424242424242)
failed++;
  WARN(failed, "Didn't reset %d system registers", failed);

Thanks,
drew
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 4/5] KVM: arm/arm64: Implement PSCI ON_PENDING when turning on VCPUs

2019-01-29 Thread Andrew Jones
On Fri, Jan 25, 2019 at 10:46:55AM +0100, Christoffer Dall wrote:
> We are currently not implementing the PSCI spec completely, as we do not
> handle the situation where two VCPUs are attempting to turn on a
> third VCPU at the same time.  The PSCI implementation should make sure
> that only one requesting VCPU wins the race and that the other receives
> PSCI_RET_ON_PENDING.
> 
> Implement this by changing the VCPU power state to a tristate enum and
> ensure only a single VCPU can turn on another VCPU at a given time using
> a cmpxchg operation.
> 
> Signed-off-by: Christoffer Dall 
> Acked-by: Marc Zyngier 
> ---
>  arch/arm/include/asm/kvm_host.h   | 10 --
>  arch/arm64/include/asm/kvm_host.h | 10 --
>  virt/kvm/arm/arm.c| 24 +++-
>  virt/kvm/arm/psci.c   | 21 ++---
>  4 files changed, 45 insertions(+), 20 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index b1cfae222441..4dc47fea1ac8 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -157,6 +157,12 @@ struct vcpu_reset_state {
>   boolreset;
>  };
>  
> +enum vcpu_power_state {
> + KVM_ARM_VCPU_OFF,
> + KVM_ARM_VCPU_ON_PENDING,
> + KVM_ARM_VCPU_ON,
> +};
> +
>  struct kvm_vcpu_arch {
>   struct kvm_cpu_context ctxt;
>  
> @@ -184,8 +190,8 @@ struct kvm_vcpu_arch {
>* here.
>*/
>  
> - /* vcpu power-off state */
> - bool power_off;
> + /* vcpu power state */
> + enum vcpu_power_state power_state;
>  
>/* Don't run the guest (internal implementation need) */
>   bool pause;
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index d43b13421987..0647a409657b 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -218,6 +218,12 @@ struct vcpu_reset_state {
>   boolreset;
>  };
>  
> +enum vcpu_power_state {
> + KVM_ARM_VCPU_OFF,
> + KVM_ARM_VCPU_ON_PENDING,
> + KVM_ARM_VCPU_ON,
> +};
> +
>  struct kvm_vcpu_arch {
>   struct kvm_cpu_context ctxt;
>  
> @@ -285,8 +291,8 @@ struct kvm_vcpu_arch {
>   u32 mdscr_el1;
>   } guest_debug_preserved;
>  
> - /* vcpu power-off state */
> - bool power_off;
> + /* vcpu power state */
> + enum vcpu_power_state power_state;
>  
>   /* Don't run the guest (internal implementation need) */
>   bool pause;
> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
> index 785076176814..1e3195155860 100644
> --- a/virt/kvm/arm/arm.c
> +++ b/virt/kvm/arm/arm.c
> @@ -411,7 +411,7 @@ static void vcpu_power_off(struct kvm_vcpu *vcpu)
>  int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
>   struct kvm_mp_state *mp_state)
>  {
> - if (vcpu->arch.power_off)
> + if (vcpu->arch.power_state != KVM_ARM_VCPU_ON)
>   mp_state->mp_state = KVM_MP_STATE_STOPPED;
>   else
>   mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
> @@ -426,7 +426,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
>  
>   switch (mp_state->mp_state) {
>   case KVM_MP_STATE_RUNNABLE:
> - vcpu->arch.power_off = false;
> + vcpu->arch.power_state = KVM_ARM_VCPU_ON;
>   break;
>   case KVM_MP_STATE_STOPPED:
>   vcpu_power_off(vcpu);
> @@ -448,8 +448,9 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
>  int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
>  {
>   bool irq_lines = *vcpu_hcr(v) & (HCR_VI | HCR_VF);
> - return ((irq_lines || kvm_vgic_vcpu_pending_irq(v))
> - && !v->arch.power_off && !v->arch.pause);
> + return (irq_lines || kvm_vgic_vcpu_pending_irq(v)) &&
> + v->arch.power_state == KVM_ARM_VCPU_ON &&
> + !v->arch.pause;
>  }
>  
>  bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
> @@ -614,14 +615,19 @@ void kvm_arm_resume_guest(struct kvm *kvm)
>   }
>  }
>  
> +static bool vcpu_sleeping(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->arch.power_state != KVM_ARM_VCPU_ON ||
> + vcpu->arch.pause;
> +}
> +
>  static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
>  {
>   struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
>  
> - swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
> -(!vcpu->arch.pause)));
> + swait_event_interruptible_exclusive(*wq, !vcpu_sleeping(vcpu));
>  
> - if (vcpu->arch.power_off || vcpu->arch.pause) {
> + if (vcpu_sleeping(vcpu)) {
>   /* Awaken to handle a signal, request we sleep again later. */
>   kvm_make_request(KVM_REQ_SLEEP, vcpu);
>   }
> @@ -646,7 +652,7 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
>   vcpu_req_sleep(vcpu);
>  
>   if 

Re: [PATCH 3/5] KVM: arm/arm64: Require VCPU threads to turn them self off

2019-01-29 Thread Andrew Jones


Summary edit

> KVM: arm/arm64: Require VCPU threads to turn them self off

themselves

On Fri, Jan 25, 2019 at 10:46:54AM +0100, Christoffer Dall wrote:
> To avoid a race between turning VCPUs off and turning them on, make sure
> that only the VCPU thread itself turns off the VCPU.  When other threads
> want to turn off a VCPU, they now do this via a request.
> 
> Signed-off-by: Christoffer Dall 
> Acked-by: Marc Zyngier 
> ---
>  arch/arm/include/asm/kvm_host.h   |  2 ++
>  arch/arm64/include/asm/kvm_host.h |  2 ++
>  virt/kvm/arm/arm.c|  8 ++--
>  virt/kvm/arm/psci.c   | 11 ++-
>  4 files changed, 12 insertions(+), 11 deletions(-)
> 
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 50e89869178a..b1cfae222441 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -49,6 +49,8 @@
>   KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_IRQ_PENDING  KVM_ARCH_REQ(1)
>  #define KVM_REQ_VCPU_RESET   KVM_ARCH_REQ(2)
> +#define KVM_REQ_VCPU_OFF \
> + KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  
>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
>  
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index da3fc7324d68..d43b13421987 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -49,6 +49,8 @@
>   KVM_ARCH_REQ_FLAGS(0, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  #define KVM_REQ_IRQ_PENDING  KVM_ARCH_REQ(1)
>  #define KVM_REQ_VCPU_RESET   KVM_ARCH_REQ(2)
> +#define KVM_REQ_VCPU_OFF \
> + KVM_ARCH_REQ_FLAGS(3, KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)
>  
>  DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use);
>  
> diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
> index 9c486fad3f9f..785076176814 100644
> --- a/virt/kvm/arm/arm.c
> +++ b/virt/kvm/arm/arm.c
> @@ -404,8 +404,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
>  
>  static void vcpu_power_off(struct kvm_vcpu *vcpu)
>  {
> - vcpu->arch.power_off = true;
> - kvm_make_request(KVM_REQ_SLEEP, vcpu);
> + kvm_make_request(KVM_REQ_VCPU_OFF, vcpu);
>   kvm_vcpu_kick(vcpu);
>  }

I think we should leave this function alone. Otherwise if userspace sets
the MP state to STOPPED and then queries the state before the vcpu
has a chance to manage its vcpu requests, the state will still indicate
RUNNABLE. The same goes for a query right after doing a vcpu init.

>  
> @@ -646,6 +645,11 @@ static void check_vcpu_requests(struct kvm_vcpu *vcpu)
>   if (kvm_check_request(KVM_REQ_SLEEP, vcpu))
>   vcpu_req_sleep(vcpu);
>  
> + if (kvm_check_request(KVM_REQ_VCPU_OFF, vcpu)) {
> + vcpu->arch.power_off = true;
> + vcpu_req_sleep(vcpu);
> + }
> +
>   if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
>   kvm_reset_vcpu(vcpu);
>  
> diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
> index b9cff1d4b06d..20255319e193 100644
> --- a/virt/kvm/arm/psci.c
> +++ b/virt/kvm/arm/psci.c
> @@ -97,9 +97,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu 
> *vcpu)
>  
>  static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
>  {
> - vcpu->arch.power_off = true;
> - kvm_make_request(KVM_REQ_SLEEP, vcpu);
> - kvm_vcpu_kick(vcpu);
> + kvm_make_request(KVM_REQ_VCPU_OFF, vcpu);
>  }

This was currently fine since it implements CPU_OFF which only applies to
the calling vcpu, but there's also no reason not to change it to be
consistent with the below change

>  
>  static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
> @@ -198,9 +196,6 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct 
> kvm_vcpu *vcpu)
>  
>  static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
>  {
> - int i;
> - struct kvm_vcpu *tmp;
> -
>   /*
>* The KVM ABI specifies that a system event exit may call KVM_RUN
>* again and may perform shutdown/reboot at a later time that when the
> @@ -210,9 +205,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu 
> *vcpu, u32 type)
>* after this call is handled and before the VCPUs have been
>* re-initialized.
>*/
> - kvm_for_each_vcpu(i, tmp, vcpu->kvm)
> - tmp->arch.power_off = true;
> - kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_SLEEP);
> + kvm_make_all_cpus_request(vcpu->kvm, KVM_REQ_VCPU_OFF);
>  
> memset(&vcpu->run->system_event, 0, sizeof(vcpu->run->system_event));
>   vcpu->run->system_event.type = type;
> -- 
> 2.18.0
>

Thanks,
drew
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/5] KVM: arm/arm64: Reset the VCPU without preemption and vcpu state loaded

2019-01-29 Thread Marc Zyngier
Hi Drew,

On Tue, 29 Jan 2019 15:48:58 +,
Andrew Jones  wrote:
> 
> Hi Christoffer,
> 
> On Fri, Jan 25, 2019 at 10:46:52AM +0100, Christoffer Dall wrote:
> > Resetting the VCPU state modifies the system register state in memory,
> > but this may interact with vcpu_load/vcpu_put if running with preemption
> > disabled, which in turn may lead to corrupted system register state.
>   ^ enabled
> > 
> > Address this by disabling preemption and doing put/load if required
> > around the reset logic.
> 
> I'm having trouble understanding how disabling preemption helps here.
> There shouldn't be an issue with the KVM_ARM_VCPU_INIT case, since the
> target vcpu is guaranteed not to be loaded and therefore it doesn't
> have preempt notifiers registered either. Also, KVM_ARM_VCPU_INIT holds
> the vcpu mutex, so there's no chance for a load to occur until it's
> complete.
> 
> For the PSCI case it makes sense to force a vcpu load after the reset,
> otherwise the sleeping target vcpu won't have the correct state loaded.
> The initial put also makes sense in case we're not resetting everything.
> I don't understand how we're ensuring the target vcpu thread's preemption
> is disabled though. This modified kvm_reset_vcpu would need to be run
> from the target vcpu thread to work, but that's not how the PSCI code
> currently does it.

And that's exactly where we're going with the following patch in the
series. Ultimately, we need a vcpu to reset itself, as we otherwise
have a window where a vcpu can be spuriously loaded whilst being
reset.

Thanks,

M.

-- 
Jazz is not dead, it just smell funny.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 1/3] KVM: x86: Move mmu_memory_cache functions to common code

2019-01-29 Thread Christoffer Dall
On Mon, Jan 28, 2019 at 08:19:56AM -0800, Sean Christopherson wrote:
> On Mon, Jan 28, 2019 at 03:48:41PM +0100, Christoffer Dall wrote:
> > We are currently duplicating the mmu memory cache functionality quite
> > heavily between the architectures that support KVM.  As a first step,
> > move the x86 implementation (which seems to have the most recently
> > maintained version of the mmu memory cache) to common code.
> > 
> > We rename the functions and data types to have a kvm_ prefix for
> > anything exported as a symbol to the rest of the kernel and take the
> > chance to rename memory_cache to memcache to avoid overly long lines.
> 
> It'd be helpful to do the rename it a separate patch so that the move
> really is a straight move of code.
> 
> > This is a bit tedious on the callsites but ends up looking more
> > palatable.
> > 
> > We also introduce an arch-specific kvm_types.h which can be used to
> > define the architecture-specific GFP flags for allocating memory to the
> > memory cache, and to specify how many objects are required in the memory
> > cache.  These are the two points where the current implementations
> > diverge across architectures.  Since kvm_host.h defines structures with
> > fields of the memcache object, we define the memcache structure in
> > kvm_types.h, and we include the architecture-specific kvm_types.h to
> > know the size of object in kvm_host.h.
> > 
> > This patch currently only defines the structure and requires valid
> > defines in the architecture-specific kvm_types.h when KVM_NR_MEM_OBJS is
> > defined.  As we move each architecture to the common implementation,
> > this condition will eventually go away.
> > 
> > Signed-off-by: Christoffer Dall 
> > ---
> >  arch/arm/include/asm/kvm_types.h |  5 ++
> >  arch/arm64/include/asm/kvm_types.h   |  6 ++
> >  arch/mips/include/asm/kvm_types.h|  5 ++
> >  arch/powerpc/include/asm/kvm_types.h |  5 ++
> >  arch/s390/include/asm/kvm_types.h|  5 ++
> >  arch/x86/include/asm/kvm_host.h  | 17 +
> >  arch/x86/include/asm/kvm_types.h | 10 +++
> >  arch/x86/kvm/mmu.c   | 97 ++--
> >  arch/x86/kvm/paging_tmpl.h   |  4 +-
> >  include/linux/kvm_host.h |  9 +++
> >  include/linux/kvm_types.h| 12 
> >  virt/kvm/kvm_main.c  | 58 +
> >  12 files changed, 139 insertions(+), 94 deletions(-)
> >  create mode 100644 arch/arm/include/asm/kvm_types.h
> >  create mode 100644 arch/arm64/include/asm/kvm_types.h
> >  create mode 100644 arch/mips/include/asm/kvm_types.h
> >  create mode 100644 arch/powerpc/include/asm/kvm_types.h
> >  create mode 100644 arch/s390/include/asm/kvm_types.h
> >  create mode 100644 arch/x86/include/asm/kvm_types.h
> > 
> > diff --git a/arch/arm/include/asm/kvm_types.h 
> > b/arch/arm/include/asm/kvm_types.h
> > new file mode 100644
> > index ..bc389f82e88d
> > --- /dev/null
> > +++ b/arch/arm/include/asm/kvm_types.h
> > @@ -0,0 +1,5 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_ARM_KVM_TYPES_H
> > +#define _ASM_ARM_KVM_TYPES_H
> > +
> > +#endif /* _ASM_ARM_KVM_TYPES_H */
> > diff --git a/arch/arm64/include/asm/kvm_types.h 
> > b/arch/arm64/include/asm/kvm_types.h
> > new file mode 100644
> > index ..d0987007d581
> > --- /dev/null
> > +++ b/arch/arm64/include/asm/kvm_types.h
> > @@ -0,0 +1,6 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_ARM64_KVM_TYPES_H
> > +#define _ASM_ARM64_KVM_TYPES_H
> > +
> > +#endif /* _ASM_ARM64_KVM_TYPES_H */
> > +
> > diff --git a/arch/mips/include/asm/kvm_types.h 
> > b/arch/mips/include/asm/kvm_types.h
> > new file mode 100644
> > index ..5efeb32a5926
> > --- /dev/null
> > +++ b/arch/mips/include/asm/kvm_types.h
> > @@ -0,0 +1,5 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_MIPS_KVM_TYPES_H
> > +#define _ASM_MIPS_KVM_TYPES_H
> > +
> > +#endif /* _ASM_MIPS_KVM_TYPES_H */
> > diff --git a/arch/powerpc/include/asm/kvm_types.h 
> > b/arch/powerpc/include/asm/kvm_types.h
> > new file mode 100644
> > index ..f627eceaa314
> > --- /dev/null
> > +++ b/arch/powerpc/include/asm/kvm_types.h
> > @@ -0,0 +1,5 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_POWERPC_KVM_TYPES_H
> > +#define _ASM_POWERPC_KVM_TYPES_H
> > +
> > +#endif /* _ASM_POWERPC_KVM_TYPES_H */
> > diff --git a/arch/s390/include/asm/kvm_types.h 
> > b/arch/s390/include/asm/kvm_types.h
> > new file mode 100644
> > index ..b66a81f8a354
> > --- /dev/null
> > +++ b/arch/s390/include/asm/kvm_types.h
> > @@ -0,0 +1,5 @@
> > +/* SPDX-License-Identifier: GPL-2.0 */
> > +#ifndef _ASM_S390_KVM_TYPES_H
> > +#define _ASM_S390_KVM_TYPES_H
> > +
> > +#endif /* _ASM_S390_KVM_TYPES_H */
> > diff --git a/arch/x86/include/asm/kvm_host.h 
> > b/arch/x86/include/asm/kvm_host.h
> > index 4660ce90de7f..5c12cba8c2b1 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ 

Re: [kvm-unit-tests PATCH 2/7] lib: arm: Remove warning about uart0_base mismatch

2019-01-29 Thread Alexandru Elisei
On 1/29/19 12:23 PM, Andrew Jones wrote:
> On Tue, Jan 29, 2019 at 11:16:05AM +, Alexandru Elisei wrote:
>> On 1/28/19 4:31 PM, Andrew Jones wrote:
>>> On Mon, Jan 28, 2019 at 02:24:29PM +, Alexandru Elisei wrote:
 On 1/25/19 4:47 PM, Andrew Jones wrote:
> On Fri, Jan 25, 2019 at 04:36:13PM +, Alexandru Elisei wrote:
>> On 1/24/19 12:37 PM, Andrew Jones wrote:
>>> On Thu, Jan 24, 2019 at 11:59:43AM +, Andre Przywara wrote:
 On Thu, 24 Jan 2019 11:16:29 +
 Alexandru Elisei  wrote:

> A warning is displayed if uart0_base is different from what the code
> expects qemu to generate for the pl011 UART in the device tree.
> However, now we support the ns16550a UART emulated by kvmtool, which
> has a different address. This leads to the  warning being displayed
> even though the UART is configured and working as expected.
>
> Now that we support multiple UARTs, the warning serves no purpose, so
> remove it.
 Mmh, but we use that address before, right? So for anything not
 emulating an UART at this QEMU specific address we write to some random
 (device) memory?

 Drew, how important is this early print feature for kvm-unit-tests?
>>> The setup code passes through quite a few asserts before getting through
>>> io_init() (including in uart0_init), so I think there's still value in
>>> having a guessed UART address. Maybe we can provide guesses for both
>>> QEMU and kvmtool, and some selection method, that would be used until
>>> we've properly assigned uart0_base from DT?
>>>
 Cheers,
 Andre.

> Signed-off-by: Alexandru Elisei 
> ---
>  lib/arm/io.c | 6 --
>  1 file changed, 6 deletions(-)
>
> diff --git a/lib/arm/io.c b/lib/arm/io.c
> index 35fc05aeb4db..87435150f73e 100644
> --- a/lib/arm/io.c
> +++ b/lib/arm/io.c
> @@ -61,12 +61,6 @@ static void uart0_init(void)
>   }
>  
>   uart0_base = ioremap(base.addr, base.size);
> -
> - if (uart0_base != (u8 *)UART_EARLY_BASE) {
> - printf("WARNING: early print support may not work. "
> -"Found uart at %p, but early base is %p.\n",
> - uart0_base, (u8 *)UART_EARLY_BASE);
> - }
>  }
>>> This warning is doing what it should, which is pointing out that the
>>> UART_EARLY_BASE guess appears to be wrong. If we can provide a way
>>> to support more than one guess, then we should keep this warning but
>>> adjust it to match one of any of the guesses.
>>>
>>> Thanks,
>>> drew
>> I'm not really sure how to implement a selection method. I've looked at
>> splitting io_init() into uart0_init() and chr_testdev_init() and calling
>> uart0_init() very early in the setup process, but uart0_init() itself 
>> uses
>> printf() and assert().
>>
>> I've also thought about adding another function, something like
>> uart0_early_init(), that is called very early in setup() and gets the 
>> base
>> address from the dtb bootargs. But that means calling dt_init() and
>> dt_get_bootargs(), which can fail.
>>
>> One other option that could work is to make it a compile-time 
>> configuration.
>>
>> What do you think?
>>
> Compile-time is fine, which I guess will result in a new configure script
> option as well. I wonder if we shouldn't consider generating a config.h
> file with stuff like this rather than adding another -D to the compile
> line.
>
> drew 
 I propose a new configuration option called --vmm, with possible values 
 qemu and
 kvmtool, which defaults to qemu if not set.

 Another possibility would be to have an --uart-base option, but that means 
 we
 are expecting the user to be aware of the uart base address for the virtual
 machine manager, which might be unreasonable.

 This is a quick prototype of how using -D for conditional compilation 
 would look
 like (the configure changes are included too):

 diff --git a/configure b/configure
 index df8581e3a906..7a56ba47707f 100755
 --- a/configure
 +++ b/configure
 @@ -70,6 +70,9 @@ while [[ "$1" = -* ]]; do
     ;;
     --ld)
     ld="$arg"
 +    ;;
 +    --vmm)
 +    vmm="$arg"
     ;;
     --enable-pretty-print-stacks)
     pretty_print_stacks=yes
 @@ -108,6 +111,14 @@ if [ "$arch" = "i386" ] || [ "$arch" = "x86_64" ]; 
 then
  testdir=x86
  elif [ "$arch" = "arm" ] || [ "$arch" = "arm64" ]; then
  testdir=arm
 +    if [ -z "$vmm" ] || [ "$vmm" = "qemu" ]; then
 +    

Re: Kernel boot regression with PAuth and aarch64-softmmu -cpu max and el2 enabled

2019-01-29 Thread Mark Rutland
On Tue, Jan 29, 2019 at 11:54:13AM +, Peter Maydell wrote:
> On Tue, 29 Jan 2019 at 11:46, Mark Rutland  wrote:
> > On Tue, Jan 29, 2019 at 11:08:19AM +, Alex Bennée wrote:
> > > The -cpu max enabled a cortex-a57 + whatever extra features we've
> > > enabled in QEMU so far. It won't match any "real" CPU but it should be
> > > architecturally correct in so far we implement prerequisite features for
> > > any given feature. The cpuid feature bits should also be correct as we
> > > test them internally in QEMU to enable features.
> >
> > Just to check, does this enable VHE?
> 
> It does not. (We have no implementation of VHE yet -- as and when
> we do implement that "-cpu max" will turn it on, but today it doesn't.)

Ok. In that case, the kernel should drop to EL1N and stay there after it
configures EL2.

There are no pointer authentication traps taken to EL1N, so if something
is going wrong, I suspect either:

* a trap is taken to EL2, and we hyp_panic()
* an UNDEF is being taken to EL1N

Thanks,
Mark.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [kvm-unit-tests PATCH 2/7] lib: arm: Remove warning about uart0_base mismatch

2019-01-29 Thread Andrew Jones
On Tue, Jan 29, 2019 at 11:16:05AM +, Alexandru Elisei wrote:
> On 1/28/19 4:31 PM, Andrew Jones wrote:
> > On Mon, Jan 28, 2019 at 02:24:29PM +, Alexandru Elisei wrote:
> >> On 1/25/19 4:47 PM, Andrew Jones wrote:
> >>> On Fri, Jan 25, 2019 at 04:36:13PM +, Alexandru Elisei wrote:
>  On 1/24/19 12:37 PM, Andrew Jones wrote:
> > On Thu, Jan 24, 2019 at 11:59:43AM +, Andre Przywara wrote:
> >> On Thu, 24 Jan 2019 11:16:29 +
> >> Alexandru Elisei  wrote:
> >>
> >>> A warning is displayed if uart0_base is different from what the code
> >>> expects qemu to generate for the pl011 UART in the device tree.
> >>> However, now we support the ns16550a UART emulated by kvmtool, which
> >>> has a different address. This leads to the  warning being displayed
> >>> even though the UART is configured and working as expected.
> >>>
> >>> Now that we support multiple UARTs, the warning serves no purpose, so
> >>> remove it.
> >> Mmh, but we use that address before, right? So for anything not
> >> emulating an UART at this QEMU specific address we write to some random
> >> (device) memory?
> >>
> >> Drew, how important is this early print feature for kvm-unit-tests?
> > The setup code passes through quite a few asserts before getting through
> > io_init() (including in uart0_init), so I think there's still value in
> > having a guessed UART address. Maybe we can provide guesses for both
> > QEMU and kvmtool, and some selection method, that would be used until
> > we've properly assigned uart0_base from DT?
> >
> >> Cheers,
> >> Andre.
> >>
> >>> Signed-off-by: Alexandru Elisei 
> >>> ---
> >>>  lib/arm/io.c | 6 --
> >>>  1 file changed, 6 deletions(-)
> >>>
> >>> diff --git a/lib/arm/io.c b/lib/arm/io.c
> >>> index 35fc05aeb4db..87435150f73e 100644
> >>> --- a/lib/arm/io.c
> >>> +++ b/lib/arm/io.c
> >>> @@ -61,12 +61,6 @@ static void uart0_init(void)
> >>>   }
> >>>  
> >>>   uart0_base = ioremap(base.addr, base.size);
> >>> -
> >>> - if (uart0_base != (u8 *)UART_EARLY_BASE) {
> >>> - printf("WARNING: early print support may not work. "
> >>> -"Found uart at %p, but early base is %p.\n",
> >>> - uart0_base, (u8 *)UART_EARLY_BASE);
> >>> - }
> >>>  }
> > This warning is doing what it should, which is pointing out that the
> > UART_EARLY_BASE guess appears to be wrong. If we can provide a way
> > to support more than one guess, then we should keep this warning but
> > adjust it to match one of any of the guesses.
> >
> > Thanks,
> > drew
>  I'm not really sure how to implement a selection method. I've looked at
>  splitting io_init() into uart0_init() and chr_testdev_init() and calling
>  uart0_init() very early in the setup process, but uart0_init() itself 
>  uses
>  printf() and assert().
> 
>  I've also thought about adding another function, something like
>  uart0_early_init(), that is called very early in setup() and gets the 
>  base
>  address from the dtb bootargs. But that means calling dt_init() and
>  dt_get_bootargs(), which can fail.
> 
>  One other option that could work is to make it a compile-time 
>  configuration.
> 
>  What do you think?
> 
> >>> Compile-time is fine, which I guess will result in a new configure script
> >>> option as well. I wonder if we shouldn't consider generating a config.h
> >>> file with stuff like this rather than adding another -D to the compile
> >>> line.
> >>>
> >>> drew 
> >> I propose a new configuration option called --vmm, with possible values 
> >> qemu and
> >> kvmtool, which defaults to qemu if not set.
> >>
> >> Another possibility would be to have an --uart-base option, but that means 
> >> we
> >> are expecting the user to be aware of the uart base address for the virtual
> >> machine manager, which might be unreasonable.
> >>
> >> This is a quick prototype of how using -D for conditional compilation 
> >> would look
> >> like (the configure changes are included too):
> >>
> >> diff --git a/configure b/configure
> >> index df8581e3a906..7a56ba47707f 100755
> >> --- a/configure
> >> +++ b/configure
> >> @@ -70,6 +70,9 @@ while [[ "$1" = -* ]]; do
> >>     ;;
> >>     --ld)
> >>     ld="$arg"
> >> +    ;;
> >> +    --vmm)
> >> +    vmm="$arg"
> >>     ;;
> >>     --enable-pretty-print-stacks)
> >>     pretty_print_stacks=yes
> >> @@ -108,6 +111,14 @@ if [ "$arch" = "i386" ] || [ "$arch" = "x86_64" ]; 
> >> then
> >>  testdir=x86
> >>  elif [ "$arch" = "arm" ] || [ "$arch" = "arm64" ]; then
> >>  testdir=arm
> >> +    if [ -z "$vmm" ] || [ "$vmm" = "qemu" ]; then
> >> +    uart_early_base=0x0900UL
> > You can drop the 'UL'.
> >
> >> +    

Re: Kernel boot regression with PAuth and aarch64-softmmu -cpu max and el2 enabled

2019-01-29 Thread Peter Maydell
On Tue, 29 Jan 2019 at 11:46, Mark Rutland  wrote:
> On Tue, Jan 29, 2019 at 11:08:19AM +, Alex Bennée wrote:
> > The -cpu max enabled a cortex-a57 + whatever extra features we've
> > enabled in QEMU so far. It won't match any "real" CPU but it should be
> > architecturally correct in so far we implement prerequisite features for
> > any given feature. The cpuid feature bits should also be correct as we
> > test them internally in QEMU to enable features.
>
> Just to check, does this enable VHE?

It does not. (We have no implementation of VHE yet -- as and when
we do implement that "-cpu max" will turn it on, but today it doesn't.)

thanks
-- PMM
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH v7 10/25] ACPI / APEI: Tell firmware the estatus queue consumed the records

2019-01-29 Thread Borislav Petkov
On Wed, Jan 23, 2019 at 06:36:38PM +, James Morse wrote:
> Do you consider ENOENT an error? We don't ack in that case as the
> memory wasn't in use.

So let's see:

if (!*buf_paddr)
return -ENOENT;

can happen when apei_read() has returned 0 but it has managed to do

*val = 0;

Now, that function returns error values which we should be checking
but we're checking the buf_paddr pointed to value for being 0. Are
we fearing that even if acpi_os_read_memory() or acpi_os_read_port()
succeed, *buf_paddr could still be 0 ?

Because if not, we should be checking whether rc == -EINVAL and then
convert it to -ENOENT.

But ghes_read_estatus() handles the error case first *and* *then* checks
buf_paddr too, to make really really sure we won't be reading from
address 0.

> For the other cases its because the records are bogus, but we still
> unconditionally tell firmware we're done with them.

... to free the memory, yes, ok.

> >> I think it is. 18.3.2.8 of ACPI v6.2 (search for Generic Hardware Error 
> >> Source
> >> version 2", then below the table):
> >> * OSPM detects error (via interrupt/exception or polling the block status)
> >> * OSPM copies the error status block
> >> * OSPM clears the block status field of the error status block
> >> * OSPM acknowledges the error via Read Ack register
> >>
> >> The ENOENT case is excluded by 'polling the block status'.
> > 
> > Ok, so we signal the absence of an error record with ENOENT.
> > 
> > if (!buf_paddr)
> > return -ENOENT;
> > 
> > Can that even happen?
> 
> Yes, for NOTIFY_POLLED its the norm. For the IRQ flavours that walk a list of
> GHES, all but one of them will return ENOENT.

Lemme get this straight: when we do

apei_read(_paddr, >error_status_address);

in the polled case, buf_paddr can be 0?

> We could try it and see. It depends if firmware shares ack locations between
> multiple GHES. We could ack an empty GHES, and it removes the records of one 
> we
> haven't looked at yet.

Yeah, OTOH, we shouldn't be pushing our luck here, I guess.

So let's sum up: we'll ack the GHES error in all but the -ENOENT cases
in order to free the memory occupied by the error record.

The slightly "pathological" -ENOENT case is I guess how the fw behaves
when it is being polled and also for broken firmware which could report
a 0 buf_paddr.

Btw, that last thing I'm assuming because

  d334a49113a4 ("ACPI, APEI, Generic Hardware Error Source memory error 
support")

doesn't say what that check was needed for.

Thx.

-- 
Regards/Gruss,
Boris.

Good mailing practices for 400: avoid top-posting and trim the reply.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: Kernel boot regression with PAuth and aarch64-softmmu -cpu max and el2 enabled

2019-01-29 Thread Mark Rutland
Hi,

[adding Kristina, who is in charge of Linux pointer authentication]

On Tue, Jan 29, 2019 at 11:08:19AM +, Alex Bennée wrote:
> Hi,
> 
> Following up on yesterday's discussion on IRC I thought I'd better
> report on my findings in the permanent record so things don't get lost.
> 
> As I tend to periodically rebuild my test kernels from the current
> state of linux.git I occasionally run into these things. My test
> invocation is:
> 
>   qemu-system-aarch64 -machine type=virt,virtualization=on \
>   -display none -m 4096 -serial mon:stdio \
>   -kernel 
> ../../kernel-v8-plain.build/arch/arm64/boot/Image \
>   -append 'console=ttyAMA0 panic=-1' -no-reboot -cpu max
>
> The kernel is essentially a defconfig kernel with a bunch of the VIRTIO
> device drivers built-in for when I actually boot a more complex setup
> with disks and drives. However this is a boot test so doesn't really
> matter.
> 
> The -machine type=virt,virtualization=on enables our virt machine model
> with EL2 turned on. As there is no BIOS involved the kernel is invoked
> directly at EL2.
> 
> The -cpu max enabled a cortex-a57 + whatever extra features we've
> enabled in QEMU so far. It won't match any "real" CPU but it should be
> architecturally correct in so far we implement prerequisite features for
> any given feature. The cpuid feature bits should also be correct as we
> test them internally in QEMU to enable features.

Just to check, does this enable VHE?

> The breakage is the kernel never boots (no output on serial port) and on
> attaching with gdb I found it stuck in:
> 
>   (gdb) bt
>   #0  0xff8010a9e480 in overflow_stack ()
>   Backtrace stopped: not enough registers or memory available to unwind 
> further
> 
> If I turn on exception tracing it looks like we go into an exception
> loop.

As mentioned on IRC, this looks very odd, since overflow_stack is a data
pointer, not code. I can't presently see how we could branch here.

If you pass the kernel 'earlycon keep_bootcon', do you get any output?

> On the QEMU side this breakage comes in at:
> 
>   commit 1ce32e47db52e3511132c7104770eae65d412144 (HEAD, refs/bisect/bad)
>   Author: Richard Henderson 
>   Date:   Mon Jan 21 10:23:13 2019 +
> 
>   target/arm: Enable PAuth for -cpu max
> 
>   Reviewed-by: Peter Maydell 
>   Signed-off-by: Richard Henderson 
>   Message-id: 20190108223129.5570-30-richard.hender...@linaro.org
>   Signed-off-by: Peter Maydell 
> 
> and as you would expect the system boots fine with -cpu cortex-a57
> 
> On the kernel side it breaks at:
> 
>   commit 04ca3204fa09f5f55c8f113b0072004a7b364ff4
>   Author: Mark Rutland 
>   Date:   Fri Dec 7 18:39:30 2018 +
> 
>   arm64: enable pointer authentication
> 
>   Now that all the necessary bits are in place for userspace, add the
>   necessary Kconfig logic to allow this to be enabled.
> 
>   Signed-off-by: Mark Rutland 
>   Signed-off-by: Kristina Martsenko 
>   Cc: Catalin Marinas 
>   Cc: Will Deacon 
>   Signed-off-by: Will Deacon 
> 
> So predictably we failed at enabling PAuth somewhere between the kernel
> and QEMU.
> 
> I'm guessing the kernel so far has been tested on the fast model with a
> full chain of TF, UEFI and kernel?

The kernel has been tested on a fast model with the Linux bootwrapper:

https://git.kernel.org/pub/scm/linux/kernel/git/mark/linux.git/

Kristina, could you confirm whether or not it's been tested with
ATF+UEFI?

> I think Richard's tests were without EL2 enabled.
> 
> So in the case that the kernel boots in EL2 is it expecting anyone else
> to deal with Pauth exceptions or should it be able to cope with an
> enabled Pauth but no firmware underneath it?

So long as the highest implemented exception level is EL2, the kernel
should handle that itself. During boot we'll configure HCR_EL2.{API,APK}
in el2_setup().

From that point onwards, there should be no traps for pointer
authentication functionality from EL1, AFAICT.

> Either we've got something wrong or we'll need to rethink what features
> the user can have enabled by -cpu max on a direct kernel boot.

It's not immediately clear to me when precisely things are going wrong,
so I think we need to narrow that down first. For example, it's not
clear whether a trap is being taken, or something is unexpectedly
behaving as UNDEF.

Is it possible to watch the exception vectors to see if/when an
exception is taken, and from where?

Thanks,
Mark.
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [kvm-unit-tests PATCH 2/7] lib: arm: Remove warning about uart0_base mismatch

2019-01-29 Thread Alexandru Elisei
On 1/28/19 4:31 PM, Andrew Jones wrote:
> On Mon, Jan 28, 2019 at 02:24:29PM +, Alexandru Elisei wrote:
>> On 1/25/19 4:47 PM, Andrew Jones wrote:
>>> On Fri, Jan 25, 2019 at 04:36:13PM +, Alexandru Elisei wrote:
 On 1/24/19 12:37 PM, Andrew Jones wrote:
> On Thu, Jan 24, 2019 at 11:59:43AM +, Andre Przywara wrote:
>> On Thu, 24 Jan 2019 11:16:29 +
>> Alexandru Elisei  wrote:
>>
>>> A warning is displayed if uart0_base is different from what the code
>>> expects qemu to generate for the pl011 UART in the device tree.
>>> However, now we support the ns16550a UART emulated by kvmtool, which
>>> has a different address. This leads to the warning being displayed
>>> even though the UART is configured and working as expected.
>>>
>>> Now that we support multiple UARTs, the warning serves no purpose, so
>>> remove it.
>> Mmh, but we use that address before, right? So for anything not
>> emulating an UART at this QEMU specific address we write to some random
>> (device) memory?
>>
>> Drew, how important is this early print feature for kvm-unit-tests?
> The setup code passes through quite a few asserts before getting through
> io_init() (including in uart0_init), so I think there's still value in
> having a guessed UART address. Maybe we can provide guesses for both
> QEMU and kvmtool, and some selection method, that would be used until
> we've properly assigned uart0_base from DT?
>
>> Cheers,
>> Andre.
>>
>>> Signed-off-by: Alexandru Elisei 
>>> ---
>>>  lib/arm/io.c | 6 --
>>>  1 file changed, 6 deletions(-)
>>>
>>> diff --git a/lib/arm/io.c b/lib/arm/io.c
>>> index 35fc05aeb4db..87435150f73e 100644
>>> --- a/lib/arm/io.c
>>> +++ b/lib/arm/io.c
>>> @@ -61,12 +61,6 @@ static void uart0_init(void)
>>> }
>>>  
>>> uart0_base = ioremap(base.addr, base.size);
>>> -
>>> -   if (uart0_base != (u8 *)UART_EARLY_BASE) {
>>> -   printf("WARNING: early print support may not work. "
>>> -  "Found uart at %p, but early base is %p.\n",
>>> -   uart0_base, (u8 *)UART_EARLY_BASE);
>>> -   }
>>>  }
> This warning is doing what it should, which is pointing out that the
> UART_EARLY_BASE guess appears to be wrong. If we can provide a way
> to support more than one guess, then we should keep this warning but
> adjust it to match one of any of the guesses.
>
> Thanks,
> drew
 I'm not really sure how to implement a selection method. I've looked at
 splitting io_init() into uart0_init() and chr_testdev_init() and calling
 uart0_init() very early in the setup process, but uart0_init() itself uses
 printf() and assert().

 I've also thought about adding another function, something like
 uart0_early_init(), that is called very early in setup() and gets the base
 address from the dtb bootargs. But that means calling dt_init() and
 dt_get_bootargs(), which can fail.

 One other option that could work is to make it a compile-time 
 configuration.

 What do you think?

>>> Compile-time is fine, which I guess will result in a new configure script
>>> option as well. I wonder if we shouldn't consider generating a config.h
>>> file with stuff like this rather than adding another -D to the compile
>>> line.
>>>
>>> drew 
>> I propose a new configuration option called --vmm, with possible values qemu 
>> and
>> kvmtool, which defaults to qemu if not set.
>>
>> Another possibility would be to have an --uart-base option, but that means we
>> are expecting the user to be aware of the uart base address for the virtual
>> machine manager, which might be unreasonable.
>>
>> This is a quick prototype of how using -D for conditional compilation would 
>> look
>> like (the configure changes are included too):
>>
>> diff --git a/configure b/configure
>> index df8581e3a906..7a56ba47707f 100755
>> --- a/configure
>> +++ b/configure
>> @@ -70,6 +70,9 @@ while [[ "$1" = -* ]]; do
>>     ;;
>>     --ld)
>>     ld="$arg"
>> +    ;;
>> +    --vmm)
>> +    vmm="$arg"
>>     ;;
>>     --enable-pretty-print-stacks)
>>     pretty_print_stacks=yes
>> @@ -108,6 +111,14 @@ if [ "$arch" = "i386" ] || [ "$arch" = "x86_64" ]; then
>>  testdir=x86
>>  elif [ "$arch" = "arm" ] || [ "$arch" = "arm64" ]; then
>>  testdir=arm
>> +    if [ -z "$vmm" ] || [ "$vmm" = "qemu" ]; then
>> +    uart_early_base=0x09000000UL
> You can drop the 'UL'.
>
>> +    elif [ "$vmm" = "kvmtool" ]; then
>> +    uart_early_base=0x3f8
>> +    else
>> +    echo '--vmm must be one of "qemu" or "kvmtool"'
>> +    usage
> You're outputting usage here, but you didn't add vmm to the help text.
>
>> +    fi
>>  elif [ "$arch" = "ppc64" ]; then
>>  testdir=powerpc
>> 

Re: [PATCH 3/4] KVM: arm/arm64: lazily create perf events on enable

2019-01-29 Thread Suzuki K Poulose

On 28/01/2019 14:28, Andrew Murray wrote:

On Tue, Jan 22, 2019 at 10:12:22PM +, Suzuki K Poulose wrote:

Hi Andrew,

On 01/22/2019 10:49 AM, Andrew Murray wrote:

To prevent re-creating perf events everytime the counter registers
are changed, let's instead lazily create the event when the event
is first enabled and destroy it when it changes.

Signed-off-by: Andrew Murray 




---
   virt/kvm/arm/pmu.c | 114 
-
   1 file changed, 78 insertions(+), 36 deletions(-)

diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 4464899..1921ca9 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -24,8 +24,11 @@
   #include 
   #include 
-static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 data,
- u64 select_idx);
+static void kvm_pmu_reenable_enabled_single(struct kvm_vcpu *vcpu, u64 pair);


I find the approach good. However the function names are a bit odd and
it makes the code read a bit difficult.


Thanks - the odd naming probably came about as I started with a patch that
added chained PMU support and worked backward to split it into smaller patches
that each made individual sense. The _single suffix was the counterpart of
_pair.



I think we could :

1) Rename the existing
  kvm_pmu_{enable/disable}_counter => kvm_pmu_{enable/disable}_[mask or
counters ]
as they operate on a set of counters (as a mask) instead of a single
counter.
And then you may be able to drop "_single" from
kvm_pmu_{enable/disable}_counter"_single() functions below, which makes
better sense for what they do.


Thanks for this suggestion. I like this.




+static void kvm_pmu_counter_create_enabled_perf_event(struct kvm_vcpu *vcpu,
+ u64 select_idx);


Could we simply keep kvm_pmu_counter_create_event() and add a comment above
the function explaining that the events are enabled as they are
created lazily ?


OK.






+ * kvm_pmu_reenable_enabled_single - reenable a counter if it should be enabled
+ * @vcpu: The vcpu pointer
+ * @select_idx: The counter index
+ */
+static void kvm_pmu_reenable_enabled_single(struct kvm_vcpu *vcpu,
+   u64 select_idx)
+{
+   u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+   u64 set = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
+
+   if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
+   return;
+
+   if (set & BIT(select_idx))
+   kvm_pmu_enable_counter_single(vcpu, select_idx);


Could we not reuse kvm_pmu_enable_counter() here :
i.e,
static inline void kvm_pmu_reenable_counter(struct kvm_vcpu *vcpu, u64
select_idx)
{
kvm_pmu_enable_counter(vcpu, BIT(select_idx));
}



Not quite - when we call kvm_pmu_reenable_enabled_single the individual
counter may or may not be enabled. We only want to recreate the perf event
if it was previously enabled.

But we can do better, e.g.

static void kvm_pmu_reenable_enabled_single(struct kvm_vcpu *vcpu,
 u64 select_idx)


nit: could we use the name kvm_pmu_reenable_counter() ?


{
 u64 set = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);

 if (set & BIT(select_idx))
 kvm_pmu_enable_counter_mask(vcpu, BIT(select_idx));
}


Yep, thats correct. Alternatively, you could move the CNTENSET check
into the kvm_pmu_enable_counter_mask().

Suzuki
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Kernel boot regression with PAuth and aarch64-softmmu -cpu max and el2 enabled

2019-01-29 Thread Alex Bennée

Hi,

Following up on yesterday's discussion on IRC I thought I'd better
report on my findings in the permanent record so things don't get lost.

As I tend to periodically rebuild my test kernels from the current
state of linux.git I occasionally run into these things. My test
invocation is:

  qemu-system-aarch64 -machine type=virt,virtualization=on \
  -display none -m 4096 -serial mon:stdio \
  -kernel ../../kernel-v8-plain.build/arch/arm64/boot/Image 
\
  -append 'console=ttyAMA0 panic=-1' -no-reboot -cpu max

The kernel is essentially a defconfig kernel with a bunch of the VIRTIO
device drivers built-in for when I actually boot a more complex setup
with disks and drives. However this is a boot test so doesn't really
matter.

The -machine type=virt,virtualization=on enables our virt machine model
with EL2 turned on. As there is no BIOS involved the kernel is invoked
directly at EL2.

The -cpu max enabled a cortex-a57 + whatever extra features we've
enabled in QEMU so far. It won't match any "real" CPU but it should be
architecturally correct in so far we implement prerequisite features for
any given feature. The cpuid feature bits should also be correct as we
test them internally in QEMU to enable features.

The breakage is the kernel never boots (no output on serial port) and on
attaching with gdb I found it stuck in:

  (gdb) bt
  #0  0xff8010a9e480 in overflow_stack ()
  Backtrace stopped: not enough registers or memory available to unwind further

If I turn on exception tracing it looks like we go into an exception
loop.

On the QEMU side this breakage comes in at:

  commit 1ce32e47db52e3511132c7104770eae65d412144 (HEAD, refs/bisect/bad)
  Author: Richard Henderson 
  Date:   Mon Jan 21 10:23:13 2019 +

  target/arm: Enable PAuth for -cpu max

  Reviewed-by: Peter Maydell 
  Signed-off-by: Richard Henderson 
  Message-id: 20190108223129.5570-30-richard.hender...@linaro.org
  Signed-off-by: Peter Maydell 

and as you would expect the system boots fine with -cpu cortex-a57

On the kernel side it breaks at:

  commit 04ca3204fa09f5f55c8f113b0072004a7b364ff4
  Author: Mark Rutland 
  Date:   Fri Dec 7 18:39:30 2018 +

  arm64: enable pointer authentication

  Now that all the necessary bits are in place for userspace, add the
  necessary Kconfig logic to allow this to be enabled.

  Signed-off-by: Mark Rutland 
  Signed-off-by: Kristina Martsenko 
  Cc: Catalin Marinas 
  Cc: Will Deacon 
  Signed-off-by: Will Deacon 

So predictably we failed at enabling PAuth somewhere between the kernel
and QEMU.

I'm guessing the kernel so far has been tested on the fast model with a
full chain of TF, UEFI and kernel?

I think Richard's tests were without EL2 enabled.

So in the case that the kernel boots in EL2 is it expecting anyone else
to deal with Pauth exceptions or should it be able to cope with an
enabled Pauth but no firmware underneath it?

Either we've got something wrong or we'll need to rethink what features
the user can have enabled by -cpu max on a direct kernel boot.

--
Alex Bennée
___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [PATCH 2/4] KVM: arm/arm64: re-create event when setting counter value

2019-01-29 Thread Suzuki K Poulose

Hi Andrew,

On 28/01/2019 11:47, Andrew Murray wrote:

On Tue, Jan 22, 2019 at 02:18:17PM +, Suzuki K Poulose wrote:

Hi Andrew

On 01/22/2019 10:49 AM, Andrew Murray wrote:

The perf event sample_period is currently set based upon the current
counter value, when PMXEVTYPER is written to and the perf event is created.
However the user may choose to write the type before the counter value in
which case sample_period will be set incorrectly. Let's instead decouple
event creation from PMXEVTYPER and (re)create the event in either
situation.

Signed-off-by: Andrew Murray 


The approach looks fine to me. However this patch seems to introduce a
memory leak, see below, which you may be addressing in a later patch in the
series. But this will affect bisecting issues.


See below, I don't think this is true.


You're right. Sorry for the noise.






---
   virt/kvm/arm/pmu.c | 39 ++-
   1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 531d27f..4464899 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -24,6 +24,8 @@
   #include 
   #include 
+static void kvm_pmu_create_perf_event(struct kvm_vcpu *vcpu, u64 data,
+ u64 select_idx);


Could we just pass the counter index (i.e, select_idx) after updating
the event_type/counter value in the respective functions.


Unless I misunderstand we need the value of 'data' as it is used to
populate the function local perf_event_attr structure.


Yes, we do program the "data" (which is the event code) in attr.config.
So, since this is now a helper routine, the name "data" doesn't make any
sense.



However it is possible to instead read 'data' from __vcpu_sys_reg in
kvm_pmu_create_perf_event instead of the call site. However
kvm_pmu_set_counter_event_type would have to set the value of
__vcpu_sys_reg from its data argument (as __vcpu_sys_reg normally gets
set after kvm_pmu_set_counter_event_type returns). This is OK as we
do this in the next patch in this series anyway - so perhaps I can
bring that forward to this patch?


Yes, please.

Cheers
Suzuki


___
kvmarm mailing list
kvmarm@lists.cs.columbia.edu
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm


Re: [kvm-unit-tests PATCH 2/7] lib: arm: Remove warning about uart0_base mismatch

2019-01-29 Thread Andrew Jones
On Mon, Jan 28, 2019 at 05:58:54PM +, Andre Przywara wrote:
> On Mon, 28 Jan 2019 17:31:01 +0100
> Andrew Jones  wrote:
> 
> > On Mon, Jan 28, 2019 at 02:24:29PM +, Alexandru Elisei wrote:
> > > On 1/25/19 4:47 PM, Andrew Jones wrote:  
> > > > On Fri, Jan 25, 2019 at 04:36:13PM +, Alexandru Elisei
> > > > wrote:  
> > > >> On 1/24/19 12:37 PM, Andrew Jones wrote:  
> > > >>> On Thu, Jan 24, 2019 at 11:59:43AM +, Andre Przywara
> > > >>> wrote:  
> > >  On Thu, 24 Jan 2019 11:16:29 +
> > >  Alexandru Elisei  wrote:
> > >   
> > > > A warning is displayed if uart0_base is different from what
> > > > the code expects qemu to generate for the pl011 UART in the
> > > > device tree. However, now we support the ns16550a UART
> > > > emulated by kvmtool, which has a different address. This
> > > > leads to the warning being displayed even though the UART is
> > > > configured and working as expected.
> > > >
> > > > Now that we support multiple UARTs, the warning serves no
> > > > purpose, so remove it.  
> > >  Mmh, but we use that address before, right? So for anything not
> > >  emulating an UART at this QEMU specific address we write to
> > >  some random (device) memory?
> > > 
> > >  Drew, how important is this early print feature for
> > >  kvm-unit-tests?  
> > > >>> The setup code passes through quite a few asserts before
> > > >>> getting through io_init() (including in uart0_init), so I think
> > > >>> there's still value in having a guessed UART address. Maybe we
> > > >>> can provide guesses for both QEMU and kvmtool, and some
> > > >>> selection method, that would be used until we've properly
> > > >>> assigned uart0_base from DT? 
> > >  Cheers,
> > >  Andre.
> > >   
> > > > Signed-off-by: Alexandru Elisei 
> > > > ---
> > > >  lib/arm/io.c | 6 --
> > > >  1 file changed, 6 deletions(-)
> > > >
> > > > diff --git a/lib/arm/io.c b/lib/arm/io.c
> > > > index 35fc05aeb4db..87435150f73e 100644
> > > > --- a/lib/arm/io.c
> > > > +++ b/lib/arm/io.c
> > > > @@ -61,12 +61,6 @@ static void uart0_init(void)
> > > > }
> > > >  
> > > > uart0_base = ioremap(base.addr, base.size);
> > > > -
> > > > -   if (uart0_base != (u8 *)UART_EARLY_BASE) {
> > > > -   printf("WARNING: early print support may not
> > > > work. "
> > > > -  "Found uart at %p, but early base is
> > > > %p.\n",
> > > > -   uart0_base, (u8 *)UART_EARLY_BASE);
> > > > -   }
> > > >  }  
> > > >>> This warning is doing what it should, which is pointing out
> > > >>> that the UART_EARLY_BASE guess appears to be wrong. If we can
> > > >>> provide a way to support more than one guess, then we should
> > > >>> keep this warning but adjust it to match one of any of the
> > > >>> guesses.
> > > >>>
> > > >>> Thanks,
> > > >>> drew  
> > > >> I'm not really sure how to implement a selection method. I've
> > > >> looked at splitting io_init() into uart0_init() and
> > > >> chr_testdev_init() and calling uart0_init() very early in the
> > > >> setup process, but uart0_init() itself uses printf() and
> > > >> assert().
> > > >>
> > > >> I've also thought about adding another function, something like
> > > >> uart0_early_init(), that is called very early in setup() and
> > > >> gets the base address from the dtb bootargs. But that means
> > > >> calling dt_init() and dt_get_bootargs(), which can fail.
> > > >>
> > > >> One other option that could work is to make it a compile-time
> > > >> configuration.
> > > >>
> > > >> What do you think?
> > > >>  
> > > > Compile-time is fine, which I guess will result in a new
> > > > configure script option as well. I wonder if we shouldn't
> > > > consider generating a config.h file with stuff like this rather
> > > > than adding another -D to the compile line.
> > > >
> > > > drew   
> > > 
> > > I propose a new configuration option called --vmm, with possible
> > > values qemu and kvmtool, which defaults to qemu if not set.
> > > 
> > > Another possibility would be to have an --uart-base option, but
> > > that means we are expecting the user to be aware of the uart base
> > > address for the virtual machine manager, which might be
> > > unreasonable.
> > > 
> > > This is a quick prototype of how using -D for conditional
> > > compilation would look like (the configure changes are included
> > > too):
> > > 
> > > diff --git a/configure b/configure
> > > index df8581e3a906..7a56ba47707f 100755
> > > --- a/configure
> > > +++ b/configure
> > > @@ -70,6 +70,9 @@ while [[ "$1" = -* ]]; do
> > >     ;;
> > >     --ld)
> > >     ld="$arg"
> > > +    ;;
> > > +    --vmm)
> > > +    vmm="$arg"
> > >     ;;
> > >     --enable-pretty-print-stacks)
> > >     pretty_print_stacks=yes
> > > @@ -108,6 +111,14 @@ if [ "$arch" = "i386" ] || [ 

Re: [PATCH 1/3] KVM: x86: Move mmu_memory_cache functions to common code

2019-01-29 Thread Sean Christopherson
On Mon, Jan 28, 2019 at 03:48:41PM +0100, Christoffer Dall wrote:
> We are currently duplicating the mmu memory cache functionality quite
> heavily between the architectures that support KVM.  As a first step,
> move the x86 implementation (which seems to have the most recently
> maintained version of the mmu memory cache) to common code.
> 
> We rename the functions and data types to have a kvm_ prefix for
> anything exported as a symbol to the rest of the kernel and take the
> chance to rename memory_cache to memcache to avoid overly long lines.

It'd be helpful to do the rename in a separate patch so that the move
really is a straight move of code.

> This is a bit tedious on the callsites but ends up looking more
> palatable.
> 
> We also introduce an arch-specific kvm_types.h which can be used to
> define the architecture-specific GFP flags for allocating memory to the
> memory cache, and to specify how many objects are required in the memory
> cache.  These are the two points where the current implementations
> diverge across architectures.  Since kvm_host.h defines structures with
> fields of the memcache object, we define the memcache structure in
> kvm_types.h, and we include the architecture-specific kvm_types.h to
> know the size of object in kvm_host.h.
> 
> This patch currently only defines the structure and requires valid
> defines in the architecture-specific kvm_types.h when KVM_NR_MEM_OBJS is
> defined.  As we move each architecture to the common implementation,
> this condition will eventually go away.
> 
> Signed-off-by: Christoffer Dall 
> ---
>  arch/arm/include/asm/kvm_types.h |  5 ++
>  arch/arm64/include/asm/kvm_types.h   |  6 ++
>  arch/mips/include/asm/kvm_types.h|  5 ++
>  arch/powerpc/include/asm/kvm_types.h |  5 ++
>  arch/s390/include/asm/kvm_types.h|  5 ++
>  arch/x86/include/asm/kvm_host.h  | 17 +
>  arch/x86/include/asm/kvm_types.h | 10 +++
>  arch/x86/kvm/mmu.c   | 97 ++--
>  arch/x86/kvm/paging_tmpl.h   |  4 +-
>  include/linux/kvm_host.h |  9 +++
>  include/linux/kvm_types.h| 12 
>  virt/kvm/kvm_main.c  | 58 +
>  12 files changed, 139 insertions(+), 94 deletions(-)
>  create mode 100644 arch/arm/include/asm/kvm_types.h
>  create mode 100644 arch/arm64/include/asm/kvm_types.h
>  create mode 100644 arch/mips/include/asm/kvm_types.h
>  create mode 100644 arch/powerpc/include/asm/kvm_types.h
>  create mode 100644 arch/s390/include/asm/kvm_types.h
>  create mode 100644 arch/x86/include/asm/kvm_types.h
> 
> diff --git a/arch/arm/include/asm/kvm_types.h 
> b/arch/arm/include/asm/kvm_types.h
> new file mode 100644
> index ..bc389f82e88d
> --- /dev/null
> +++ b/arch/arm/include/asm/kvm_types.h
> @@ -0,0 +1,5 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ARM_KVM_TYPES_H
> +#define _ASM_ARM_KVM_TYPES_H
> +
> +#endif /* _ASM_ARM_KVM_TYPES_H */
> diff --git a/arch/arm64/include/asm/kvm_types.h 
> b/arch/arm64/include/asm/kvm_types.h
> new file mode 100644
> index ..d0987007d581
> --- /dev/null
> +++ b/arch/arm64/include/asm/kvm_types.h
> @@ -0,0 +1,6 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_ARM64_KVM_TYPES_H
> +#define _ASM_ARM64_KVM_TYPES_H
> +
> +#endif /* _ASM_ARM64_KVM_TYPES_H */
> +
> diff --git a/arch/mips/include/asm/kvm_types.h 
> b/arch/mips/include/asm/kvm_types.h
> new file mode 100644
> index ..5efeb32a5926
> --- /dev/null
> +++ b/arch/mips/include/asm/kvm_types.h
> @@ -0,0 +1,5 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_MIPS_KVM_TYPES_H
> +#define _ASM_MIPS_KVM_TYPES_H
> +
> +#endif /* _ASM_MIPS_KVM_TYPES_H */
> diff --git a/arch/powerpc/include/asm/kvm_types.h 
> b/arch/powerpc/include/asm/kvm_types.h
> new file mode 100644
> index ..f627eceaa314
> --- /dev/null
> +++ b/arch/powerpc/include/asm/kvm_types.h
> @@ -0,0 +1,5 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_POWERPC_KVM_TYPES_H
> +#define _ASM_POWERPC_KVM_TYPES_H
> +
> +#endif /* _ASM_POWERPC_KVM_TYPES_H */
> diff --git a/arch/s390/include/asm/kvm_types.h 
> b/arch/s390/include/asm/kvm_types.h
> new file mode 100644
> index ..b66a81f8a354
> --- /dev/null
> +++ b/arch/s390/include/asm/kvm_types.h
> @@ -0,0 +1,5 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef _ASM_S390_KVM_TYPES_H
> +#define _ASM_S390_KVM_TYPES_H
> +
> +#endif /* _ASM_S390_KVM_TYPES_H */
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index 4660ce90de7f..5c12cba8c2b1 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -179,8 +179,6 @@ enum {
>  
>  #include 
>  
> -#define KVM_NR_MEM_OBJS 40
> -
>  #define KVM_NR_DB_REGS   4
>  
>  #define DR6_BD   (1 << 13)
> @@ -238,15 +236,6 @@ enum {
>  
>  struct kvm_kernel_irq_routing_entry;
>  
> -/*
> - * We don't want allocation failures within the mmu 

Re: [PATCH 4/4] KVM: arm/arm64: support chained PMU counters

2019-01-29 Thread Julien Thierry



On 28/01/2019 17:13, Andrew Murray wrote:
> On Tue, Jan 22, 2019 at 02:59:48PM +, Julien Thierry wrote:
>> Hi Andrew
>>
>> On 22/01/2019 10:49, Andrew Murray wrote:
>>> Emulate chained PMU counters by creating a single 64 bit event counter
>>> for a pair of chained KVM counters.
>>>
>>> Signed-off-by: Andrew Murray 
>>> ---
>>>  include/kvm/arm_pmu.h |   2 +
>>>  virt/kvm/arm/pmu.c| 308 
>>> +-
>>>  2 files changed, 258 insertions(+), 52 deletions(-)
>>>
>>> diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h
>>> index f87fe20..d4f3b28 100644
>>> --- a/include/kvm/arm_pmu.h
>>> +++ b/include/kvm/arm_pmu.h
>>> @@ -29,6 +29,8 @@ struct kvm_pmc {
>>> u8 idx; /* index into the pmu->pmc array */
>>> struct perf_event *perf_event;
>>> u64 bitmask;
>>> +   u64 sample_period;
>>> +   u64 left;
>>>  };
>>>  
>>>  struct kvm_pmu {
>>> diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
>>> index 1921ca9..d111d5b 100644
>>> --- a/virt/kvm/arm/pmu.c
>>> +++ b/virt/kvm/arm/pmu.c
>>> @@ -24,10 +24,26 @@
>>>  #include 
>>>  #include 
>>>  
>>> +#define ARMV8_PMUV3_PERFCTR_CHAIN 0x1E
>>> +static void kvm_pmu_stop_release_perf_event_pair(struct kvm_vcpu *vcpu,
>>> +   u64 pair_low);
>>> +static void kvm_pmu_stop_release_perf_event_single(struct kvm_vcpu *vcpu,
>>> + u64 select_idx);
>>> +static void kvm_pmu_reenable_enabled_pair(struct kvm_vcpu *vcpu, u64 
>>> pair_low);
>>>  static void kvm_pmu_reenable_enabled_single(struct kvm_vcpu *vcpu, u64 
>>> pair);
>>>  static void kvm_pmu_counter_create_enabled_perf_event(struct kvm_vcpu 
>>> *vcpu,
>>>   u64 select_idx);
>>> -static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, struct kvm_pmc 
>>> *pmc);
>>> +
>>> +/**
>>> + * kvm_pmu_counter_is_high_word - is select_idx high counter of 64bit event
>>> + * @pmc: The PMU counter pointer
>>> + * @select_idx: The counter index
>>> + */
>>> +static inline bool kvm_pmu_counter_is_high_word(struct kvm_pmc *pmc)
>>> +{
>>> +   return ((pmc->perf_event->attr.config1 & 0x1)
>>> +   && (pmc->idx % 2));
>>> +}
>>>  
>>>  /**
>>>   * kvm_pmu_get_counter_value - get PMU counter value
>>> @@ -36,7 +52,7 @@ static void kvm_pmu_stop_counter(struct kvm_vcpu *vcpu, 
>>> struct kvm_pmc *pmc);
>>>   */
>>>  u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, u64 select_idx)
>>>  {
>>> -   u64 counter, reg, enabled, running;
>>> +   u64 counter, reg, enabled, running, incr;
>>> struct kvm_pmu *pmu = >arch.pmu;
>>> struct kvm_pmc *pmc = >pmc[select_idx];
>>>  
>>> @@ -47,14 +63,53 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, 
>>> u64 select_idx)
>>> /* The real counter value is equal to the value of counter register plus
>>>  * the value perf event counts.
>>>  */
>>> -   if (pmc->perf_event)
>>> -   counter += perf_event_read_value(pmc->perf_event, ,
>>> +   if (pmc->perf_event) {
>>> +   incr = perf_event_read_value(pmc->perf_event, ,
>>>  );
>>>  
>>> +   if (kvm_pmu_counter_is_high_word(pmc))
>>> +   incr = upper_32_bits(incr);
>>> +   counter += incr;
>>> +   }
>>> +
>>> return counter & pmc->bitmask;
>>>  }
>>>  
>>>  /**
>>> + * kvm_pmu_counter_is_enabled - is a counter active
>>> + * @vcpu: The vcpu pointer
>>> + * @select_idx: The counter index
>>> + */
>>> +static bool kvm_pmu_counter_is_enabled(struct kvm_vcpu *vcpu, u64 
>>> select_idx)
>>> +{
>>> +   u64 mask = kvm_pmu_valid_counter_mask(vcpu);
>>> +
>>> +   return (__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E) &&
>>> +  (__vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask & BIT(select_idx));
>>> +}
>>> +
>>> +/**
>>> + * kvnm_pmu_event_is_chained - is a pair of counters chained and enabled
>>> + * @vcpu: The vcpu pointer
>>> + * @select_idx: The low counter index
>>> + */
>>> +static bool kvm_pmu_event_is_chained(struct kvm_vcpu *vcpu, u64 pair_low)
>>> +{
>>> +   u64 eventsel;
>>> +
>>> +   eventsel = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + pair_low + 1) &
>>> +   ARMV8_PMU_EVTYPE_EVENT;
>>> +   if (eventsel != ARMV8_PMUV3_PERFCTR_CHAIN)
>>> +   return false;
>>> +
>>> +   if (kvm_pmu_counter_is_enabled(vcpu, pair_low) !=
>>> +   kvm_pmu_counter_is_enabled(vcpu, pair_low + 1))
>>> +   return false;
>>> +
>>> +   return true;
>>> +}
>>> +
>>> +/**
>>>   * kvm_pmu_set_counter_value - set PMU counter value
>>>   * @vcpu: The vcpu pointer
>>>   * @select_idx: The counter index
>>> @@ -62,29 +117,45 @@ u64 kvm_pmu_get_counter_value(struct kvm_vcpu *vcpu, 
>>> u64 select_idx)
>>>   */
>>>  void kvm_pmu_set_counter_value(struct kvm_vcpu *vcpu, u64 select_idx, u64 
>>> val)
>>>  {
>>> -   u64 reg;
>>> -   struct kvm_pmu *pmu = >arch.pmu;
>>> -   struct kvm_pmc *pmc = >pmc[select_idx];
>>> +   u64 reg, pair_low;
>>>