[PATCH v4 16/16] kvm/arm: Align the VMID allocation with the arm64 ASID one

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

At the moment, the VMID algorithm will send an SGI to all the CPUs to
force an exit and then broadcast a full TLB flush and I-Cache
invalidation.

This patch re-uses the new ASID allocator. The
benefits are:
- CPUs are not forced to exit at roll-over. Instead the VMID will be
marked reserved and the context will be flushed at next exit. This
will reduce the IPI traffic.
- Context invalidation is now per-CPU rather than broadcast.
- Catalin has a formal model of the ASID allocator.

With the new algorithm, the code is adapted as follows (a rough sketch
of the resulting check is given after the list):
- The function __kvm_flush_vm_context() has been renamed to
__kvm_tlb_flush_local_all() and now only flushes the current CPU's
context.
- The call to update_vmid() will be done with preemption disabled
as the new algorithm requires information to be stored per CPU.
- The TLBs associated with EL1 will be flushed when booting a CPU to
deal with stale information. This was previously done on the
allocation of the first VMID of a new generation.
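As a rough sketch, and assuming the library helpers introduced earlier in
the series, the per-vCPU check on the entry path could look like the
following (the KVM-side allocator instance "vmid_info" is named here only
for illustration; this is not the literal diff):

#include <asm/lib_asid.h>

static struct asid_info vmid_info;

static void update_vmid(struct kvm_vmid *vmid)
{
	/* Caller must run with preemption disabled. */
	asid_check_context(&vmid_info, &vmid->id, NULL);
}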


Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
Test Results:

v4:
The measurement was made on a HiSilicon D06 platform with maxcpus set
to 8 and with the number of VMIDs limited to 4 bits. The test involves
running 40 guests with 2 vCPUs concurrently. Each guest will then
execute hackbench 5 times before exiting.

The performance differences between the current algorithm and the new one
are (avg. of 10 runs):
   - 1.9% fewer entries/exits from the guest
   - 0.7% faster

v3:
The measurement was made on a Seattle-based SoC (8 CPUs), with the
number of VMIDs limited to 4 bits. The test involves running 40
guests with 2 vCPUs concurrently. Each guest will then execute hackbench
5 times before exiting.

The performance differences between the current algorithm and the new one are:
- 2.5% fewer exits from the guest
- 22.4% more flushes, although they are now local rather than
broadcast
- 0.11% faster (just for the record)

---
 arch/arm64/include/asm/kvm_asm.h   |   4 +-
 arch/arm64/include/asm/kvm_host.h  |   5 +-
 arch/arm64/include/asm/kvm_mmu.h   |   3 +-
 arch/arm64/kvm/arm.c   | 124 +++--
 arch/arm64/kvm/hyp/nvhe/hyp-main.c |   6 +-
 arch/arm64/kvm/hyp/nvhe/tlb.c  |  10 +--
 arch/arm64/kvm/hyp/vhe/tlb.c   |  10 +--
 arch/arm64/kvm/mmu.c   |   1 -
 8 files changed, 65 insertions(+), 98 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index a7ab84f781f7..29697c5ab2c2 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -44,7 +44,7 @@
 
 #define __KVM_HOST_SMCCC_FUNC___kvm_hyp_init   0
 #define __KVM_HOST_SMCCC_FUNC___kvm_vcpu_run   1
-#define __KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context   2
+#define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_local_all   2
 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid_ipa 3
 #define __KVM_HOST_SMCCC_FUNC___kvm_tlb_flush_vmid 4
 #define __KVM_HOST_SMCCC_FUNC___kvm_flush_cpu_context  5
@@ -182,7 +182,7 @@ DECLARE_KVM_NVHE_SYM(__per_cpu_end);
 DECLARE_KVM_HYP_SYM(__bp_harden_hyp_vecs);
 #define __bp_harden_hyp_vecs   CHOOSE_HYP_SYM(__bp_harden_hyp_vecs)
 
-extern void __kvm_flush_vm_context(void);
+extern void __kvm_tlb_flush_local_all(void);
 extern void __kvm_flush_cpu_context(struct kvm_s2_mmu *mmu);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm_s2_mmu *mmu, phys_addr_t ipa,
 int level);
diff --git a/arch/arm64/include/asm/kvm_host.h 
b/arch/arm64/include/asm/kvm_host.h
index 3d10e6527f7d..5309216e4a94 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -70,9 +70,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
 void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
 
 struct kvm_vmid {
-   /* The VMID generation used for the virt. memory system */
-   u64 vmid_gen;
-   u32 vmid;
+   atomic64_t id;
 };
 
 struct kvm_s2_mmu {
@@ -631,7 +629,6 @@ void kvm_arm_resume_guest(struct kvm *kvm);
ret;\
})
 
-void force_vm_exit(const cpumask_t *mask);
 void kvm_mmu_wp_memory_region(struct kvm *kvm, int slot);
 
 int handle_exit(struct kvm_vcpu *vcpu, int exception_index);
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index c3080966ef83..43e83df87e3a 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -252,7 +252,8 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu 
*mmu)
u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0;
 
baddr = mmu->pgd_phys;
-   vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT;
+   vmid_field = atomic64_read(&vmid->id) << VTTBR_VMID_SHIFT;
+   vmid_field &= VTTBR_VMID_MASK(kvm_get_vmid_bits());
return kvm_phys_to_

[PATCH v4 14/16] arm64/lib: Add a helper to free memory allocated by the ASID allocator

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

Some users of the ASID allocator (e.g. VMID) may need to free any
resources if the initialization fails. So introduce a function that
allows freeing of any memory allocated by the ASID allocator.
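For instance, a hypothetical caller setting up a VMID allocator could then
unwind cleanly when a later step fails (the instance name "vmid_info" and
the function "hypothetical_next_init_step" are illustrative only):

	ret = asid_allocator_init(&vmid_info, kvm_get_vmid_bits(), false);
	if (ret)
		return ret;

	ret = hypothetical_next_init_step();
	if (ret) {
		/* kfree(NULL) is safe, so this also covers partial set-up. */
		asid_allocator_free(&vmid_info);
		return ret;
	}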

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
 arch/arm64/include/asm/lib_asid.h | 2 ++
 arch/arm64/lib/asid.c | 6 ++
 2 files changed, 8 insertions(+)

diff --git a/arch/arm64/include/asm/lib_asid.h 
b/arch/arm64/include/asm/lib_asid.h
index acae8d243d17..4dbc0a3f19a6 100644
--- a/arch/arm64/include/asm/lib_asid.h
+++ b/arch/arm64/include/asm/lib_asid.h
@@ -82,4 +82,6 @@ void asid_context_pinned_put(struct asid_info *info, 
atomic64_t *pasid,
 refcount_t *pinned);
 int asid_allocator_init(struct asid_info *info, u32 bits, bool pinned);
 
+void asid_allocator_free(struct asid_info *info);
+
 #endif
diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
index 286285616f65..7bd031f9516a 100644
--- a/arch/arm64/lib/asid.c
+++ b/arch/arm64/lib/asid.c
@@ -256,3 +256,9 @@ int asid_allocator_init(struct asid_info *info, u32 bits, 
bool pinned)
 
return 0;
 }
+
+void asid_allocator_free(struct asid_info *info)
+{
+   kfree(info->map);
+   kfree(info->pinned_map);
+}
-- 
2.17.1



[PATCH v4 15/16] arch/arm64: Introduce a capability to tell whether 16-bit VMID is available

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

At the moment, the function kvm_get_vmid_bits() looks up the
sanitized value of ID_AA64MMFR1_EL1 and extracts the information
regarding the number of VMID bits supported.

This is fine as the function is mainly used during VMID roll-over. A new
use in a follow-up patch will require the function to be called at every
context switch, so we want the function to be more efficient.

A new capability is introduced to tell whether 16-bit VMID is
available.
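The gain comes from cpus_have_const_cap() being implemented with a static
key once the capabilities have been finalized, so the helper below (mirroring
the diff) costs a patched branch on the hot path instead of a sanitized
sysreg read:

static inline unsigned int kvm_get_vmid_bits(void)
{
	/* Resolves to a static branch after boot. */
	return cpus_have_const_cap(ARM64_HAS_16BIT_VMID) ? 16 : 8;
}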

Signed-off-by: Julien Grall 
---
 arch/arm64/include/asm/cpucaps.h | 3 ++-
 arch/arm64/include/asm/kvm_mmu.h | 4 +---
 arch/arm64/kernel/cpufeature.c   | 9 +
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index c40f2490cd7b..acb92da5c254 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -67,7 +67,8 @@
 #define ARM64_HAS_LDAPR   59
 #define ARM64_KVM_PROTECTED_MODE   60
 #define ARM64_WORKAROUND_NVIDIA_CARMEL_CNP 61
+#define ARM64_HAS_16BIT_VMID   62
 
-#define ARM64_NCAPS   62
+#define ARM64_NCAPS   63
 
 #endif /* __ASM_CPUCAPS_H */
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 90873851f677..c3080966ef83 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -213,9 +213,7 @@ void kvm_toggle_cache(struct kvm_vcpu *vcpu, bool 
was_enabled);
 
 static inline unsigned int kvm_get_vmid_bits(void)
 {
-   int reg = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
-
-   return get_vmid_bits(reg);
+   return cpus_have_const_cap(ARM64_HAS_16BIT_VMID) ? 16 : 8;
 }
 
 /*
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index e5281e1c8f1d..ff956fb2f712 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2203,6 +2203,15 @@ static const struct arm64_cpu_capabilities 
arm64_features[] = {
.matches = has_cpuid_feature,
.min_field_value = 1,
},
+   {
+   .capability = ARM64_HAS_16BIT_VMID,
+   .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+   .sys_reg = SYS_ID_AA64MMFR1_EL1,
+   .field_pos = ID_AA64MMFR1_VMIDBITS_SHIFT,
+   .sign = FTR_UNSIGNED,
+   .min_field_value = ID_AA64MMFR1_VMIDBITS_16,
+   .matches = has_cpuid_feature,
+   },
{},
 };
 
-- 
2.17.1



[PATCH v4 13/16] arm64: Move the ASID allocator code in a separate file

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

We will want to re-use the ASID allocator in a separate context (e.g.
allocating VMIDs). So move the code into a new file.

The function asid_check_context has been moved into the header as a static
inline function because we want to avoid adding a branch when checking if the
ASID is still valid.
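A minimal sketch of how a second user of the library could wire itself up,
assuming only the interface shown in lib_asid.h below (all instance and
function names here are illustrative):

#include <linux/percpu.h>
#include <asm/lib_asid.h>
#include <asm/tlbflush.h>

static DEFINE_PER_CPU(atomic64_t, my_active_ctxts);
static DEFINE_PER_CPU(u64, my_reserved_ctxts);
static struct asid_info my_info;

static void my_flush_cpu_ctxt(void)
{
	local_flush_tlb_all();
}

static int my_allocator_init(void)
{
	my_info.active = &my_active_ctxts;
	my_info.reserved = &my_reserved_ctxts;
	my_info.flush_cpu_ctxt_cb = my_flush_cpu_ctxt;

	/* 8-bit context IDs, no pinned-context support. */
	return asid_allocator_init(&my_info, 8, false);
}

On the switch path the user would then call asid_check_context(&my_info,
&ctx_id, NULL) with preemption disabled, exactly as the mm code does.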

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
 arch/arm64/include/asm/lib_asid.h |  85 
 arch/arm64/lib/Makefile   |   2 +
 arch/arm64/lib/asid.c | 258 +
 arch/arm64/mm/context.c   | 310 +-
 4 files changed, 347 insertions(+), 308 deletions(-)
 create mode 100644 arch/arm64/include/asm/lib_asid.h
 create mode 100644 arch/arm64/lib/asid.c

diff --git a/arch/arm64/include/asm/lib_asid.h 
b/arch/arm64/include/asm/lib_asid.h
new file mode 100644
index ..acae8d243d17
--- /dev/null
+++ b/arch/arm64/include/asm/lib_asid.h
@@ -0,0 +1,85 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __ASM_ASM_LIB_ASID_H
+#define __ASM_ASM_LIB_ASID_H
+
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct asid_info {
+   atomic64_t  generation;
+   unsigned long   *map;
+   unsigned intmap_idx;
+   atomic64_t __percpu *active;
+   u64 __percpu*reserved;
+   u32 bits;
+   raw_spinlock_t  lock;
+   /* Which CPU requires context flush on next call */
+   cpumask_t   flush_pending;
+   /* Pinned ASIDs info */
+   unsigned long   *pinned_map;
+   unsigned long   max_pinned_asids;
+   unsigned long   nr_pinned_asids;
+   /* Callback to locally flush the context. */
+   void(*flush_cpu_ctxt_cb)(void);
+   /* Callback to set the list of reserved ASIDs */
+   void(*set_reserved_bits)(struct asid_info *info);
+};
+
+#define NUM_CTXT_ASIDS(info)   (1UL << ((info)->bits))
+
+#define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
+#define asid_gen_match(asid, info) \
+   (!(((asid) ^ atomic64_read(&(info)->generation)) >> info->bits))
+
+void asid_new_context(struct asid_info *info, atomic64_t *pasid,
+ refcount_t *pinned, unsigned int cpu);
+
+/*
+ * Check the ASID is still valid for the context. If not generate a new ASID.
+ *
+ * @pasid: Pointer to the current ASID batch
+ * @pinned: refcount if asid is pinned
+ */
+static inline void asid_check_context(struct asid_info *info, atomic64_t 
*pasid,
+ refcount_t *pinned)
+{
+   unsigned int cpu;
+   u64 asid, old_active_asid;
+
+   asid = atomic64_read(pasid);
+
+   /*
+* The memory ordering here is subtle.
+* If our active_asid is non-zero and the ASID matches the current
+* generation, then we update the active_asid entry with a relaxed
+* cmpxchg. Racing with a concurrent rollover means that either:
+*
+* - We get a zero back from the cmpxchg and end up waiting on the
+*   lock. Taking the lock synchronises with the rollover and so
+*   we are forced to see the updated generation.
+*
+* - We get a valid ASID back from the cmpxchg, which means the
+*   relaxed xchg in flush_context will treat us as reserved
+*   because atomic RmWs are totally ordered for a given location.
+*/
+   old_active_asid = atomic64_read(this_cpu_ptr(info->active));
+   if (old_active_asid && asid_gen_match(asid, info) &&
+   atomic64_cmpxchg_relaxed(this_cpu_ptr(info->active),
+old_active_asid, asid))
+   return;
+
+   cpu = smp_processor_id();
+   asid_new_context(info, pasid, pinned, cpu);
+}
+
+unsigned long asid_context_pinned_get(struct asid_info *info,
+ atomic64_t *pasid,
+ refcount_t *pinned);
+void asid_context_pinned_put(struct asid_info *info, atomic64_t *pasid,
+refcount_t *pinned);
+int asid_allocator_init(struct asid_info *info, u32 bits, bool pinned);
+
+#endif
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index d31e1169d9b8..d42c66ce0460 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -5,6 +5,8 @@ lib-y   := clear_user.o delay.o copy_from_user.o
\
   memset.o memcmp.o strcmp.o strncmp.o strlen.o\
   strnlen.o strchr.o strrchr.o tishift.o
 
+lib-y  += asid.o
+
 ifeq ($(CONFIG_KERNEL_MODE_NEON), y)
 obj-$(CONFIG_XOR_BLOCKS)   += xor-neon.o
 CFLAGS_REMOVE_xor-neon.o   += -mgeneral-regs-only
diff --git a/arch/arm64/lib/asid.c b/arch/arm64/lib/asid.c
new file mode 100644
index ..286285616f65
--- /dev/null
+++ b/arch/arm64/lib/as

[PATCH v4 12/16] arm64/mm: Introduce a callback to set reserved bits

2021-04-14 Thread Shameer Kolothum
Setting the reserved ASID bits will vary depending on the actual
user of the ASID allocator. Introduce a new callback.
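As a sketch, a different user could install its own policy here. For
example, a hypothetical allocator that only needs to keep index 0 reserved
across a rollover might use (illustrative only, not part of this patch):

static void my_set_reserved_bits(struct asid_info *info)
{
	/* Start from a clean bitmap, then re-reserve index 0. */
	bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
	__set_bit(0, info->map);
}

and assign it to info->set_reserved_bits at initialization time.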

Signed-off-by: Shameer Kolothum 
---
 arch/arm64/mm/context.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index ee446f7535a3..e9049d14f54a 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -34,6 +34,8 @@ static struct asid_info
unsigned long   nr_pinned_asids;
/* Callback to locally flush the context. */
void(*flush_cpu_ctxt_cb)(void);
+   /* Callback to set the list of reserved ASIDs */
+   void(*set_reserved_bits)(struct asid_info *info);
 } asid_info;
 
 #define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
@@ -118,7 +120,8 @@ static void flush_context(struct asid_info *info)
u64 asid;
 
/* Update the list of reserved ASIDs and the ASID bitmap. */
-   set_reserved_asid_bits(info);
+   if (info->set_reserved_bits)
+   info->set_reserved_bits(info);
 
 for_each_possible_cpu(i) {
 asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
@@ -508,6 +511,7 @@ static int asids_init(void)
 info->active = &active_asids;
 info->reserved = &reserved_asids;
 info->flush_cpu_ctxt_cb = asid_flush_cpu_ctxt;
+   info->set_reserved_bits = set_reserved_asid_bits;
 
 /*
  * We cannot call set_reserved_asid_bits() here because CPU
-- 
2.17.1



[PATCH v4 11/16] arm64/mm: Introduce a callback to flush the local context

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

Flushing the local context will vary depending on the actual user
of the ASID allocator. Introduce a new callback to flush the local
context and move the local TLB flush call into it.
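As a sketch of why this matters: a non-mm user can install a different
flush. For instance, the VMID allocator added later in the series could
route the flush through a hypercall rather than local_flush_tlb_all()
(sketch only; the exact KVM callback is introduced by patch 16):

static void vmid_flush_cpu_ctxt(void)
{
	kvm_call_hyp(__kvm_tlb_flush_local_all);
}

with info->flush_cpu_ctxt_cb = vmid_flush_cpu_ctxt set at init time.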

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
 arch/arm64/mm/context.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 901472a57b5d..ee446f7535a3 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -32,6 +32,8 @@ static struct asid_info
unsigned long   *pinned_map;
unsigned long   max_pinned_asids;
unsigned long   nr_pinned_asids;
+   /* Callback to locally flush the context. */
+   void(*flush_cpu_ctxt_cb)(void);
 } asid_info;
 
 #define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
@@ -245,8 +247,9 @@ static void asid_new_context(struct asid_info *info, 
atomic64_t *pasid,
atomic64_set(pasid, asid);
}
 
-   if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
-   local_flush_tlb_all();
+   if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending) &&
+   info->flush_cpu_ctxt_cb)
+   info->flush_cpu_ctxt_cb();
 
atomic64_set(&active_asid(info, cpu), asid);
raw_spin_unlock_irqrestore(&info->lock, flags);
@@ -427,6 +430,11 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct 
mm_struct *mm)
post_ttbr_update_workaround();
 }
 
+static void asid_flush_cpu_ctxt(void)
+{
+   local_flush_tlb_all();
+}
+
 static int asids_update_limit(void)
 {
struct asid_info *info = &asid_info;
@@ -499,6 +507,7 @@ static int asids_init(void)
 
info->active = &active_asids;
info->reserved = &reserved_asids;
+   info->flush_cpu_ctxt_cb = asid_flush_cpu_ctxt;
 
/*
 * We cannot call set_reserved_asid_bits() here because CPU
-- 
2.17.1



[PATCH v4 09/16] arm64/mm: Split the function check_and_switch_context in 3 parts

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

The function check_and_switch_context is used to:
1) Check whether the ASID is still valid
2) Generate a new one if it is not valid
3) Switch the context

While the latter is specific to the MM subsystem, the rest could be part
of the generic ASID allocator.

After this patch, the function is split into 3 parts, corresponding
to the following functions:
1) asid_check_context: Check if the ASID is still valid
2) asid_new_context: Generate a new ASID for the context
3) check_and_switch_context: Call 1) and 2) and switch the context

1) and 2) have not been merged into a single function because we want to
avoid adding a branch when the ASID is still valid. This will matter
when the code is moved into a separate file later on, as 1) will reside
in the header as a static inline function.
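Condensed from the diff below, the flow after this patch roughly reads
(sketch only, with the function bodies elided):

void check_and_switch_context(struct mm_struct *mm)
{
	if (system_supports_cnp())
		cpu_set_reserved_ttbr0();

	/* 1) fast-path check; 2) slow-path allocation under the lock */
	asid_check_context(&asid_info, &mm->context.id, &mm->context.pinned);

	/* 3) the mm-specific context switch stays here */
	arm64_apply_bp_hardening();

	if (!system_uses_ttbr0_pan())
		cpu_switch_mm(mm->pgd, mm);
}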

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
v3 comment:
Will wants to avoid adding a branch when the ASID is still valid, so
1) and 2) are separate functions. The former will move to a new
header and become a static inline.
---
 arch/arm64/mm/context.c | 70 -
 1 file changed, 48 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 041c3c5e0216..40ef013c90c3 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -222,17 +222,49 @@ static u64 new_context(struct asid_info *info, atomic64_t 
*pasid,
return idx2asid(info, asid) | generation;
 }
 
-void check_and_switch_context(struct mm_struct *mm)
+/*
+ * Generate a new ASID for the context.
+ *
+ * @pasid: Pointer to the current ASID batch allocated. It will be updated
+ * with the new ASID batch.
+ * @pinned: refcount if asid is pinned.
+ * Caller needs to make sure preempt is disabled before calling this function.
+ */
+static void asid_new_context(struct asid_info *info, atomic64_t *pasid,
+refcount_t *pinned)
 {
unsigned long flags;
-   unsigned int cpu;
-   u64 asid, old_active_asid;
-   struct asid_info *info = &asid_info;
+   u64 asid;
+   unsigned int cpu = smp_processor_id();
 
-   if (system_supports_cnp())
-   cpu_set_reserved_ttbr0();
+   raw_spin_lock_irqsave(&info->lock, flags);
+   /* Check that our ASID belongs to the current generation. */
+   asid = atomic64_read(pasid);
+   if (!asid_gen_match(asid, info)) {
+   asid = new_context(info, pasid, pinned);
+   atomic64_set(pasid, asid);
+   }
 
-   asid = atomic64_read(&mm->context.id);
+   if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
+   local_flush_tlb_all();
+
+   atomic64_set(&active_asid(info, cpu), asid);
+   raw_spin_unlock_irqrestore(&info->lock, flags);
+}
+
+/*
+ * Check the ASID is still valid for the context. If not generate a new ASID.
+ *
+ * @pasid: Pointer to the current ASID batch
+ * @pinned: refcount if asid is pinned
+ * Caller needs to make sure preempt is disabled before calling this function.
+ */
+static void asid_check_context(struct asid_info *info, atomic64_t *pasid,
+  refcount_t *pinned)
+{
+   u64 asid, old_active_asid;
+
+   asid = atomic64_read(pasid);
 
/*
 * The memory ordering here is subtle.
@@ -252,24 +284,18 @@ void check_and_switch_context(struct mm_struct *mm)
if (old_active_asid && asid_gen_match(asid, info) &&
atomic64_cmpxchg_relaxed(this_cpu_ptr(info->active),
 old_active_asid, asid))
-   goto switch_mm_fastpath;
-
-   raw_spin_lock_irqsave(&info->lock, flags);
/* Check that our ASID belongs to the current generation. */
-   asid = atomic64_read(&mm->context.id);
if (!asid_gen_match(asid, info)) {
-   asid = new_context(info, &mm->context.id, &mm->context.pinned);
-   atomic64_set(&mm->context.id, asid);
-   }
+   return;
 
-   cpu = smp_processor_id();
-   if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
-   local_flush_tlb_all();
+   asid_new_context(info, pasid, pinned);
+}
 
-   atomic64_set(&active_asid(info, cpu), asid);
-   raw_spin_unlock_irqrestore(&info->lock, flags);
+void check_and_switch_context(struct mm_struct *mm)
+{
+   if (system_supports_cnp())
+   cpu_set_reserved_ttbr0();
 
-switch_mm_fastpath:
+   asid_check_context(&asid_info, &mm->context.id,
+  &mm->context.pinned);
 
arm64_apply_bp_hardening();
 
-- 
2.17.1



[PATCH v4 08/16] arm64/mm: Split asid_inits in 2 parts

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

Move the common initialization of the ASID allocator out into a separate
function.

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
v3-->v4
  -dropped asid_per_ctxt and added pinned asid map init.
---
 arch/arm64/mm/context.c | 44 +++--
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 8af54e06f5bc..041c3c5e0216 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -412,26 +412,50 @@ static int asids_update_limit(void)
 }
 arch_initcall(asids_update_limit);
 
-static int asids_init(void)
+/*
+ * Initialize the ASID allocator
+ *
+ * @info: Pointer to the asid allocator structure
+ * @bits: Number of ASIDs available
+ * @pinned: Support for Pinned ASIDs
+ */
+static int asid_allocator_init(struct asid_info *info, u32 bits, bool pinned)
 {
-   struct asid_info *info = &asid_info;
+   info->bits = bits;
 
-   info->bits = get_cpu_asid_bits();
+   /*
+* Expect allocation after rollover to fail if we don't have at least
+* one more ASID than CPUs. ASID #0 is always reserved.
+*/
+   WARN_ON(NUM_CTXT_ASIDS(info) - 1 <= num_possible_cpus());
atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
sizeof(*info->map), GFP_KERNEL);
if (!info->map)
-   panic("Failed to allocate bitmap for %lu ASIDs\n",
- NUM_CTXT_ASIDS(info));
+   return -ENOMEM;
 
info->map_idx = 1;
-   info->active = &active_asids;
-   info->reserved = &reserved_asids;
raw_spin_lock_init(&info->lock);
 
-   info->pinned_map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
-  sizeof(*info->pinned_map), GFP_KERNEL);
-   info->nr_pinned_asids = 0;
+   if (pinned) {
+   info->pinned_map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
+  sizeof(*info->pinned_map), 
GFP_KERNEL);
+   info->nr_pinned_asids = 0;
+   }
+
+   return 0;
+}
+
+static int asids_init(void)
+{
+   struct asid_info *info = &asid_info;
+
+   if (asid_allocator_init(info, get_cpu_asid_bits(), true))
+   panic("Unable to initialize ASID allocator for %lu ASIDs\n",
+ NUM_CTXT_ASIDS(info));
+
+   info->active = &active_asids;
+   info->reserved = &reserved_asids;
 
/*
 * We cannot call set_reserved_asid_bits() here because CPU
-- 
2.17.1



[PATCH v4 10/16] arm64/mm: Split the arm64_mm_context_get/put

2021-04-14 Thread Shameer Kolothum
Keep only the mm specific part in arm64_mm_context_get/put
and move the rest to generic functions.

Signed-off-by: Shameer Kolothum 
---
 arch/arm64/mm/context.c | 53 +++--
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 40ef013c90c3..901472a57b5d 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -307,20 +307,21 @@ void check_and_switch_context(struct mm_struct *mm)
cpu_switch_mm(mm->pgd, mm);
 }
 
-unsigned long arm64_mm_context_get(struct mm_struct *mm)
+static unsigned long asid_context_pinned_get(struct asid_info *info,
+atomic64_t *pasid,
+refcount_t *pinned)
 {
unsigned long flags;
u64 asid;
-   struct asid_info *info = &asid_info;
 
if (!info->pinned_map)
return 0;
 
raw_spin_lock_irqsave(&info->lock, flags);
 
-   asid = atomic64_read(&mm->context.id);
+   asid = atomic64_read(pasid);
 
-   if (refcount_inc_not_zero(&mm->context.pinned))
+   if (refcount_inc_not_zero(pinned))
goto out_unlock;
 
if (info->nr_pinned_asids >= info->max_pinned_asids) {
@@ -333,45 +334,61 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
 * We went through one or more rollover since that ASID was
 * used. Ensure that it is still valid, or generate a new one.
 */
-   asid = new_context(info, &mm->context.id, &mm->context.pinned);
-   atomic64_set(&mm->context.id, asid);
+   asid = new_context(info, pasid, pinned);
+   atomic64_set(pasid, asid);
}
 
info->nr_pinned_asids++;
__set_bit(asid2idx(info, asid), info->pinned_map);
-   refcount_set(&mm->context.pinned, 1);
+   refcount_set(pinned, 1);
 
 out_unlock:
raw_spin_unlock_irqrestore(&info->lock, flags);
-
asid &= ~ASID_MASK(info);
-
-   /* Set the equivalent of USER_ASID_BIT */
-   if (asid && arm64_kernel_unmapped_at_el0())
-   asid |= 1;
-
return asid;
 }
-EXPORT_SYMBOL_GPL(arm64_mm_context_get);
 
-void arm64_mm_context_put(struct mm_struct *mm)
+static void asid_context_pinned_put(struct asid_info *info, atomic64_t *pasid,
+   refcount_t *pinned)
 {
unsigned long flags;
-   struct asid_info *info = &asid_info;
-   u64 asid = atomic64_read(&mm->context.id);
+   u64 asid = atomic64_read(pasid);
 
if (!info->pinned_map)
return;
 
raw_spin_lock_irqsave(&info->lock, flags);
 
-   if (refcount_dec_and_test(&mm->context.pinned)) {
+   if (refcount_dec_and_test(pinned)) {
__clear_bit(asid2idx(info, asid), info->pinned_map);
info->nr_pinned_asids--;
}
 
raw_spin_unlock_irqrestore(&info->lock, flags);
 }
+
+unsigned long arm64_mm_context_get(struct mm_struct *mm)
+{
+   u64 asid;
+   struct asid_info *info = &asid_info;
+
+   asid = asid_context_pinned_get(info, &mm->context.id,
+  &mm->context.pinned);
+
+   /* Set the equivalent of USER_ASID_BIT */
+   if (asid && arm64_kernel_unmapped_at_el0())
+   asid |= 1;
+
+   return asid;
+}
+EXPORT_SYMBOL_GPL(arm64_mm_context_get);
+
+void arm64_mm_context_put(struct mm_struct *mm)
+{
+   struct asid_info *info = &asid_info;
+
+   asid_context_pinned_put(info, &mm->context.id, &mm->context.pinned);
+}
 EXPORT_SYMBOL_GPL(arm64_mm_context_put);
 
 /* Errata workaround post TTBRx_EL1 update. */
-- 
2.17.1



[PATCH v4 06/16] arm64/mm: Introduce NUM_CTXT_ASIDS

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

At the moment ASID_FIRST_VERSION is used to determine the number of ASIDs
supported. As we are going to move the ASID allocator to a separate file,
it would be better to use a different name for external users.

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
v3-->v4
 -Dropped patch #6, but retained the name NUM_CTXT_ASIDS.

---
 arch/arm64/mm/context.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 628304e0d3b1..0f11d7c7f6a3 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -41,9 +41,9 @@ static unsigned long nr_pinned_asids;
 static unsigned long *pinned_asid_map;
 
 #define ASID_MASK(info)(~GENMASK((info)->bits - 1, 0))
-#define ASID_FIRST_VERSION(info)   (1UL << (info)->bits)
+#define NUM_CTXT_ASIDS(info)   (1UL << ((info)->bits))
+#define ASID_FIRST_VERSION(info)   NUM_CTXT_ASIDS(info)
 
-#define NUM_USER_ASIDS(info)   ASID_FIRST_VERSION(info)
 #define asid2idx(info, asid)   ((asid) & ~ASID_MASK(info))
 #define idx2asid(info, idx)asid2idx(info, idx)
 
@@ -87,7 +87,7 @@ void verify_cpu_asid_bits(void)
 
 static void set_kpti_asid_bits(struct asid_info *info, unsigned long *map)
 {
-   unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS(info)) * 
sizeof(unsigned long);
+   unsigned int len = BITS_TO_LONGS(NUM_CTXT_ASIDS(info)) * 
sizeof(unsigned long);
/*
 * In case of KPTI kernel/user ASIDs are allocated in
 * pairs, the bottom bit distinguishes the two: if it
@@ -100,11 +100,11 @@ static void set_kpti_asid_bits(struct asid_info *info, 
unsigned long *map)
 static void set_reserved_asid_bits(struct asid_info *info)
 {
if (pinned_asid_map)
-   bitmap_copy(info->map, pinned_asid_map, NUM_USER_ASIDS(info));
+   bitmap_copy(info->map, pinned_asid_map, NUM_CTXT_ASIDS(info));
else if (arm64_kernel_unmapped_at_el0())
set_kpti_asid_bits(info, info->map);
else
-   bitmap_clear(info->map, 0, NUM_USER_ASIDS(info));
+   bitmap_clear(info->map, 0, NUM_CTXT_ASIDS(info));
 }
 
 #define asid_gen_match(asid, info) \
@@ -204,8 +204,8 @@ static u64 new_context(struct asid_info *info, atomic64_t 
*pasid,
 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
 * pairs.
 */
-   asid = find_next_zero_bit(info->map, NUM_USER_ASIDS(info), 
info->map_idx);
-   if (asid != NUM_USER_ASIDS(info))
+   asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 
info->map_idx);
+   if (asid != NUM_CTXT_ASIDS(info))
goto set_asid;
 
/* We're out of ASIDs, so increment the global generation count */
@@ -214,7 +214,7 @@ static u64 new_context(struct asid_info *info, atomic64_t 
*pasid,
flush_context(info);
 
/* We have more ASIDs than CPUs, so this will always succeed */
-   asid = find_next_zero_bit(info->map, NUM_USER_ASIDS(info), 1);
+   asid = find_next_zero_bit(info->map, NUM_CTXT_ASIDS(info), 1);
 
 set_asid:
__set_bit(asid, info->map);
@@ -387,7 +387,7 @@ void cpu_do_switch_mm(phys_addr_t pgd_phys, struct 
mm_struct *mm)
 static int asids_update_limit(void)
 {
struct asid_info *info = &asid_info;
-   unsigned long num_available_asids = NUM_USER_ASIDS(info);
+   unsigned long num_available_asids = NUM_CTXT_ASIDS(info);
 
if (arm64_kernel_unmapped_at_el0()) {
num_available_asids /= 2;
@@ -418,18 +418,18 @@ static int asids_init(void)
 
info->bits = get_cpu_asid_bits();
atomic64_set(&info->generation, ASID_FIRST_VERSION(info));
-   info->map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS(info)),
+   info->map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
sizeof(*info->map), GFP_KERNEL);
if (!info->map)
panic("Failed to allocate bitmap for %lu ASIDs\n",
- NUM_USER_ASIDS(info));
+ NUM_CTXT_ASIDS(info));
 
info->map_idx = 1;
info->active = &active_asids;
info->reserved = &reserved_asids;
raw_spin_lock_init(>lock);
 
-   pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS(info)),
+   pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
  sizeof(*pinned_asid_map), GFP_KERNEL);
nr_pinned_asids = 0;
 
-- 
2.17.1



[PATCH v4 07/16] arm64/mm: Move Pinned ASID related variables to asid_info

2021-04-14 Thread Shameer Kolothum
The Pinned ASID variables hold information for a given ASID
allocator. So move them to the structure asid_info.

Signed-off-by: Shameer Kolothum 
---
 arch/arm64/mm/context.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 0f11d7c7f6a3..8af54e06f5bc 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -28,6 +28,10 @@ static struct asid_info
raw_spinlock_t  lock;
/* Which CPU requires context flush on next call */
cpumask_t   flush_pending;
+   /* Pinned ASIDs info */
+   unsigned long   *pinned_map;
+   unsigned long   max_pinned_asids;
+   unsigned long   nr_pinned_asids;
 } asid_info;
 
 #define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
@@ -36,10 +40,6 @@ static struct asid_info
 static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
 
-static unsigned long max_pinned_asids;
-static unsigned long nr_pinned_asids;
-static unsigned long *pinned_asid_map;
-
 #define ASID_MASK(info)(~GENMASK((info)->bits - 1, 0))
 #define NUM_CTXT_ASIDS(info)   (1UL << ((info)->bits))
 #define ASID_FIRST_VERSION(info)   NUM_CTXT_ASIDS(info)
@@ -99,8 +99,8 @@ static void set_kpti_asid_bits(struct asid_info *info, 
unsigned long *map)
 
 static void set_reserved_asid_bits(struct asid_info *info)
 {
-   if (pinned_asid_map)
-   bitmap_copy(info->map, pinned_asid_map, NUM_CTXT_ASIDS(info));
+   if (info->pinned_map)
+   bitmap_copy(info->map, info->pinned_map, NUM_CTXT_ASIDS(info));
else if (arm64_kernel_unmapped_at_el0())
set_kpti_asid_bits(info, info->map);
else
@@ -287,7 +287,7 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
u64 asid;
struct asid_info *info = &asid_info;
 
-   if (!pinned_asid_map)
+   if (!info->pinned_map)
return 0;
 
raw_spin_lock_irqsave(&info->lock, flags);
@@ -297,7 +297,7 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
if (refcount_inc_not_zero(&mm->context.pinned))
goto out_unlock;
 
-   if (nr_pinned_asids >= max_pinned_asids) {
+   if (info->nr_pinned_asids >= info->max_pinned_asids) {
asid = 0;
goto out_unlock;
}
@@ -311,8 +311,8 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
atomic64_set(&mm->context.id, asid);
}
 
-   nr_pinned_asids++;
-   __set_bit(asid2idx(info, asid), pinned_asid_map);
+   info->nr_pinned_asids++;
+   __set_bit(asid2idx(info, asid), info->pinned_map);
refcount_set(&mm->context.pinned, 1);
 
 out_unlock:
@@ -334,14 +334,14 @@ void arm64_mm_context_put(struct mm_struct *mm)
struct asid_info *info = &asid_info;
u64 asid = atomic64_read(&mm->context.id);
 
-   if (!pinned_asid_map)
+   if (!info->pinned_map)
return;
 
raw_spin_lock_irqsave(&info->lock, flags);
 
if (refcount_dec_and_test(&mm->context.pinned)) {
-   __clear_bit(asid2idx(info, asid), pinned_asid_map);
-   nr_pinned_asids--;
+   __clear_bit(asid2idx(info, asid), info->pinned_map);
+   info->nr_pinned_asids--;
}
 
raw_spin_unlock_irqrestore(&info->lock, flags);
@@ -391,8 +391,8 @@ static int asids_update_limit(void)
 
if (arm64_kernel_unmapped_at_el0()) {
num_available_asids /= 2;
-   if (pinned_asid_map)
-   set_kpti_asid_bits(info, pinned_asid_map);
+   if (info->pinned_map)
+   set_kpti_asid_bits(info, info->pinned_map);
}
/*
 * Expect allocation after rollover to fail if we don't have at least
@@ -407,7 +407,7 @@ static int asids_update_limit(void)
 * even if all CPUs have a reserved ASID and the maximum number of ASIDs
 * are pinned, there still is at least one empty slot in the ASID map.
 */
-   max_pinned_asids = num_available_asids - num_possible_cpus() - 2;
+   info->max_pinned_asids = num_available_asids - num_possible_cpus() - 2;
return 0;
 }
 arch_initcall(asids_update_limit);
@@ -429,9 +429,9 @@ static int asids_init(void)
info->reserved = &reserved_asids;
raw_spin_lock_init(&info->lock);
 
-   pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
- sizeof(*pinned_asid_map), GFP_KERNEL);
-   nr_pinned_asids = 0;
+   info->pinned_map = kcalloc(BITS_TO_LONGS(NUM_CTXT_ASIDS(info)),
+  sizeof(*info->pinned_map), GFP_KERNEL);
+   info->nr_pinned_asids = 0;
 
/*
 * We cannot call set_reserved_asid_bits() here because CPU
-- 
2.17.1



[PATCH v4 05/16] arm64/mm: Remove dependency on MM in new_context

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

The function new_context will be part of a generic ASID allocator. At
the moment, the MM structure is used to fetch the ASID and
pinned refcount.

To remove the dependency on MM, it is possible to just pass a pointer to
the current ASID and pinned refcount. Also please note that 'pinned' may
be NULL if the user doesn't require pinned ASID support.
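So a caller with no pinned-ASID notion can simply pass NULL; a minimal
sketch, assuming an atomic64_t 'ctx_id' holding the context's ASID
(illustrative only, and called with info->lock held as usual):

	asid = new_context(info, &ctx_id, NULL);
	atomic64_set(&ctx_id, asid);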


Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
v3-->v4:
  Changes related to Pinned ASID refcount.

---
 arch/arm64/mm/context.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 139ebc161acb..628304e0d3b1 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -165,9 +165,10 @@ static bool check_update_reserved_asid(struct asid_info 
*info, u64 asid,
return hit;
 }
 
-static u64 new_context(struct asid_info *info, struct mm_struct *mm)
+static u64 new_context(struct asid_info *info, atomic64_t *pasid,
+  refcount_t *pinned)
 {
-   u64 asid = atomic64_read(&mm->context.id);
+   u64 asid = atomic64_read(pasid);
u64 generation = atomic64_read(&info->generation);
 
if (asid != 0) {
@@ -185,7 +186,7 @@ static u64 new_context(struct asid_info *info, struct 
mm_struct *mm)
 * takes priority, because even if it is also pinned, we need to
 * update the generation into the reserved_asids.
 */
-   if (refcount_read(&mm->context.pinned))
+   if (pinned && refcount_read(pinned))
return newasid;
 
/*
@@ -257,7 +258,7 @@ void check_and_switch_context(struct mm_struct *mm)
/* Check that our ASID belongs to the current generation. */
asid = atomic64_read(&mm->context.id);
if (!asid_gen_match(asid, info)) {
-   asid = new_context(info, mm);
+   asid = new_context(info, &mm->context.id, &mm->context.pinned);
atomic64_set(&mm->context.id, asid);
}
 
@@ -306,7 +307,7 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
 * We went through one or more rollover since that ASID was
 * used. Ensure that it is still valid, or generate a new one.
 */
-   asid = new_context(info, mm);
+   asid = new_context(info, &mm->context.id, &mm->context.pinned);
atomic64_set(&mm->context.id, asid);
}
 
-- 
2.17.1



[PATCH v4 04/16] arm64/mm: Move the variable lock and tlb_flush_pending to asid_info

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

The variables lock and tlb_flush_pending hold information for a given
ASID allocator. So move them to the asid_info structure.

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
 arch/arm64/mm/context.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 1fd40a42955c..139ebc161acb 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -17,8 +17,6 @@
 #include 
 #include 
 
-static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
-
 static struct asid_info
 {
atomic64_t  generation;
@@ -27,6 +25,9 @@ static struct asid_info
atomic64_t __percpu *active;
u64 __percpu*reserved;
u32 bits;
+   raw_spinlock_t  lock;
+   /* Which CPU requires context flush on next call */
+   cpumask_t   flush_pending;
 } asid_info;
 
 #define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
@@ -34,7 +35,6 @@ static struct asid_info
 
 static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
-static cpumask_t tlb_flush_pending;
 
 static unsigned long max_pinned_asids;
 static unsigned long nr_pinned_asids;
@@ -137,7 +137,7 @@ static void flush_context(struct asid_info *info)
 * Queue a TLB invalidation for each CPU to perform on next
 * context-switch
 */
-   cpumask_setall(&tlb_flush_pending);
+   cpumask_setall(&info->flush_pending);
 }
 
 static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
@@ -253,7 +253,7 @@ void check_and_switch_context(struct mm_struct *mm)
 old_active_asid, asid))
goto switch_mm_fastpath;
 
-   raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+   raw_spin_lock_irqsave(&info->lock, flags);
/* Check that our ASID belongs to the current generation. */
asid = atomic64_read(&mm->context.id);
if (!asid_gen_match(asid, info)) {
@@ -262,11 +262,11 @@ void check_and_switch_context(struct mm_struct *mm)
}
 
cpu = smp_processor_id();
-   if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
+   if (cpumask_test_and_clear_cpu(cpu, &info->flush_pending))
local_flush_tlb_all();
 
atomic64_set(&active_asid(info, cpu), asid);
-   raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+   raw_spin_unlock_irqrestore(&info->lock, flags);
 
 switch_mm_fastpath:
 
@@ -289,7 +289,7 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
if (!pinned_asid_map)
return 0;
 
-   raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+   raw_spin_lock_irqsave(&info->lock, flags);
 
asid = atomic64_read(&mm->context.id);
 
@@ -315,7 +315,7 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
refcount_set(&mm->context.pinned, 1);
 
 out_unlock:
-   raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+   raw_spin_unlock_irqrestore(&info->lock, flags);
 
asid &= ~ASID_MASK(info);
 
@@ -336,14 +336,14 @@ void arm64_mm_context_put(struct mm_struct *mm)
if (!pinned_asid_map)
return;
 
-   raw_spin_lock_irqsave(&cpu_asid_lock, flags);
+   raw_spin_lock_irqsave(&info->lock, flags);
 
if (refcount_dec_and_test(&mm->context.pinned)) {
__clear_bit(asid2idx(info, asid), pinned_asid_map);
nr_pinned_asids--;
}
 
-   raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
+   raw_spin_unlock_irqrestore(&info->lock, flags);
 }
 EXPORT_SYMBOL_GPL(arm64_mm_context_put);
 
@@ -426,6 +426,7 @@ static int asids_init(void)
info->map_idx = 1;
info->active = &active_asids;
info->reserved = &reserved_asids;
+   raw_spin_lock_init(&info->lock);
 
pinned_asid_map = kcalloc(BITS_TO_LONGS(NUM_USER_ASIDS(info)),
  sizeof(*pinned_asid_map), GFP_KERNEL);
-- 
2.17.1



[PATCH v4 02/16] arm64/mm: Move active_asids and reserved_asids to asid_info

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

The variables active_asids and reserved_asids hold information for a
given ASID allocator. So move them to the structure asid_info.

At the same time, introduce wrappers to access the active and reserved
ASIDs to make the code clearer.
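With the wrappers, a typical call site changes from an open-coded per-CPU
access to the wrapper form (taken from the diff below):

	/* before */
	asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);

	/* after */
	asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);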


Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
v3-->v4
  keep the this_cpu_ptr in fastpath. See c4885bbb3afe("arm64/mm: save
memory access in check_and_switch_context() fast switch path")

---
 arch/arm64/mm/context.c | 32 
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 783f8bdb91ee..42e011094571 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -25,8 +25,13 @@ static struct asid_info
atomic64_t  generation;
unsigned long   *map;
unsigned intmap_idx;
+   atomic64_t __percpu *active;
+   u64 __percpu*reserved;
 } asid_info;
 
+#define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
+#define reserved_asid(info, cpu) (*per_cpu_ptr((info)->reserved, cpu))
+
 static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
 static cpumask_t tlb_flush_pending;
@@ -114,7 +119,7 @@ static void flush_context(struct asid_info *info)
set_reserved_asid_bits(info);
 
for_each_possible_cpu(i) {
-   asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);
+   asid = atomic64_xchg_relaxed(&active_asid(info, i), 0);
/*
 * If this CPU has already been through a
 * rollover, but hasn't run another task in
@@ -123,9 +128,9 @@ static void flush_context(struct asid_info *info)
 * the process it is still running.
 */
if (asid == 0)
-   asid = per_cpu(reserved_asids, i);
+   asid = reserved_asid(info, i);
__set_bit(asid2idx(asid), info->map);
-   per_cpu(reserved_asids, i) = asid;
+   reserved_asid(info, i) = asid;
}
 
/*
@@ -135,7 +140,8 @@ static void flush_context(struct asid_info *info)
cpumask_setall(&tlb_flush_pending);
 }
 
-static bool check_update_reserved_asid(u64 asid, u64 newasid)
+static bool check_update_reserved_asid(struct asid_info *info, u64 asid,
+  u64 newasid)
 {
int cpu;
bool hit = false;
@@ -150,9 +156,9 @@ static bool check_update_reserved_asid(u64 asid, u64 
newasid)
 * generation.
 */
for_each_possible_cpu(cpu) {
-   if (per_cpu(reserved_asids, cpu) == asid) {
+   if (reserved_asid(info, cpu) == asid) {
hit = true;
-   per_cpu(reserved_asids, cpu) = newasid;
+   reserved_asid(info, cpu) = newasid;
}
}
 
@@ -171,7 +177,7 @@ static u64 new_context(struct asid_info *info, struct 
mm_struct *mm)
 * If our current ASID was active during a rollover, we
 * can continue to use it and this was just a false alarm.
 */
-   if (check_update_reserved_asid(asid, newasid))
+   if (check_update_reserved_asid(info, asid, newasid))
return newasid;
 
/*
@@ -229,8 +235,8 @@ void check_and_switch_context(struct mm_struct *mm)
 
/*
 * The memory ordering here is subtle.
-* If our active_asids is non-zero and the ASID matches the current
-* generation, then we update the active_asids entry with a relaxed
+* If our active_asid is non-zero and the ASID matches the current
+* generation, then we update the active_asid entry with a relaxed
 * cmpxchg. Racing with a concurrent rollover means that either:
 *
 * - We get a zero back from the cmpxchg and end up waiting on the
@@ -241,9 +247,9 @@ void check_and_switch_context(struct mm_struct *mm)
 *   relaxed xchg in flush_context will treat us as reserved
 *   because atomic RmWs are totally ordered for a given location.
 */
-   old_active_asid = atomic64_read(this_cpu_ptr(&active_asids));
+   old_active_asid = atomic64_read(this_cpu_ptr(info->active));
if (old_active_asid && asid_gen_match(asid, info) &&
-   atomic64_cmpxchg_relaxed(this_cpu_ptr(&active_asids),
+   atomic64_cmpxchg_relaxed(this_cpu_ptr(info->active),
 old_active_asid, asid))
goto switch_mm_fastpath;
 
@@ -259,7 +265,7 @@ void check_and_switch_context(struct mm_struct *mm)
if (cpumask_test_and_clear_cpu(cpu, &tlb_flush_pending))
local_flush_tlb_all();
 
-   atomic64_set(this_cpu_ptr(&active_asids), asid);
+   atomic64_set(&active_asid(info, cpu), asid);
raw_spin_unlock_irqresto

[PATCH v4 03/16] arm64/mm: Move bits to asid_info

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

The variable bits holds information for a given ASID allocator. So move
it to the asid_info structure.

Because most of the macros relied on bits, they now take an
extra parameter that is a pointer to the asid_info structure.

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
 arch/arm64/mm/context.c | 70 +
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 42e011094571..1fd40a42955c 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -17,7 +17,6 @@
 #include 
 #include 
 
-static u32 asid_bits;
 static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
 
 static struct asid_info
@@ -27,6 +26,7 @@ static struct asid_info
unsigned intmap_idx;
atomic64_t __percpu *active;
u64 __percpu*reserved;
+   u32 bits;
 } asid_info;
 
 #define active_asid(info, cpu)  (*per_cpu_ptr((info)->active, cpu))
@@ -40,12 +40,12 @@ static unsigned long max_pinned_asids;
 static unsigned long nr_pinned_asids;
 static unsigned long *pinned_asid_map;
 
-#define ASID_MASK  (~GENMASK(asid_bits - 1, 0))
-#define ASID_FIRST_VERSION (1UL << asid_bits)
+#define ASID_MASK(info)(~GENMASK((info)->bits - 1, 0))
+#define ASID_FIRST_VERSION(info)   (1UL << (info)->bits)
 
-#define NUM_USER_ASIDS ASID_FIRST_VERSION
-#define asid2idx(asid) ((asid) & ~ASID_MASK)
-#define idx2asid(idx)  asid2idx(idx)
+#define NUM_USER_ASIDS(info)   ASID_FIRST_VERSION(info)
+#define asid2idx(info, asid)   ((asid) & ~ASID_MASK(info))
+#define idx2asid(info, idx)asid2idx(info, idx)
 
 /* Get the ASIDBits supported by the current CPU */
 static u32 get_cpu_asid_bits(void)
@@ -74,20 +74,20 @@ void verify_cpu_asid_bits(void)
 {
u32 asid = get_cpu_asid_bits();
 
-   if (asid < asid_bits) {
+   if (asid < asid_info.bits) {
/*
 * We cannot decrease the ASID size at runtime, so panic if we 
support
 * fewer ASID bits than the boot CPU.
 */
pr_crit("CPU%d: smaller ASID size(%u) than boot CPU (%u)\n",
-   smp_processor_id(), asid, asid_bits);
+   smp_processor_id(), asid, asid_info.bits);
cpu_panic_kernel();
}
 }
 
-static void set_kpti_asid_bits(unsigned long *map)
+static void set_kpti_asid_bits(struct asid_info *info, unsigned long *map)
 {
-   unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS) * sizeof(unsigned 
long);
+   unsigned int len = BITS_TO_LONGS(NUM_USER_ASIDS(info)) * 
sizeof(unsigned long);
/*
 * In case of KPTI kernel/user ASIDs are allocated in
 * pairs, the bottom bit distinguishes the two: if it
@@ -100,15 +100,15 @@ static void set_kpti_asid_bits(unsigned long *map)
 static void set_reserved_asid_bits(struct asid_info *info)
 {
if (pinned_asid_map)
-   bitmap_copy(info->map, pinned_asid_map, NUM_USER_ASIDS);
+   bitmap_copy(info->map, pinned_asid_map, NUM_USER_ASIDS(info));
else if (arm64_kernel_unmapped_at_el0())
-   set_kpti_asid_bits(info->map);
+   set_kpti_asid_bits(info, info->map);
else
-   bitmap_clear(info->map, 0, NUM_USER_ASIDS);
+   bitmap_clear(info->map, 0, NUM_USER_ASIDS(info));
 }
 
 #define asid_gen_match(asid, info) \
-   (!(((asid) ^ atomic64_read(&(info)->generation)) >> asid_bits))
+   (!(((asid) ^ atomic64_read(&(info)->generation)) >> info->bits))
 
 static void flush_context(struct asid_info *info)
 {
@@ -129,7 +129,7 @@ static void flush_context(struct asid_info *info)
 */
if (asid == 0)
asid = reserved_asid(info, i);
-   __set_bit(asid2idx(asid), info->map);
+   __set_bit(asid2idx(info, asid), info->map);
reserved_asid(info, i) = asid;
}
 
@@ -171,7 +171,7 @@ static u64 new_context(struct asid_info *info, struct 
mm_struct *mm)
u64 generation = atomic64_read(&info->generation);
 
if (asid != 0) {
-   u64 newasid = generation | (asid & ~ASID_MASK);
+   u64 newasid = generation | (asid & ~ASID_MASK(info));
 
/*
 * If our current ASID was active during a rollover, we
@@ -192,7 +192,7 @@ static u64 new_context(struct asid_info *info, struct 
mm_struct *mm)
 * We had a valid ASID in a previous life, so try to re-use
 * it if possible.
 */
-   if (!__test_and_set_bit(asid2idx(asid), info->map))
+   if (!__test_and_set_bit(asid2idx(info, asid), info->map))

[PATCH v4 01/16] arm64/mm: Introduce asid_info structure and move asid_generation/asid_map to it

2021-04-14 Thread Shameer Kolothum
From: Julien Grall 

In an attempt to make the ASID allocator generic, create a new structure
asid_info to store all the information necessary for the allocator.

For now, move the variables asid_generation, asid_map, cur_idx to the
new structure asid_info. Follow-up patches will move more variables.

Note that, to avoid more renaming afterwards, a local variable 'info' has
been created; it is a pointer to the ASID allocator structure.

Signed-off-by: Julien Grall 
Signed-off-by: Shameer Kolothum 
---
v3-->v4:
  Move cur_idx into asid_info.
---
 arch/arm64/mm/context.c | 71 +++--
 1 file changed, 40 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index 001737a8f309..783f8bdb91ee 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -20,8 +20,12 @@
 static u32 asid_bits;
 static DEFINE_RAW_SPINLOCK(cpu_asid_lock);
 
-static atomic64_t asid_generation;
-static unsigned long *asid_map;
+static struct asid_info
+{
+   atomic64_t  generation;
+   unsigned long   *map;
+   unsigned intmap_idx;
+} asid_info;
 
 static DEFINE_PER_CPU(atomic64_t, active_asids);
 static DEFINE_PER_CPU(u64, reserved_asids);
@@ -88,26 +92,26 @@ static void set_kpti_asid_bits(unsigned long *map)
memset(map, 0xaa, len);
 }
 
-static void set_reserved_asid_bits(void)
+static void set_reserved_asid_bits(struct asid_info *info)
 {
if (pinned_asid_map)
-   bitmap_copy(asid_map, pinned_asid_map, NUM_USER_ASIDS);
+   bitmap_copy(info->map, pinned_asid_map, NUM_USER_ASIDS);
else if (arm64_kernel_unmapped_at_el0())
-   set_kpti_asid_bits(asid_map);
+   set_kpti_asid_bits(info->map);
else
-   bitmap_clear(asid_map, 0, NUM_USER_ASIDS);
+   bitmap_clear(info->map, 0, NUM_USER_ASIDS);
 }
 
-#define asid_gen_match(asid) \
-   (!(((asid) ^ atomic64_read(&asid_generation)) >> asid_bits))
+#define asid_gen_match(asid, info) \
+   (!(((asid) ^ atomic64_read(&(info)->generation)) >> asid_bits))
 
-static void flush_context(void)
+static void flush_context(struct asid_info *info)
 {
int i;
u64 asid;
 
/* Update the list of reserved ASIDs and the ASID bitmap. */
-   set_reserved_asid_bits();
+   set_reserved_asid_bits(info);
 
for_each_possible_cpu(i) {
asid = atomic64_xchg_relaxed(&per_cpu(active_asids, i), 0);
@@ -120,7 +124,7 @@ static void flush_context(void)
 */
if (asid == 0)
asid = per_cpu(reserved_asids, i);
-   __set_bit(asid2idx(asid), asid_map);
+   __set_bit(asid2idx(asid), info->map);
per_cpu(reserved_asids, i) = asid;
}
 
@@ -155,11 +159,10 @@ static bool check_update_reserved_asid(u64 asid, u64 
newasid)
return hit;
 }
 
-static u64 new_context(struct mm_struct *mm)
+static u64 new_context(struct asid_info *info, struct mm_struct *mm)
 {
-   static u32 cur_idx = 1;
u64 asid = atomic64_read(&mm->context.id);
-   u64 generation = atomic64_read(&asid_generation);
+   u64 generation = atomic64_read(&info->generation);
 
if (asid != 0) {
u64 newasid = generation | (asid & ~ASID_MASK);
@@ -183,7 +186,7 @@ static u64 new_context(struct mm_struct *mm)
 * We had a valid ASID in a previous life, so try to re-use
 * it if possible.
 */
-   if (!__test_and_set_bit(asid2idx(asid), asid_map))
+   if (!__test_and_set_bit(asid2idx(asid), info->map))
return newasid;
}
 
@@ -194,21 +197,21 @@ static u64 new_context(struct mm_struct *mm)
 * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd
 * pairs.
 */
-   asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx);
+   asid = find_next_zero_bit(info->map, NUM_USER_ASIDS, info->map_idx);
if (asid != NUM_USER_ASIDS)
goto set_asid;
 
/* We're out of ASIDs, so increment the global generation count */
generation = atomic64_add_return_relaxed(ASID_FIRST_VERSION,
- &asid_generation);
-   flush_context();
+ &info->generation);
+   flush_context(info);
 
/* We have more ASIDs than CPUs, so this will always succeed */
-   asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, 1);
+   asid = find_next_zero_bit(info->map, NUM_USER_ASIDS, 1);
 
 set_asid:
-   __set_bit(asid, asid_map);
-   cur_idx = asid;
+   __set_bit(asid, info->map);
+   info->map_idx = asid;
return idx2asid(asid) | generation;
 }
 
@@ -217,6 +220,7 @@ void check_and_switch_context(struct mm_struct *mm)
unsigned long flags;
unsigned int cpu;

[PATCH v4 00/16] kvm/arm: Align the VMID allocation with the arm64 ASID one

2021-04-14 Thread Shameer Kolothum
Hi,

This is an attempt to revive this series originally posted by
Julien Grall[1]. The main motivation to work on this now is the
requirement to have pinned KVM VMIDs, and the RFC discussion for the
same basically suggested[2] having a common/better VMID allocator for
KVM, which this series provides.
 
Major Changes from v3:

-Changes related to Pinned ASID support.
-Changes to take care of the KPTI-related bits reservation.
-Dropped support for 32-bit KVM.
-Rebase to 5.12-rc7

Individual patches have change history for any major changes
from v3.

Tests were performed on a HiSilicon D06 platform and so far no
regressions have been observed.

For ASID allocation,

Avg. of 10 runs (hackbench -s 512 -l 200 -g 300 -f 25 -P):
5.12-rc7: Time: 18.8119
5.12-rc7+v4: Time: 18.459

~1.8% improvement.

For KVM VMID,

The measurement was made with maxcpus set to 8 and with the
number of VMIDs limited to 4 bits. The test involves running
40 guests with 2 vCPUs concurrently. Each guest will then
execute hackbench 5 times before exiting.

The performance differences between the current algorithm and the
new one are (avg. of 10 runs):
- 1.9% fewer exits from the guest
- 0.7% faster

For complete series, please see,
 https://github.com/hisilicon/kernel-dev/tree/private-v5.12-rc7-asid-v4

Please take a look and let me know your feedback.

Thanks,
Shameer
[1].https://patchwork.kernel.org/project/linux-arm-kernel/cover/20190724162534.7390-1-julien.gr...@arm.com/
[2].https://lore.kernel.org/linux-arm-kernel/20210222155338.26132-6-shameerali.kolothum.th...@huawei.com/T/#mff3129997739e2747172f4a2e81fd66be91ffea4

From V3:

Hi all,

This patch series moves the ASID allocator out into a separate file in order
to re-use it for the VMID. The benefits are:
- CPUs are not forced to exit on a roll-over.
- Context invalidation is now per-CPU rather than
  broadcast.

There is no performance regression on the fast path for ASID allocation.
Actually, on the hackbench measurement (300 hackbench) it was 0.7% faster.

The measurement was made on a Seattle-based SoC (8 CPUs), with the
number of VMIDs limited to 4 bits. The test involves running 40
guests with 2 vCPUs concurrently. Each guest will then execute hackbench
5 times before exiting.

The performance differences (on 5.1-rc1) between the current algorithm and
the new one are:
- 2.5% fewer exits from the guest
- 22.4% more flushes, although they are now local rather than broadcast
- 0.11% faster (just for the record)

The ASID allocator rework to make it generic has been divided into multiple
patches to make the review easier.

A branch with the patch based on 5.3-rc1 can be found:

http://xenbits.xen.org/gitweb/?p=people/julieng/linux-arm.git;a=shortlog;h=refs/heads/vmid-rework/v3

For all the changes see in each patch.

Best regards,

Julien Grall (13):
  arm64/mm: Introduce asid_info structure and move
asid_generation/asid_map to it
  arm64/mm: Move active_asids and reserved_asids to asid_info
  arm64/mm: Move bits to asid_info
  arm64/mm: Move the variable lock and tlb_flush_pending to asid_info
  arm64/mm: Remove dependency on MM in new_context
  arm64/mm: Introduce NUM_CTXT_ASIDS
  arm64/mm: Split asid_inits in 2 parts
  arm64/mm: Split the function check_and_switch_context in 3 parts
  arm64/mm: Introduce a callback to flush the local context
  arm64: Move the ASID allocator code in a separate file
  arm64/lib: Add a helper to free memory allocated by the ASID
allocator
  arch/arm64: Introduce a capability to tell whether 16-bit VMID is
available
  kvm/arm: Align the VMID allocation with the arm64 ASID one

Shameer Kolothum (3):
  arm64/mm: Move Pinned ASID related variables to asid_info
  arm64/mm: Split the arm64_mm_context_get/put
  arm64/mm: Introduce a callback to set reserved bits

 arch/arm64/include/asm/cpucaps.h   |   3 +-
 arch/arm64/include/asm/kvm_asm.h   |   4 +-
 arch/arm64/include/asm/kvm_host.h  |   5 +-
 arch/arm64/include/asm/kvm_mmu.h   |   7 +-
 arch/arm64/include/asm/lib_asid.h  |  87 +
 arch/arm64/kernel/cpufeature.c |   9 +
 arch/arm64/kvm/arm.c   | 124 +
 arch/arm64/kvm/hyp/nvhe/hyp-main.c |   6 +-
 arch/arm64/kvm/hyp/nvhe/tlb.c  |  10 +-
 arch/arm64/kvm/hyp/vhe/tlb.c   |  10 +-
 arch/arm64/kvm/mmu.c   |   1 -
 arch/arm64/lib/Makefile|   2 +
 arch/arm64/lib/asid.c  | 264 +++
 arch/arm64/mm/context.c| 283 -
 14 files changed, 469 insertions(+), 346 deletions(-)
 create mode 100644 arch/arm64/include/asm/lib_asid.h
 create mode 100644 arch/arm64/lib/asid.c

-- 
2.17.1



[PATCH v3] iommu: Check dev->iommu in iommu_dev_xxx functions

2021-03-03 Thread Shameer Kolothum
The device iommu probe/attach might have failed, leaving dev->iommu
set to NULL, and device drivers may still invoke these functions,
resulting in a crash in the iommu vendor driver code.

Hence make sure we check for that.

Fixes: a3a195929d40 ("iommu: Add APIs for multiple domains per device")
Signed-off-by: Shameer Kolothum 
---
v2 --> v3
 -Removed iommu_ops from struct dev_iommu.
v1 --> v2:
 -Added iommu_ops to struct dev_iommu based on the discussion with Robin.
 -Rebased against iommu-tree core branch.
---
 drivers/iommu/iommu.c | 24 +++-
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index d0b0a15dba84..e10cfa99057c 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2878,10 +2878,12 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
  */
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-   const struct iommu_ops *ops = dev->bus->iommu_ops;
+   if (dev->iommu && dev->iommu->iommu_dev) {
+   const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
 
-   if (ops && ops->dev_enable_feat)
-   return ops->dev_enable_feat(dev, feat);
+   if (ops->dev_enable_feat)
+   return ops->dev_enable_feat(dev, feat);
+   }
 
return -ENODEV;
 }
@@ -2894,10 +2896,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
  */
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-   const struct iommu_ops *ops = dev->bus->iommu_ops;
+   if (dev->iommu && dev->iommu->iommu_dev) {
+   const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
 
-   if (ops && ops->dev_disable_feat)
-   return ops->dev_disable_feat(dev, feat);
+   if (ops->dev_disable_feat)
+   return ops->dev_disable_feat(dev, feat);
+   }
 
return -EBUSY;
 }
@@ -2905,10 +2909,12 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
 
 bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features 
feat)
 {
-   const struct iommu_ops *ops = dev->bus->iommu_ops;
+   if (dev->iommu && dev->iommu->iommu_dev) {
+   const struct iommu_ops *ops = dev->iommu->iommu_dev->ops;
 
-   if (ops && ops->dev_feat_enabled)
-   return ops->dev_feat_enabled(dev, feat);
+   if (ops->dev_feat_enabled)
+   return ops->dev_feat_enabled(dev, feat);
+   }
 
return false;
 }
-- 
2.17.1



[PATCH v2] iommu: Check dev->iommu in iommu_dev_xxx functions

2021-02-01 Thread Shameer Kolothum
The device iommu probe/attach might have failed, leaving dev->iommu
set to NULL, and device drivers may still invoke these functions,
resulting in a crash in the iommu vendor driver code. Hence make sure
we check that.

Also add iommu_ops to struct dev_iommu and set it if the dev
is successfully associated with an iommu.

Fixes: a3a195929d40 ("iommu: Add APIs for multiple domains per device")
Signed-off-by: Shameer Kolothum 
---
v1 --> v2:
 -Added iommu_ops to struct dev_iommu based on the discussion with Robin.
 -Rebased against iommu-tree core branch.
---
 drivers/iommu/iommu.c | 19 +++
 include/linux/iommu.h |  2 ++
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index fd76e2f579fe..6023d0b7c542 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -217,6 +217,7 @@ static int __iommu_probe_device(struct device *dev, struct 
list_head *group_list
}
 
dev->iommu->iommu_dev = iommu_dev;
+   dev->iommu->ops = iommu_dev->ops;
 
group = iommu_group_get_for_dev(dev);
if (IS_ERR(group)) {
@@ -2865,10 +2866,8 @@ EXPORT_SYMBOL_GPL(iommu_fwspec_add_ids);
  */
 int iommu_dev_enable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-   const struct iommu_ops *ops = dev->bus->iommu_ops;
-
-   if (ops && ops->dev_enable_feat)
-   return ops->dev_enable_feat(dev, feat);
+   if (dev->iommu && dev->iommu->ops->dev_enable_feat)
+   return dev->iommu->ops->dev_enable_feat(dev, feat);
 
return -ENODEV;
 }
@@ -2881,10 +2880,8 @@ EXPORT_SYMBOL_GPL(iommu_dev_enable_feature);
  */
 int iommu_dev_disable_feature(struct device *dev, enum iommu_dev_features feat)
 {
-   const struct iommu_ops *ops = dev->bus->iommu_ops;
-
-   if (ops && ops->dev_disable_feat)
-   return ops->dev_disable_feat(dev, feat);
+   if (dev->iommu && dev->iommu->ops->dev_disable_feat)
+   return dev->iommu->ops->dev_disable_feat(dev, feat);
 
return -EBUSY;
 }
@@ -2892,10 +2889,8 @@ EXPORT_SYMBOL_GPL(iommu_dev_disable_feature);
 
 bool iommu_dev_feature_enabled(struct device *dev, enum iommu_dev_features 
feat)
 {
-   const struct iommu_ops *ops = dev->bus->iommu_ops;
-
-   if (ops && ops->dev_feat_enabled)
-   return ops->dev_feat_enabled(dev, feat);
+   if (dev->iommu && dev->iommu->ops->dev_feat_enabled)
+   return dev->iommu->ops->dev_feat_enabled(dev, feat);
 
return false;
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 524ffc2ff64f..ff0c76bdfb67 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -354,6 +354,7 @@ struct iommu_fault_param {
  * @fault_param: IOMMU detected device fault reporting data
  * @fwspec: IOMMU fwspec data
  * @iommu_dev:  IOMMU device this device is linked to
+ * @ops:iommu-ops for talking to the iommu_dev
  * @priv:   IOMMU Driver private data
  *
  * TODO: migrate other per device data pointers under iommu_dev_data, e.g.
@@ -364,6 +365,7 @@ struct dev_iommu {
struct iommu_fault_param*fault_param;
struct iommu_fwspec *fwspec;
struct iommu_device *iommu_dev;
+   const struct iommu_ops  *ops;
void*priv;
 };
 
-- 
2.17.1



[PATCH] iommu: Check dev->iommu in iommu_dev_xxx functions

2021-01-26 Thread Shameer Kolothum
The device iommu probe/attach might have failed, leaving dev->iommu
set to NULL, and device drivers may still invoke these functions, resulting
in a crash in the iommu vendor driver code. Hence make sure we check that.

Signed-off-by: Shameer Kolothum 
---
 drivers/iommu/iommu.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index ffeebda8d6de..cb68153c5cc0 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2867,7 +2867,7 @@ bool iommu_dev_has_feature(struct device *dev, enum 
iommu_dev_features feat)
 {
const struct iommu_ops *ops = dev->bus->iommu_ops;
 
-   if (ops && ops->dev_has_feat)
+   if (dev->iommu && ops && ops->dev_has_feat)
return ops->dev_has_feat(dev, feat);
 
return false;
@@ -2878,7 +2878,7 @@ int iommu_dev_enable_feature(struct device *dev, enum 
iommu_dev_features feat)
 {
const struct iommu_ops *ops = dev->bus->iommu_ops;
 
-   if (ops && ops->dev_enable_feat)
+   if (dev->iommu && ops && ops->dev_enable_feat)
return ops->dev_enable_feat(dev, feat);
 
return -ENODEV;
@@ -2894,7 +2894,7 @@ int iommu_dev_disable_feature(struct device *dev, enum 
iommu_dev_features feat)
 {
const struct iommu_ops *ops = dev->bus->iommu_ops;
 
-   if (ops && ops->dev_disable_feat)
+   if (dev->iommu && ops && ops->dev_disable_feat)
return ops->dev_disable_feat(dev, feat);
 
return -EBUSY;
@@ -2905,7 +2905,7 @@ bool iommu_dev_feature_enabled(struct device *dev, enum 
iommu_dev_features feat)
 {
const struct iommu_ops *ops = dev->bus->iommu_ops;
 
-   if (ops && ops->dev_feat_enabled)
+   if (dev->iommu && ops && ops->dev_feat_enabled)
return ops->dev_feat_enabled(dev, feat);
 
return false;
-- 
2.17.1



[PATCH] genirq/msi: Make sure early activation of all PCI MSIs

2021-01-21 Thread Shameer Kolothum
We currently do early activation of MSI irqs for PCI/MSI based on
the MSI_FLAG_ACTIVATE_EARLY flag. Though this activates all the
allocated MSIs in the case of MSI-X, it only does so for the
base irq in the case of MSI. This is because, for MSI, there
is only one msi_desc entry for all the 32 irqs it can support,
and the current implementation iterates over the msi entries and
so ends up activating only the base irq.

The above creates an issue on platforms where the msi controller
supports direct injection of vLPIs (e.g. ARM GICv4 ITS). On these
platforms, upon irq activation, the ITS driver maps the event to an
ITT entry. For guest pass-through to work, early mapping of
all the device MSI vectors is required. Otherwise, the vfio irq
bypass manager registration will fail. For example, on a HiSilicon D06
platform with GICv4 enabled, a guest boot with zip device pass-through
reports,

"vfio-pci :75:00.1: irq bypass producer (token 06e5176a)
registration fails: 66311"

and the guest boot fails.

This is traced to,
   kvm_arch_irq_bypass_add_producer
     kvm_vgic_v4_set_forwarding
       vgic_its_resolve_lpi --> returns E_ITS_INT_UNMAPPED_INTERRUPT

Hence make sure we activate all the irqs for both the MSI and MSI-X cases.
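
For illustration only (a minimal, standalone user-space sketch, not kernel
code; the struct and numbers below are made up): for MSI a single descriptor
covers several vectors, so activation has to walk base..base+nvec_used-1
rather than only the descriptor's base irq:

#include <stdio.h>

/* Simplified stand-in for the kernel's msi_desc: for MSI, one entry
 * describes nvec_used consecutive irqs; for MSI-X there is one entry
 * per vector. */
struct fake_msi_desc {
        int irq;                /* base irq */
        int nvec_used;          /* number of vectors backed by this entry */
};

static void activate_all(const struct fake_msi_desc *desc)
{
        int i;

        /* Walk every vector of the descriptor, not just desc->irq */
        for (i = 0; i < desc->nvec_used; i++)
                printf("activate irq %d\n", desc->irq + i);
}

int main(void)
{
        struct fake_msi_desc msi = { .irq = 100, .nvec_used = 4 };

        activate_all(&msi);     /* activates 100..103, not only irq 100 */
        return 0;
}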

Signed-off-by: Shameer Kolothum 
---
It is unclear to me whether not performing the early activation of all
MSI irqs was deliberate and has consequences on any other platforms.
Please let me know.

Thanks,
Shameer 
---
 kernel/irq/msi.c | 114 +--
 1 file changed, 90 insertions(+), 24 deletions(-)

diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 2c0c4d6d0f83..eec187fc32a9 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -395,6 +395,78 @@ static bool msi_check_reservation_mode(struct irq_domain 
*domain,
return desc->msi_attrib.is_msix || desc->msi_attrib.maskbit;
 }
 
+static void msi_domain_deactivate_irq(struct irq_domain *domain, int irq)
+{
+   struct irq_data *irqd;
+
+   irqd = irq_domain_get_irq_data(domain, irq);
+   if (irqd_is_activated(irqd))
+   irq_domain_deactivate_irq(irqd);
+}
+
+static int msi_domain_activate_irq(struct irq_domain *domain,
+  int irq, bool can_reserve)
+{
+   struct irq_data *irqd;
+
+   irqd = irq_domain_get_irq_data(domain, irq);
+   if (!can_reserve) {
+   irqd_clr_can_reserve(irqd);
+   if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
+   irqd_set_msi_nomask_quirk(irqd);
+   }
+   return irq_domain_activate_irq(irqd, can_reserve);
+}
+
+static int msi_domain_activate_msix_irqs(struct irq_domain *domain,
+struct device *dev, bool can_reserve)
+{
+   struct msi_desc *desc;
+   int ret, irq;
+
+   for_each_msi_entry(desc, dev) {
+   irq = desc->irq;
+   ret = msi_domain_activate_irq(domain, irq, can_reserve);
+   if (ret)
+   goto out;
+   }
+   return 0;
+
+out:
+   for_each_msi_entry(desc, dev) {
+   if (irq == desc->irq)
+   break;
+   msi_domain_deactivate_irq(domain, desc->irq);
+   }
+   return ret;
+}
+
+static int msi_domain_activate_msi_irqs(struct irq_domain *domain,
+   struct device *dev, bool can_reserve)
+{
+   struct msi_desc *desc;
+   int i, ret, base, irq;
+
+   desc = first_msi_entry(dev);
+   base = desc->irq;
+
+   for (i = 0; i < desc->nvec_used; i++) {
+   irq = base + i;
+   ret = msi_domain_activate_irq(domain, irq, can_reserve);
+   if (ret)
+   goto out;
+   }
+   return 0;
+
+out:
+   for (i = 0; i < desc->nvec_used; i++) {
+   if (irq == base + i)
+   break;
+   msi_domain_deactivate_irq(domain, base + i);
+   }
+   return ret;
+}
+
 int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
int nvec)
 {
@@ -443,21 +515,25 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, 
struct device *dev,
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
-   /*
-* This flag is set by the PCI layer as we need to activate
-* the MSI entries before the PCI layer enables MSI in the
-* card. Otherwise the card latches a random msi message.
-*/
-   if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
-   continue;
+   }
 
-   irq_data = irq_domain_get_irq_data(domain, desc->irq);
-   if (!can_reserve) {
-   irqd_clr_can_reserve(irq_data)

[PATCH] irqchip/gic-v3: Check SRE bit for GICv2 legacy support

2020-11-30 Thread Shameer Kolothum
At present, the support for GICv2 backward compatibility on GICv3/v4
hardware is determined based on whether DT/ACPI provides a memory
mapped phys base address for GIC virtual CPU interface register(GICV).
This creates a problem: a Qemu guest booted with the default GIC (GICv2)
hangs when firmware falsely reports this address on systems that don't
have support for legacy mode.

As per GICv3/v4 spec, in an implementation that does not support legacy
operation, affinity routing and system register access are permanently
enabled. This means that the associated control bits are RAO/WI. Hence
use the ICC_SRE_EL1.SRE bit to decide whether hardware supports GICv2
mode in addition to the above firmware-based check.

Signed-off-by: Shameer Kolothum 
---
On the HiSilicon D06, UEFI sets the GIC MADT GICC gicv_base_address, but the
GIC implementation on these boards doesn't have GICv2 legacy support.
This results in a guest boot hang when Qemu uses the default GIC option.

With this patch, the Qemu Guest with GICv2 now gracefully exits,
 "qemu-system-aarch64: host does not support in-kernel GICv2 emulation"

I am not very sure there is a better way to detect this other than checking
the SRE bit as done in this patch (of course, we will be fixing the UEFI
going forward).

Thanks,
Shameer

---
 drivers/irqchip/irq-gic-v3.c | 33 -
 1 file changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c
index 16fecc0febe8..15fa1eea45e4 100644
--- a/drivers/irqchip/irq-gic-v3.c
+++ b/drivers/irqchip/irq-gic-v3.c
@@ -1835,6 +1835,27 @@ static void __init gic_populate_ppi_partitions(struct 
device_node *gic_node)
of_node_put(parts_node);
 }
 
+/* SRE bit being RAO/WI implies no GICv2 legacy mode support */
+static bool __init gic_gicv2_compatible(void)
+{
+   u32 org, val;
+
+   org = gic_read_sre();
+   if (!(org & ICC_SRE_EL1_SRE))
+   return true;
+
+   val = org & ~ICC_SRE_EL1_SRE;
+   gic_write_sre(val);
+
+   val = gic_read_sre();
+   gic_write_sre(org);
+
+   if (val & ICC_SRE_EL1_SRE)
+   return false;
+
+   return true;
+}
+
 static void __init gic_of_setup_kvm_info(struct device_node *node)
 {
int ret;
@@ -1851,10 +1872,12 @@ static void __init gic_of_setup_kvm_info(struct 
device_node *node)
 _idx))
gicv_idx = 1;
 
-   gicv_idx += 3;  /* Also skip GICD, GICC, GICH */
-   ret = of_address_to_resource(node, gicv_idx, );
-   if (!ret)
-   gic_v3_kvm_info.vcpu = r;
+   if (gic_gicv2_compatible()) {
+   gicv_idx += 3;  /* Also skip GICD, GICC, GICH */
+   ret = of_address_to_resource(node, gicv_idx, );
+   if (!ret)
+   gic_v3_kvm_info.vcpu = r;
+   }
 
gic_v3_kvm_info.has_v4 = gic_data.rdists.has_vlpis;
gic_v3_kvm_info.has_v4_1 = gic_data.rdists.has_rvpeid;
@@ -2164,7 +2187,7 @@ static void __init gic_acpi_setup_kvm_info(void)
 
gic_v3_kvm_info.maint_irq = irq;
 
-   if (acpi_data.vcpu_base) {
+   if (gic_gicv2_compatible() && acpi_data.vcpu_base) {
struct resource *vcpu = _v3_kvm_info.vcpu;
 
vcpu->flags = IORESOURCE_MEM;
-- 
2.17.1



[PATCH v7 4/4] perf/smmuv3: Enable HiSilicon Erratum 162001800 quirk

2019-03-26 Thread Shameer Kolothum
HiSilicon erratum 162001800 describes the limitation of
SMMUv3 PMCG implementation on HiSilicon Hip08 platforms.

On these platforms, the PMCG event counter registers
(SMMU_PMCG_EVCNTRn) are read only and as a result it
is not possible to set the initial counter period value
on event monitor start.

To work around this, the current value of the counter
is read and used for delta calculations. OEM information
from ACPI header is used to identify the affected hardware
platforms.
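
As an aside, here is a minimal standalone sketch (plain user-space C, with
an assumed 32-bit counter width purely for illustration) of the wrap-safe
delta calculation that this workaround relies on:

#include <stdint.h>
#include <stdio.h>

/* prev holds whatever the read-only counter contained at event start;
 * masking to the implemented counter width keeps the subtraction
 * correct across a single counter wrap. */
static uint64_t counter_delta(uint64_t now, uint64_t prev, uint64_t mask)
{
        return (now - prev) & mask;
}

int main(void)
{
        uint64_t mask = 0xffffffffULL;  /* assume a 32-bit counter */

        /* counter started near the top of its range and wrapped once */
        printf("delta = 0x%llx\n",
               (unsigned long long)counter_delta(0x10, 0xfffffff0, mask));
        return 0;
}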

Signed-off-by: Shameer Kolothum 
Reviewed-by: Hanjun Guo 
Reviewed-by: Robin Murphy 
---
 drivers/acpi/arm64/iort.c | 16 ++-
 drivers/perf/arm_smmuv3_pmu.c | 48 ---
 include/linux/acpi_iort.h |  1 +
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e2c9b26..4dc68de 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1366,9 +1366,23 @@ static void __init 
arm_smmu_v3_pmcg_init_resources(struct resource *res,
   ACPI_EDGE_SENSITIVE, [2]);
 }
 
+static struct acpi_platform_list pmcg_plat_info[] __initdata = {
+   /* HiSilicon Hip08 Platform */
+   {"HISI  ", "HIP08   ", 0, ACPI_SIG_IORT, greater_than_or_equal, 0,
+IORT_SMMU_V3_PMCG_HISI_HIP08},
+   { }
+};
+
 static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
 {
-   u32 model = IORT_SMMU_V3_PMCG_GENERIC;
+   u32 model;
+   int idx;
+
+   idx = acpi_match_platform_list(pmcg_plat_info);
+   if (idx >= 0)
+   model = pmcg_plat_info[idx].data;
+   else
+   model = IORT_SMMU_V3_PMCG_GENERIC;
 
return platform_device_add_data(pdev, , sizeof(model));
 }
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 7803e9e..6b3c0ed 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -35,6 +35,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -93,6 +94,8 @@
 
 #define SMMU_PMCG_PA_SHIFT  12
 
+#define SMMU_PMCG_EVCNTR_RDONLY BIT(0)
+
 static int cpuhp_state_num;
 
 struct smmu_pmu {
@@ -108,6 +111,7 @@ struct smmu_pmu {
void __iomem *reg_base;
void __iomem *reloc_base;
u64 counter_mask;
+   u32 options;
bool global_filter;
u32 global_filter_span;
u32 global_filter_sid;
@@ -222,15 +226,27 @@ static void smmu_pmu_set_period(struct smmu_pmu *smmu_pmu,
u32 idx = hwc->idx;
u64 new;
 
-   /*
-* We limit the max period to half the max counter value of the counter
-* size, so that even in the case of extreme interrupt latency the
-* counter will (hopefully) not wrap past its initial value.
-*/
-   new = smmu_pmu->counter_mask >> 1;
+   if (smmu_pmu->options & SMMU_PMCG_EVCNTR_RDONLY) {
+   /*
+* On platforms that require this quirk, if the counter starts
+* at < half_counter value and wraps, the current logic of
+* handling the overflow may not work. It is expected that,
+* those platforms will have full 64 counter bits implemented
+* so that such a possibility is remote(eg: HiSilicon HIP08).
+*/
+   new = smmu_pmu_counter_get_value(smmu_pmu, idx);
+   } else {
+   /*
+* We limit the max period to half the max counter value
+* of the counter size, so that even in the case of extreme
+* interrupt latency the counter will (hopefully) not wrap
+* past its initial value.
+*/
+   new = smmu_pmu->counter_mask >> 1;
+   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
+   }
 
local64_set(>prev_count, new);
-   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
 }
 
 static void smmu_pmu_set_event_filter(struct perf_event *event,
@@ -669,6 +685,22 @@ static void smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
   smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
 }
 
+static void smmu_pmu_get_acpi_options(struct smmu_pmu *smmu_pmu)
+{
+   u32 model;
+
+   model = *(u32 *)dev_get_platdata(smmu_pmu->dev);
+
+   switch (model) {
+   case IORT_SMMU_V3_PMCG_HISI_HIP08:
+   /* HiSilicon Erratum 162001800 */
+   smmu_pmu->options |= SMMU_PMCG_EVCNTR_RDONLY;
+   break;
+   }
+
+   dev_notice(smmu_pmu->dev, "option mask 0x%x\n", smmu_pmu->options);
+}
+
 static int smmu_pmu_probe(struct platform_device *pdev)
 {
struct smmu_pmu *smmu_pmu;
@@ -748,6 +780,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
return -EINVAL;
}
 
+   smmu_pmu_get_acpi_options(smmu_pmu);
+
/* Pick one CPU

[PATCH v7 3/4] perf/smmuv3: Add MSI irq support

2019-03-26 Thread Shameer Kolothum
This adds support for MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
---
 drivers/perf/arm_smmuv3_pmu.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index c0924e5..7803e9e 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -67,6 +67,7 @@
 #define SMMU_PMCG_OVSSET0   0xCC0
 #define SMMU_PMCG_CFGR  0xE00
 #define SMMU_PMCG_CFGR_SID_FILTER_TYPE  BIT(23)
+#define SMMU_PMCG_CFGR_MSI  BIT(21)
 #define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
 #define SMMU_PMCG_CFGR_SIZE GENMASK(13, 8)
 #define SMMU_PMCG_CFGR_NCTR GENMASK(5, 0)
@@ -77,6 +78,12 @@
 #define SMMU_PMCG_IRQ_CTRL  0xE50
 #define SMMU_PMCG_IRQ_CTRL_IRQENBIT(0)
 #define SMMU_PMCG_IRQ_CFG0  0xE58
+#define SMMU_PMCG_IRQ_CFG1  0xE60
+#define SMMU_PMCG_IRQ_CFG2  0xE64
+
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
 
 #define SMMU_PMCG_DEFAULT_FILTER_SPAN   1
 #define SMMU_PMCG_DEFAULT_FILTER_SIDGENMASK(31, 0)
@@ -584,11 +591,62 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void 
*data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+  pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
 static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
 {
unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
int irq, ret = -ENXIO;
 
+   smmu_pmu_setup_msi(pmu);
+
irq = pmu->irq;
if (irq)
ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
-- 
2.7.4




[PATCH v7 2/4] perf/smmuv3: Add arm64 smmuv3 pmu driver

2019-03-26 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMUv3 PMU and add it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
<phys_addr_page> is the physical page address of the SMMU PMCG
wrapped to 4K boundary. For example, the PMCG at 0xff8884 is
named smmuv3_pmcg_ff88840

Filtering by stream id is done by specifying filtering parameters
with the event. options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against

Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a netperf

Applies filter pattern 0x42 to transaction events, which means events
matching stream ids 0x42 & 0x43 are counted as only upper StreamID
bits are required to match the given filter. Further filtering
information is available in the SMMU documentation.
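
For clarity, a small standalone sketch (user-space C; the helper name is
made up, the expansion simply follows the span-match description above) of
how a span-encoded pattern maps to the stream ids it matches:

#include <stdint.h>
#include <stdio.h>

/* With span (pattern) matching, the run of trailing 1 bits below the
 * lowest 0 bit determines how many low StreamID bits are ignored. */
static void span_expand(uint32_t pattern, uint32_t *first, uint32_t *count)
{
        uint32_t y = 0;

        while (pattern & (1u << y))     /* count the trailing ones */
                y++;
        y++;                            /* include the terminating 0 bit */

        *count = 1u << y;               /* 2^y stream ids are matched */
        *first = pattern & ~(*count - 1);
}

int main(void)
{
        uint32_t first, count;

        span_expand(0x42, &first, &count);
        /* prints: pattern 0x42 matches 2 stream ids starting at 0x42 */
        printf("pattern 0x42 matches %u stream ids starting at 0x%x\n",
               count, first);
        return 0;
}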

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 776 ++
 3 files changed, 786 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index af9bc17..6a472fc 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMU_V3_PMU
+bool "ARM SMMUv3 Performance Monitors Extension"
+depends on ARM64 && ACPI && ARM_SMMU_V3
+  help
+  Provides support for the SMMU version 3 performance monitor unit 
(PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 909f27f..3048994 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..c0924e5
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,776 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as smmuv3_pmcg_ where
+ *  is the physical page address of the SMMU PMCG wrapped
+ * to 4K boundary. For example, the PMCG at 0xff8884 is named
+ * smmuv3_pmcg_ff88840
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ *
+ * To match a partial StreamID where the X most-significant bits must match
+ * but the Y least-significant bits might differ, STREAMID is programmed
+ * with a value that contains:
+ *  STREAMID[Y - 1] == 0.
+ *  STREAMID[Y - 2:0] == 1 (where Y > 1).
+ * The remainder of implemented bits of STREAMID (X bits, from bit Y upwards)
+ * contain a value to match from the corresponding bits of event StreamID.
+ *
+ * Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
+ *filter_span=1,filter_stream_id=0x42/ -a netperf
+ * Applies filter pattern 0x42 to transaction events, which means events
+ * matching stream ids 0x42 and 0x43 are counted. Further filtering
+ * information is available in the SMMU documentation.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_SID_SPAN_SHIFT  

[PATCH v7 1/4] ACPI/IORT: Add support for PMCG

2019-03-26 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMUv3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
Acked-by: Lorenzo Pieralisi 
---
 drivers/acpi/arm64/iort.c | 117 --
 include/linux/acpi_iort.h |   7 +++
 2 files changed, 100 insertions(+), 24 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e48894e..e2c9b26 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct 
acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node 
*node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1218,14 +1221,23 @@ static void __init arm_smmu_v3_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_v3_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_v3_dma_configure(struct device *dev,
+struct acpi_iort_node *node)
 {
struct acpi_iort_smmu_v3 *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMUv3 specific data */
smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE;
+   attr = (smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for all SMMUv3 set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
 }
 
 #if defined(CONFIG_ACPI_NUMA)
@@ -1301,30 +1313,82 @@ static void __init arm_smmu_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_dma_configure(struct device *dev,
+ struct acpi_iort_node *node)
 {
struct acpi_iort_smmu *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMU specific data */
smmu = (struct acpi_iort_smmu *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
+   attr = (smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for SMMU set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmcg_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, [2]);
+}
+
+static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
+{
+   u32 model = IORT_SMMU_V3_PMCG_GENERIC;
+
+   return platform_device_add_data(pdev, , sizeof(model));
 }
 
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
-   bool (*dev_is_coherent)(struct acpi_i

[PATCH v7 0/4] arm64 SMMUv3 PMU driver with IORT support

2019-03-26 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder[1]

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page>
where <phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff8884 is named smmuv3_pmcg_ff88840

Usage example:
For common arch supported events:
perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a netperf

For IMP DEF events:
perf stat -e smmuv3_pmcg_ff88840/event=<id>/ -a netperf

This is sanity tested on a HiSilicon platform that requires
a quirk to run it properly. As per HiSilicon erratum #162001800,
PMCG event counter registers (SMMU_PMCG_EVCNTRn) on HiSilicon Hip08
platforms are read only and this prevents the software from setting
the initial period on event start. Unfortunately we were a bit late
in the cycle to detect this issue and now require a software workaround
for this. Patch #4 is added to this series to provide a workaround
for this issue.

Further testing on supported platforms is very much welcome.

v6 --> v7
-Addressed comments from Robin and Lorenzo.
-Added R-by from Robin/Hanjun and A-by from Lorenzo.

v5 ---> v6
-Addressed comments from Robin and Andrew.
-Changed the way global filter settings are applied as a probable
 fix to the v5 bug where in-use settings gets overwritten.
-Use of PMCG model number to identify the platform.
-Added R-by from Robin to patches #1 and #3.

v4 ---> v5
-IORT code is modified to pass the option/quirk flags to the driver
 through platform_data (patch #4), based on Robin's comments.
-Removed COMPILE_TEST (patch #2).

v3 --> v4

-Addressed comments from Jean and Robin.
-Merged dma config callbacks as per Lorenzo's comments(patch #1).
-Added handling of Global(Counter0) filter settings mode(patch #2).
-Added patch #4 to address HiSilicon erratum  #162001800
-
v2 --> v3

-Addressed comments from Robin.
-Removed iort helper function to retrieve the PMCG reference smmu.
-PMCG devices are now named using the base address

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named PMUs
  to make the association visible to the user.
- Added MSI support  for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html


Neil Leeder (2):
  ACPI/IORT: Add support for PMCG
  perf/smmuv3: Add arm64 smmuv3 pmu driver

Shameer Kolothum (2):
  perf/smmuv3: Add MSI irq support
  perf/smmuv3: Enable HiSilicon Erratum 162001800 quirk

 drivers/acpi/arm64/iort.c | 131 +--
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 868 ++
 include/linux/acpi_iort.h |   8 +
 5 files changed, 993 insertions(+), 24 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




[PATCH v6 0/4] arm64 SMMUv3 PMU driver with IORT support

2019-02-04 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder[1]

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page>
where <phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff8884 is named smmuv3_pmcg_ff88840

Usage example:
For common arch supported events:
perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a netperf

For IMP DEF events:
perf stat -e smmuv3_pmcg_ff88840/event=<id>/ -a netperf

This is sanity tested on a HiSilicon platform that requires
a quirk to run it properly. As per HiSilicon erratum #162001800,
PMCG event counter registers (SMMU_PMCG_EVCNTRn) on HiSilicon Hip08
platforms are read only and this prevents the software from setting
the initial period on event start. Unfortunately we were a bit late
in the cycle to detect this issue and now require a software workaround
for this. Patch #4 is added to this series to provide a workaround
for this issue.

Further testing on supported platforms is very much welcome.

v5 ---> v6
-Addressed comments from Robin and Andrew.
-Changed the way global filter settings are applied as a probable
 fix to the v5 bug where in-use settings gets overwritten.
-Use of PMCG model number to identify the platform.
-Added R-by from Robin to patches #1 and #3.

v4 ---> v5
-IORT code is modified to pass the option/quirk flags to the driver
 through platform_data (patch #4), based on Robin's comments.
-Removed COMPILE_TEST (patch #2).

v3 --> v4

-Addressed comments from Jean and Robin.
-Merged dma config callbacks as per Lorenzo's comments(patch #1).
-Added handling of Global(Counter0) filter settings mode(patch #2).
-Added patch #4 to address HiSilicon erratum  #162001800
-
v2 --> v3

-Addressed comments from Robin.
-Removed iort helper function to retrieve the PMCG reference smmu.
-PMCG devices are now named using the base address

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named PMUs
  to make the association visible to the user.
- Added MSI support  for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html

Neil Leeder (2):
  acpi: arm64: add iort support for PMCG
  perf: add arm64 smmuv3 pmu driver

Shameer Kolothum (2):
  perf/smmuv3: Add MSI irq support
  perf/smmuv3_pmu: Enable HiSilicon Erratum 162001800 quirk

 drivers/acpi/arm64/iort.c | 131 +--
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 873 ++
 include/linux/acpi_iort.h |   7 +
 5 files changed, 997 insertions(+), 24 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




[PATCH v6 2/4] perf: add arm64 smmuv3 pmu driver

2019-02-04 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMUv3 PMU and add it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
<phys_addr_page> is the physical page address of the SMMU PMCG
wrapped to 4K boundary. For example, the PMCG at 0xff8884 is
named smmuv3_pmcg_ff88840

Filtering by stream id is done by specifying filtering parameters
with the event. options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against

Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a netperf

Applies filter pattern 0x42 to transaction events, which means events
matching stream ids 0x42 & 0x43 are counted as only upper StreamID
bits are required to match the given filter. Further filtering
information is available in the SMMU documentation.

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 781 ++
 3 files changed, 791 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index af9bc17..6a472fc 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMU_V3_PMU
+bool "ARM SMMUv3 Performance Monitors Extension"
+depends on ARM64 && ACPI && ARM_SMMU_V3
+  help
+  Provides support for the SMMU version 3 performance monitor unit 
(PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 909f27f..3048994 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..0371c01
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,781 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as smmuv3_pmcg_ where
+ *  is the physical page address of the SMMU PMCG wrapped
+ * to 4K boundary. For example, the PMCG at 0xff8884 is named
+ * smmuv3_pmcg_ff88840
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ *
+ * To match a partial StreamID where the X most-significant bits must match
+ * but the Y least-significant bits might differ, STREAMID is programmed
+ * with a value that contains:
+ *  STREAMID[Y - 1] == 0.
+ *  STREAMID[Y - 2:0] == 1 (where Y > 1).
+ * The remainder of implemented bits of STREAMID (X bits, from bit Y upwards)
+ * contain a value to match from the corresponding bits of event StreamID.
+ *
+ * Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
+ *filter_span=1,filter_stream_id=0x42/ -a netperf
+ * Applies filter pattern 0x42 to transaction events, which means events
+ * matching stream ids 0x42 and 0x43 are counted. Further filtering
+ * information is available in the SMMU documentation.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_SID_SPAN_SHIFT29
+#define SMMU_PMCG_S

[PATCH v6 1/4] acpi: arm64: add iort support for PMCG

2019-02-04 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMUv3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
---
 drivers/acpi/arm64/iort.c | 117 --
 include/linux/acpi_iort.h |   6 +++
 2 files changed, 99 insertions(+), 24 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e48894e..e2c9b26 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct 
acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node 
*node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1218,14 +1221,23 @@ static void __init arm_smmu_v3_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_v3_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_v3_dma_configure(struct device *dev,
+struct acpi_iort_node *node)
 {
struct acpi_iort_smmu_v3 *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMUv3 specific data */
smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE;
+   attr = (smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for all SMMUv3 set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
 }
 
 #if defined(CONFIG_ACPI_NUMA)
@@ -1301,30 +1313,82 @@ static void __init arm_smmu_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_dma_configure(struct device *dev,
+ struct acpi_iort_node *node)
 {
struct acpi_iort_smmu *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMU specific data */
smmu = (struct acpi_iort_smmu *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
+   attr = (smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for SMMU set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmcg_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, [2]);
+}
+
+static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
+{
+   u32 model = IORT_SMMU_V3_PMCG_GENERIC;
+
+   return platform_device_add_data(pdev, , sizeof(model));
 }
 
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
-   bool (*dev_is_coherent)(struct acpi_iort_node *node);
+   void (*dev

[PATCH v6 3/4] perf/smmuv3: Add MSI irq support

2019-02-04 Thread Shameer Kolothum
This adds support for MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
Reviewed-by: Robin Murphy 
---
 drivers/perf/arm_smmuv3_pmu.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 0371c01..eeb9dee 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -67,6 +67,7 @@
 #define SMMU_PMCG_OVSSET0   0xCC0
 #define SMMU_PMCG_CFGR  0xE00
 #define SMMU_PMCG_CFGR_SID_FILTER_TYPE  BIT(23)
+#define SMMU_PMCG_CFGR_MSI  BIT(21)
 #define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
 #define SMMU_PMCG_CFGR_SIZE GENMASK(13, 8)
 #define SMMU_PMCG_CFGR_NCTR GENMASK(5, 0)
@@ -77,6 +78,12 @@
 #define SMMU_PMCG_IRQ_CTRL  0xE50
 #define SMMU_PMCG_IRQ_CTRL_IRQENBIT(0)
 #define SMMU_PMCG_IRQ_CFG0  0xE58
+#define SMMU_PMCG_IRQ_CFG1  0xE60
+#define SMMU_PMCG_IRQ_CFG2  0xE64
+
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
 
 #define SMMU_PMCG_DEFAULT_FILTER_SPAN   1
 #define SMMU_PMCG_DEFAULT_FILTER_SIDGENMASK(31, 0)
@@ -589,11 +596,62 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void 
*data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+  pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
 static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
 {
unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
int irq, ret = -ENXIO;
 
+   smmu_pmu_setup_msi(pmu);
+
irq = pmu->irq;
if (irq)
ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
-- 
2.7.4




[PATCH v6 4/4] perf/smmuv3_pmu: Enable HiSilicon Erratum 162001800 quirk

2019-02-04 Thread Shameer Kolothum
HiSilicon erratum 162001800 describes the limitation of
SMMUv3 PMCG implementation on HiSilicon Hip08 platforms.

On these platforms, the PMCG event counter registers
(SMMU_PMCG_EVCNTRn) are read only and as a result it
is not possible to set the initial counter period value
on event monitor start.

To work around this, the current value of the counter
is read and used for delta calculations. OEM information
from ACPI header is used to identify the affected hardware
platforms.

Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 16 ++-
 drivers/perf/arm_smmuv3_pmu.c | 48 ---
 include/linux/acpi_iort.h |  1 +
 3 files changed, 57 insertions(+), 8 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index e2c9b26..4dc68de 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1366,9 +1366,23 @@ static void __init 
arm_smmu_v3_pmcg_init_resources(struct resource *res,
   ACPI_EDGE_SENSITIVE, [2]);
 }
 
+static struct acpi_platform_list pmcg_plat_info[] __initdata = {
+   /* HiSilicon Hip08 Platform */
+   {"HISI  ", "HIP08   ", 0, ACPI_SIG_IORT, greater_than_or_equal, 0,
+IORT_SMMU_V3_PMCG_HISI_HIP08},
+   { }
+};
+
 static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
 {
-   u32 model = IORT_SMMU_V3_PMCG_GENERIC;
+   u32 model;
+   int idx;
+
+   idx = acpi_match_platform_list(pmcg_plat_info);
+   if (idx >= 0)
+   model = pmcg_plat_info[idx].data;
+   else
+   model = IORT_SMMU_V3_PMCG_GENERIC;
 
return platform_device_add_data(pdev, , sizeof(model));
 }
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index eeb9dee..95a3ed0 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -35,6 +35,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -93,6 +94,8 @@
 
 #define SMMU_PMCG_PA_SHIFT  12
 
+#define SMMU_PMCG_EVCNTR_RDONLY BIT(0)
+
 static int cpuhp_state_num;
 
 struct smmu_pmu {
@@ -107,6 +110,7 @@ struct smmu_pmu {
struct device *dev;
void __iomem *reg_base;
void __iomem *reloc_base;
+   u32 options;
u64 counter_mask;
bool global_filter;
u32 global_filter_span;
@@ -222,15 +226,27 @@ static void smmu_pmu_set_period(struct smmu_pmu *smmu_pmu,
u32 idx = hwc->idx;
u64 new;
 
-   /*
-* We limit the max period to half the max counter value of the counter
-* size, so that even in the case of extreme interrupt latency the
-* counter will (hopefully) not wrap past its initial value.
-*/
-   new = smmu_pmu->counter_mask >> 1;
+   if (smmu_pmu->options & SMMU_PMCG_EVCNTR_RDONLY) {
+   /*
+* On platforms that require this quirk, if the counter starts
+* at < half_counter value and wraps, the current logic of
+* handling the overflow may not work. It is expected that,
+* those platforms will have full 64 counter bits implemented
+* so that such a possibility is remote(eg: HiSilicon HIP08).
+*/
+   new = smmu_pmu_counter_get_value(smmu_pmu, idx);
+   } else {
+   /*
+* We limit the max period to half the max counter value
+* of the counter size, so that even in the case of extreme
+* interrupt latency the counter will (hopefully) not wrap
+* past its initial value.
+*/
+   new = smmu_pmu->counter_mask >> 1;
+   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
+   }
 
local64_set(>prev_count, new);
-   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
 }
 
 static void smmu_pmu_set_event_filter(struct perf_event *event,
@@ -674,6 +690,22 @@ static void smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
   smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
 }
 
+static void smmu_pmu_get_acpi_options(struct smmu_pmu *smmu_pmu)
+{
+   u32 model;
+
+   model = *(u32 *)dev_get_platdata(smmu_pmu->dev);
+
+   switch (model) {
+   case IORT_SMMU_V3_PMCG_HISI_HIP08:
+   /* HiSilicon Erratum 162001800 */
+   smmu_pmu->options |= SMMU_PMCG_EVCNTR_RDONLY;
+   break;
+   }
+
+   dev_notice(smmu_pmu->dev, "option mask 0x%x\n", smmu_pmu->options);
+}
+
 static int smmu_pmu_probe(struct platform_device *pdev)
 {
struct smmu_pmu *smmu_pmu;
@@ -752,6 +784,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
return -EINVAL;
}
 
+   smmu_pmu_get_acpi_options(smmu_pmu);
+
/* Pick one CPU to be the preferred one to use */
smmu_pmu->on

[PATCH v4] irqchip: gicv3-its: Use NUMA aware memory allocation for ITS tables

2019-01-14 Thread Shameer Kolothum
From: Shanker Donthineni 

The NUMA node information is visible to the ITS driver but is not being used
other than for handling hardware errata. ITS/GICR hardware accesses to the
local NUMA node are usually quicker than to the remote NUMA node. How slow
the remote NUMA accesses are depends on the implementation details.

This patch allocates memory for the ITS management tables and command
queue from the corresponding NUMA node using the appropriate NUMA
aware functions. This change improves the performance of the ITS
tables read latency on systems that have more than one ITS block,
and with the slower inter-node accesses.

Apache Web server benchmarking using ab tool on a HiSilicon D06
board with multiple numa mem nodes shows Time per request and
Transfer rate improvements of ~3.6% with this patch.

Signed-off-by: Shanker Donthineni 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
Reviewed-by: Ganapatrao Kulkarni 
---

This is to revive the patch originally sent by Shanker[1] and 
to back it up with a benchmark test. Any further testing of
this is most welcome.

v3-->v4
-Addressed comments on alloc_pages_node() and page_address() usage.
-Rebased on 5.0-rc1
-Added Ganapatrao's R-by.

v2-->v3
 -Addressed comments to use page_address().
 -Added Benchmark results to commit log.
 -Removed T-by from Ganapatrao for now.

v1-->v2
 -Edited commit text.
 -Added Ganapatrao's tested-by.

Benchmark test details:

Test Setup:
-D06 with dimm on node 0(Sock#0) and 3 (Sock#1).
-ITS belongs to numa node 0.
-Filesystem mounted on a PCIe NVMe based disk.
-Apache server installed on D06.
-Running ab benchmark test in concurrency mode from a remote m/c
 connected to D06 via  hns3(PCIe) n/w port.
 "ab -k -c 750 -n 200 http://10.202.225.188/;

Test results are avg. of 15 runs.

For 4.20-rc1  Kernel,

Time per request(mean, concurrent)  = 0.02753[ms]  
Transfer Rate = 416501[Kbytes/sec]

For 4.20-rc1 +  this patch,
--
Time per request(mean, concurrent)  = 0.02653[ms]  
Transfer Rate = 431954[Kbytes/sec]

% improvement ~3.6%
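(computed on Time per request: (0.02753 - 0.02653) / 0.02753 ≈ 3.6%;
the corresponding Transfer Rate gain, 416501 -> 431954 Kbytes/sec, is ~3.7%)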

vmstat shows around 170K-200K interrupts per second.

~# vmstat 1 -w
procs -----------memory----------- ---system---
 r  b   swpd       free         in
 5  0      0   30166724     102794
 9  0      0   30141828     171148
 5  0      0   30150160     207185
13  0      0   30145924     175691
15  0      0   30140792     145250
13  0      0   30135556     201879
13  0      0   30134864     192391
10  0      0   30133632     168880


[1] https://patchwork.kernel.org/patch/989/

 drivers/irqchip/irq-gic-v3-its.c | 26 --
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index db20e99..5df59ad 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1737,6 +1737,7 @@ static int its_setup_baser(struct its_node *its, struct 
its_baser *baser,
u64 type = GITS_BASER_TYPE(val);
u64 baser_phys, tmp;
u32 alloc_pages;
+   struct page *page;
void *base;
 
 retry_alloc_baser:
@@ -1749,10 +1750,11 @@ static int its_setup_baser(struct its_node *its, struct 
its_baser *baser,
order = get_order(GITS_BASER_PAGES_MAX * psz);
}
 
-   base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-   if (!base)
+   page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO, order);
+   if (!page)
return -ENOMEM;
 
+   base = (void *)page_address(page);
baser_phys = virt_to_phys(base);
 
/* Check if the physical address of the memory is above 48bits */
@@ -2236,7 +2238,8 @@ static struct its_baser *its_get_baser(struct its_node 
*its, u32 type)
return NULL;
 }
 
-static bool its_alloc_table_entry(struct its_baser *baser, u32 id)
+static bool its_alloc_table_entry(struct its_node *its,
+ struct its_baser *baser, u32 id)
 {
struct page *page;
u32 esz, idx;
@@ -2256,7 +2259,8 @@ static bool its_alloc_table_entry(struct its_baser 
*baser, u32 id)
 
/* Allocate memory for 2nd level table */
if (!table[idx]) {
-   page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 
get_order(baser->psz));
+   page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
+   get_order(baser->psz));
if (!page)
return false;
 
@@ -2287,7 +2291,7 @@ static bool its_alloc_device_table(struct its_node *its, 
u32 dev_id)
if (!baser)
return (ilog2(dev_id) < its->device_ids);
 
-   return its_alloc_table_entry(baser, dev_id);
+   return its_alloc_table_entry(its, baser, dev_id);

[PATCH v3] irqchip: gicv3-its: Use NUMA aware memory allocation for ITS tables

2018-12-13 Thread Shameer Kolothum
From: Shanker Donthineni 

The NUMA node information is visible to the ITS driver but is not used
for anything other than handling hardware errata. ITS/GICR hardware
accesses to the local NUMA node are usually quicker than accesses to a
remote NUMA node; how much slower the remote accesses are depends on
the implementation details.

This patch allocates memory for the ITS management tables and command
queue from the corresponding NUMA node using the appropriate NUMA-aware
functions. This change improves the ITS table read latency on systems
that have more than one ITS block and slower inter-node accesses.

Apache Web server benchmarking using the ab tool on a HiSilicon D06
board with multiple NUMA memory nodes shows Time per request and
Transfer rate improvements of ~3.6% with this patch.

Signed-off-by: Shanker Donthineni 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
---

This is to revive the patch originally sent by Shanker[1] and 
to back it up with a benchmark test. Any further testing of
this is most welcome.

v2-->v3
 -Addressed comments to use page_address().
 -Added Benchmark results to commit log.
 -Removed T-by from Ganapatrao for now.

v1-->v2
 -Edited commit text.
 -Added Ganapatrao's tested-by.

Benchmark test details:

Test Setup:
-D06 with dimm on node 0(Sock#0) and 3 (Sock#1).
-ITS belongs to numa node 0.
-Filesystem mounted on a PCIe NVMe based disk.
-Apache server installed on D06.
-Running ab benchmark test in concurrency mode from a remote m/c
 connected to D06 via  hns3(PCIe) n/w port.
 "ab -k -c 750 -n 200 http://10.202.225.188/;

Test results are avg. of 15 runs.

For 4.20-rc1  Kernel,

Time per request(mean, concurrent)  = 0.02753[ms]  
Transfer Rate = 416501[Kbytes/sec]

For 4.20-rc1 +  this patch,
--
Time per request(mean, concurrent)  = 0.02653[ms]  
Transfer Rate = 431954[Kbytes/sec]

% improvement ~3.6%

vmstat shows around 170K-200K interrupts per second.

~# vmstat 1 -w
 r  b  swpd      free      in
 5  0     0  30166724  102794
 9  0     0  30141828  171148
 5  0     0  30150160  207185
13  0     0  30145924  175691
15  0     0  30140792  145250
13  0     0  30135556  201879
13  0     0  30134864  192391
10  0     0  30133632  168880

[1] https://patchwork.kernel.org/patch/989/

 drivers/irqchip/irq-gic-v3-its.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c
index db20e99..ab01061 100644
--- a/drivers/irqchip/irq-gic-v3-its.c
+++ b/drivers/irqchip/irq-gic-v3-its.c
@@ -1749,7 +1749,8 @@ static int its_setup_baser(struct its_node *its, struct 
its_baser *baser,
order = get_order(GITS_BASER_PAGES_MAX * psz);
}
 
-   base = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+   base = (void *)page_address(alloc_pages_node(its->numa_node,
+   GFP_KERNEL | __GFP_ZERO, order));
if (!base)
return -ENOMEM;
 
@@ -2236,7 +2237,8 @@ static struct its_baser *its_get_baser(struct its_node 
*its, u32 type)
return NULL;
 }
 
-static bool its_alloc_table_entry(struct its_baser *baser, u32 id)
+static bool its_alloc_table_entry(struct its_node *its,
+ struct its_baser *baser, u32 id)
 {
struct page *page;
u32 esz, idx;
@@ -2256,7 +2258,8 @@ static bool its_alloc_table_entry(struct its_baser 
*baser, u32 id)
 
/* Allocate memory for 2nd level table */
if (!table[idx]) {
-   page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 
get_order(baser->psz));
+   page = alloc_pages_node(its->numa_node, GFP_KERNEL | __GFP_ZERO,
+   get_order(baser->psz));
if (!page)
return false;
 
@@ -2287,7 +2290,7 @@ static bool its_alloc_device_table(struct its_node *its, 
u32 dev_id)
if (!baser)
return (ilog2(dev_id) < its->device_ids);
 
-   return its_alloc_table_entry(baser, dev_id);
+   return its_alloc_table_entry(its, baser, dev_id);
 }
 
 static bool its_alloc_vpe_table(u32 vpe_id)
@@ -2311,7 +2314,7 @@ static bool its_alloc_vpe_table(u32 vpe_id)
if (!baser)
return false;
 
-   if (!its_alloc_table_entry(baser, vpe_id))
+   if (!its_alloc_table_entry(its, baser, vpe_id))
return false;
}
 
@@ -2345,7 +2348,7 @@ static struct its_device *its_create_device(struct 
its_node *its, u32 dev_id,
nr_ites = max(2, nvecs);
sz = nr_ites * its->ite_size;

[PATCH v5 2/4] perf: add arm64 smmuv3 pmu driver

2018-11-30 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMUv3 PMU and plugs it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named smmuv3_pmcg_<phys_addr_page>, where
<phys_addr_page> is the physical page address of the SMMU PMCG
wrapped to a 4K boundary. For example, the PMCG at 0xff88840000 is
named smmuv3_pmcg_ff88840.

Filtering by stream id is done by specifying filtering parameters
with the event. The options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against

Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a netperf

This applies filter pattern 0x42 to transaction events, which means
events matching stream ids 0x42 and 0x43 are counted, as only the
upper StreamID bits are required to match the given filter. Further
filtering information is available in the SMMU documentation.

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.
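
For illustration, here is a small sketch of how a span-match value can
be composed from a base stream id and the number of don't-care low
bits, following the encoding documented in the file-header comment of
the driver below (the helper name is hypothetical and not part of the
driver):

#include <linux/types.h>

/*
 * Illustrative only: build the STREAMID value to program for span
 * matching. The lowest span_bits bits of the stream id may differ;
 * per the encoding, bit[span_bits - 1] is 0 and bits[span_bits - 2:0]
 * are 1. With base = 0x42 and span_bits = 1 this returns 0x42, which
 * matches stream ids 0x42 and 0x43, as in the example above.
 */
static u32 smmu_pmu_span_streamid_example(u32 base, unsigned int span_bits)
{
	u32 dont_care = (1U << span_bits) - 1;	/* low bits allowed to differ */
	u32 low_pattern = dont_care >> 1;	/* 0b0...0111 of width span_bits */

	return (base & ~dont_care) | low_pattern;
}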

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 778 ++
 3 files changed, 788 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 08ebaf7..92be6a3 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMU_V3_PMU
+bool "ARM SMMUv3 Performance Monitors Extension"
+depends on ARM64 && ACPI && ARM_SMMU_V3
+  help
+  Provides support for the SMMU version 3 performance monitor unit 
(PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b3902bd..f10a932 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..fb9dcd8
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,778 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
+ * <phys_addr_page> is the physical page address of the SMMU PMCG wrapped
+ * to 4K boundary. For example, the PMCG at 0xff88840000 is named
+ * smmuv3_pmcg_ff88840
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ *
+ * To match a partial StreamID where the X most-significant bits must match
+ * but the Y least-significant bits might differ, STREAMID is programmed
+ * with a value that contains:
+ *  STREAMID[Y - 1] == 0.
+ *  STREAMID[Y - 2:0] == 1 (where Y > 1).
+ * The remainder of implemented bits of STREAMID (X bits, from bit Y upwards)
+ * contain a value to match from the corresponding bits of event StreamID.
+ *
+ * Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
+ *filter_span=1,filter_stream_id=0x42/ -a netperf
+ * Applies filter pattern 0x42 to transaction events, which means events
+ * matching stream ids 0x42 and 0x43 are counted. Further filtering
+ * information is available in the SMMU documentation.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_SID_SPAN_SHIFT29
+#define SMMU_PMCG_SID_SPAN_M

[PATCH v5 1/4] acpi: arm64: add iort support for PMCG

2018-11-30 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMUv3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 97 +--
 1 file changed, 76 insertions(+), 21 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 2a361e2..2da08e1 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct 
acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node 
*node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1216,14 +1219,23 @@ static void __init arm_smmu_v3_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_v3_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_v3_dma_configure(struct device *dev,
+struct acpi_iort_node *node)
 {
struct acpi_iort_smmu_v3 *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMUv3 specific data */
smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE;
+   attr = (smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for all SMMUv3 set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
 }
 
 #if defined(CONFIG_ACPI_NUMA)
@@ -1299,20 +1311,64 @@ static void __init arm_smmu_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_dma_configure(struct device *dev,
+ struct acpi_iort_node *node)
 {
struct acpi_iort_smmu *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMU specific data */
smmu = (struct acpi_iort_smmu *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
+   attr = (smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for SMMU set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmcg_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, [2]);
 }
 
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
-   bool (*dev_is_coherent)(struct acpi_iort_node *node);
+   void (*dev_dma_configure)(struct device *dev,
+   struct acpi_iort_node *node);
int (*dev_count_resources)(struct acpi_iort_node *node);
void (*dev_init_resources)(struct resource *res,
 stru

[PATCH v5 0/4] arm64 SMMUv3 PMU driver with IORT support

2018-11-30 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder[1]

SMMUv3 PMCG devices are named smmuv3_pmcg_<phys_addr_page>,
where <phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff88840000 is named smmuv3_pmcg_ff88840.

Usage example:
For common arch supported events:
perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a netperf

For IMP DEF events:
perf stat -e smmuv3_pmcg_ff88840/event=id/ -a netperf

This is sanity tested on a HiSilicon platform that requires a quirk
to run properly. As per HiSilicon erratum #162001800, the PMCG event
counter registers (SMMU_PMCG_EVCNTRn) on HiSilicon Hip08 platforms
are read only, and this prevents the software from setting the
initial period on event start. Unfortunately we were a bit late in
the cycle to detect this issue and now require a software workaround.
Patch #4 is added to this series to provide that workaround.

Further testing on supported platforms is very much welcome.

v4 ---> v5
-IORT code is modified to pass the option/quirk flags to the driver
 through platform_data (patch #4), based on Robin's comments.
-Removed COMPILE_TEST (patch #2).

v3 --> v4

-Addressed comments from Jean and Robin.
-Merged dma config callbacks as per Lorenzo's comments(patch #1).
-Added handling of Global(Counter0) filter settings mode(patch #2).
-Added patch #4 to address HiSilicon erratum  #162001800
-
v2 --> v3

-Addressed comments from Robin.
-Removed iort helper function to retrieve the PMCG reference smmu.
-PMCG devices are now named using the base address

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named the PMUs
  to make the association visible to the user.
- Added MSI support  for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html

Neil Leeder (2):
  acpi: arm64: add iort support for PMCG
  perf: add arm64 smmuv3 pmu driver

Shameer Kolothum (2):
  perf/smmuv3: Add MSI irq support
  perf/smmuv3_pmu: Enable HiSilicon Erratum 162001800 quirk

 drivers/acpi/arm64/iort.c | 127 +--
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 859 ++
 include/linux/acpi_iort.h |   3 +
 5 files changed, 975 insertions(+), 24 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




[PATCH v5 3/4] perf/smmuv3: Add MSI irq support

2018-11-30 Thread Shameer Kolothum
This adds support for an MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
---
 drivers/perf/arm_smmuv3_pmu.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index fb9dcd8..71d10a0 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -68,6 +68,7 @@
 #define SMMU_PMCG_OVSSET0   0xCC0
 #define SMMU_PMCG_CFGR  0xE00
 #define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
+#define SMMU_PMCG_CFGR_MSI  BIT(21)
 #define SMMU_PMCG_CFGR_SID_FILTER_TYPE  BIT(23)
 #define SMMU_PMCG_CFGR_SIZE_MASKGENMASK(13, 8)
 #define SMMU_PMCG_CFGR_NCTR_MASKGENMASK(5, 0)
@@ -78,6 +79,12 @@
 #define SMMU_PMCG_IRQ_CTRL  0xE50
 #define SMMU_PMCG_IRQ_CTRL_IRQENBIT(0)
 #define SMMU_PMCG_IRQ_CFG0  0xE58
+#define SMMU_PMCG_IRQ_CFG1  0xE60
+#define SMMU_PMCG_IRQ_CFG2  0xE64
+
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
 
 #define SMMU_DEFAULT_FILTER_SPAN1
 #define SMMU_DEFAULT_FILTER_STREAM_ID   GENMASK(31, 0)
@@ -587,11 +594,62 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void 
*data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+  pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
 static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
 {
unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
int irq, ret = -ENXIO;
 
+   smmu_pmu_setup_msi(pmu);
+
irq = pmu->irq;
if (irq)
ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
-- 
2.7.4




[PATCH v5 4/4] perf/smmuv3_pmu: Enable HiSilicon Erratum 162001800 quirk

2018-11-30 Thread Shameer Kolothum
HiSilicon erratum 162001800 describes a limitation of the
SMMUv3 PMCG implementation on HiSilicon Hip08 platforms.

On these platforms, the PMCG event counter registers
(SMMU_PMCG_EVCNTRn) are read only and, as a result, it
is not possible to set the initial counter period value
when an event monitor is started.

To work around this, the current value of the counter
is read and used for delta calculations. OEM information
from the ACPI header is used to identify the affected
hardware platforms.
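
A minimal sketch of the delta accounting this implies (names are
illustrative, not taken from the patch): since the counter cannot be
preset, each update takes the difference from the last value read,
masked to the implemented counter width so a single wrap is still
accounted correctly.

#include <linux/types.h>

/* Illustrative only: accumulate a free-running, read-only counter. */
static void smmu_pmu_delta_example(u64 *total, u64 *prev, u64 now,
				   u64 counter_mask)
{
	u64 delta = (now - *prev) & counter_mask;	/* tolerates one wrap */

	*prev = now;
	*total += delta;
}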

Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 30 +++---
 drivers/perf/arm_smmuv3_pmu.c | 35 +--
 include/linux/acpi_iort.h |  3 +++
 3 files changed, 59 insertions(+), 9 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 2da08e1..d174379 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -1364,6 +1364,22 @@ static void __init 
arm_smmu_v3_pmcg_init_resources(struct resource *res,
   ACPI_EDGE_SENSITIVE, [2]);
 }
 
+static struct acpi_platform_list pmcg_evcntr_rdonly_list[] __initdata = {
+   /* HiSilicon Erratum 162001800 */
+   {"HISI  ", "HIP08   ", 0, ACPI_SIG_IORT, greater_than_or_equal},
+   { }
+};
+
+static int __init arm_smmu_v3_pmcg_add_platdata(struct platform_device *pdev)
+{
+   u32 options = 0;
+
+   if (acpi_match_platform_list(pmcg_evcntr_rdonly_list) >= 0)
+   options |= IORT_PMCG_EVCNTR_RDONLY;
+
+   return platform_device_add_data(pdev, , sizeof(options));
+}
+
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
@@ -1374,6 +1390,7 @@ struct iort_dev_config {
 struct acpi_iort_node *node);
void (*dev_set_proximity)(struct device *dev,
struct acpi_iort_node *node);
+   int (*dev_add_platdata)(struct platform_device *pdev);
 };
 
 static const struct iort_dev_config iort_arm_smmu_v3_cfg __initconst = {
@@ -1395,6 +1412,7 @@ static const struct iort_dev_config 
iort_arm_smmu_v3_pmcg_cfg __initconst = {
.name = "arm-smmu-v3-pmu",
.dev_count_resources = arm_smmu_v3_pmcg_count_resources,
.dev_init_resources = arm_smmu_v3_pmcg_init_resources,
+   .dev_add_platdata   = arm_smmu_v3_pmcg_add_platdata,
 };
 
 static __init const struct iort_dev_config *iort_get_dev_cfg(
@@ -1455,10 +1473,16 @@ static int __init iort_add_platform_device(struct 
acpi_iort_node *node,
goto dev_put;
 
/*
-* Add a copy of IORT node pointer to platform_data to
-* be used to retrieve IORT data information.
+* Platform devices based on PMCG nodes uses platform_data to
+* pass quirk flags to the driver. For others, add a copy of
+* IORT node pointer to platform_data to be used to retrieve
+* IORT data information.
 */
-   ret = platform_device_add_data(pdev, , sizeof(node));
+   if (ops->dev_add_platdata)
+   ret = ops->dev_add_platdata(pdev);
+   else
+   ret = platform_device_add_data(pdev, , sizeof(node));
+
if (ret)
goto dev_put;
 
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 71d10a0..02107a1 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -35,6 +35,7 @@
  */
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -111,6 +112,7 @@ struct smmu_pmu {
struct device *dev;
void __iomem *reg_base;
void __iomem *reloc_base;
+   u32 options;
u64 counter_present_mask;
u64 counter_mask;
 };
@@ -224,12 +226,25 @@ static void smmu_pmu_set_period(struct smmu_pmu *smmu_pmu,
u32 idx = hwc->idx;
u64 new;
 
-   /*
-* We limit the max period to half the max counter value of the counter
-* size, so that even in the case of extreme interrupt latency the
-* counter will (hopefully) not wrap past its initial value.
-*/
-   new = smmu_pmu->counter_mask >> 1;
+   if (smmu_pmu->options & IORT_PMCG_EVCNTR_RDONLY) {
+   /*
+* On platforms that require this quirk, if the counter starts
+* at < half_counter value and wraps, the current logic of
+* handling the overflow may not work. It is expected that,
+* those platforms will have full 64 counter bits implemented
+* so that such a possibility is remote(eg: HiSilicon HIP08).
+*/
+   new = smmu_pmu_counter_get_value(smmu_pmu, idx);
+   } else {
+   /*
+* We limit the max period to half the max counter value
+* of the counter size, so that even in the case of extreme
+* interrupt latency the 

[PATCH v4 1/4] acpi: arm64: add iort support for PMCG

2018-10-16 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMUv3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 97 +--
 1 file changed, 76 insertions(+), 21 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 08f26db..c44d8f6 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct 
acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node 
*node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1216,14 +1219,23 @@ static void __init arm_smmu_v3_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_v3_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_v3_dma_configure(struct device *dev,
+struct acpi_iort_node *node)
 {
struct acpi_iort_smmu_v3 *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMUv3 specific data */
smmu = (struct acpi_iort_smmu_v3 *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE;
+   attr = (smmu->flags & ACPI_IORT_SMMU_V3_COHACC_OVERRIDE) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for all SMMUv3 set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
 }
 
 #if defined(CONFIG_ACPI_NUMA)
@@ -1299,20 +1311,64 @@ static void __init arm_smmu_init_resources(struct 
resource *res,
}
 }
 
-static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
+static void __init arm_smmu_dma_configure(struct device *dev,
+ struct acpi_iort_node *node)
 {
struct acpi_iort_smmu *smmu;
+   enum dev_dma_attr attr;
 
/* Retrieve SMMU specific data */
smmu = (struct acpi_iort_smmu *)node->node_data;
 
-   return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
+   attr = (smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK) ?
+   DEV_DMA_COHERENT : DEV_DMA_NON_COHERENT;
+
+   /* We expect the dma masks to be equivalent for SMMU set-ups */
+   dev->dma_mask = >coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmcg_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, [2]);
 }
 
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
-   bool (*dev_is_coherent)(struct acpi_iort_node *node);
+   void (*dev_dma_configure)(struct device *dev,
+   struct acpi_iort_node *node);
int (*dev_count_resources)(struct acpi_iort_node *node);
void (*dev_init_resources)(struct resource *res,
 stru

[PATCH v4 0/4] arm64 SMMUv3 PMU driver with IORT support

2018-10-16 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder[1]

SMMUv3 PMCG devices are named smmuv3_pmcg_<phys_addr_page>,
where <phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff88840000 is named smmuv3_pmcg_ff88840.

Usage example:
For common arch supported events:
perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a netperf

For IMP DEF events:
perf stat -e smmuv3_pmcg_ff88840/event=id/ -a netperf

This is sanity tested on a HiSilicon platform that requires a quirk
to run properly. As per HiSilicon erratum #162001800, the PMCG event
counter registers (SMMU_PMCG_EVCNTRn) on HiSilicon Hip08 platforms
are read only, and this prevents the software from setting the
initial period on event start. Unfortunately we were a bit late in
the cycle to detect this issue and now require a software workaround.
Patch #4 is added to this series to provide that workaround.

Further testing on supported platforms is very much welcome.

v3 --> v4

-Addressed comments from Jean and Robin.
-Merged dma config callbacks as per Lorenzo's comments(patch #1).
-Added handling of Global(Counter0) filter settings mode(patch #2).
-Added patch #4 to address HiSilicon erratum  #162001800
-
v2 --> v3

-Addressed comments from Robin.
-Removed iort helper function to retrieve the PMCG reference smmu.
-PMCG devices are now named using the base address

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named the PMUs
  to make the association visible to the user.
- Added MSI support  for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html

Neil Leeder (2):
  acpi: arm64: add iort support for PMCG
  perf: add arm64 smmuv3 pmu driver

Shameer Kolothum (2):
  perf/smmuv3: Add MSI irq support
  perf/smmuv3_pmu: Enable HiSilicon Erratum 162001800 quirk

 drivers/acpi/arm64/iort.c |  97 -
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 959 ++
 4 files changed, 1045 insertions(+), 21 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




[PATCH v4 3/4] perf/smmuv3: Add MSI irq support

2018-10-16 Thread Shameer Kolothum
This adds support for an MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
---
 drivers/perf/arm_smmuv3_pmu.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index e30b939..d927ef8 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -68,6 +68,7 @@
 #define SMMU_PMCG_OVSSET0   0xCC0
 #define SMMU_PMCG_CFGR  0xE00
 #define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
+#define SMMU_PMCG_CFGR_MSI  BIT(21)
 #define SMMU_PMCG_CFGR_SID_FILTER_TYPE  BIT(23)
 #define SMMU_PMCG_CFGR_SIZE_MASKGENMASK(13, 8)
 #define SMMU_PMCG_CFGR_NCTR_MASKGENMASK(5, 0)
@@ -78,6 +79,12 @@
 #define SMMU_PMCG_IRQ_CTRL  0xE50
 #define SMMU_PMCG_IRQ_CTRL_IRQENBIT(0)
 #define SMMU_PMCG_IRQ_CFG0  0xE58
+#define SMMU_PMCG_IRQ_CFG1  0xE60
+#define SMMU_PMCG_IRQ_CFG2  0xE64
+
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
 
 #define SMMU_DEFAULT_FILTER_SPAN1
 #define SMMU_DEFAULT_FILTER_STREAM_ID   GENMASK(31, 0)
@@ -587,11 +594,62 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void 
*data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+  pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
 static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
 {
unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
int irq, ret = -ENXIO;
 
+   smmu_pmu_setup_msi(pmu);
+
irq = pmu->irq;
if (irq)
ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
-- 
2.7.4




[PATCH v4 4/4] perf/smmuv3_pmu: Enable HiSilicon Erratum 162001800 quirk

2018-10-16 Thread Shameer Kolothum
HiSilicon erratum 162001800 describes a limitation of the
SMMUv3 PMCG implementation on HiSilicon Hip08 platforms.

On these platforms, the PMCG event counter registers
(SMMU_PMCG_EVCNTRn) are read only and as a result it is
not possible to set the initial counter period value on
event monitor start.

To work around this, the current value of the counter is
read and used for delta calculations. This increases the
possibility of reporting incorrect values if a counter
overflow happens and the counter passes the initial value.

OEM information from the ACPI table header is used to
identify the affected hardware platforms.
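
The failure mode is easier to see with a small user-space model of a
free-running, read-only 32-bit counter (an illustration of the
arithmetic only, not the driver code): the start value can only be
sampled, and if more than a full 2^32 period elapses before the next
read, the modulo-2^32 delta silently drops the wrapped period.

#include <stdio.h>
#include <stdint.h>

/* Model of a read-only event counter: we can only sample the low 32 bits */
static uint32_t read_counter(uint64_t true_events)
{
        return (uint32_t)true_events;   /* hardware counter wraps at 2^32 */
}

int main(void)
{
        uint64_t at_start = 0x10;
        /* More than one full 2^32 period passes before the next update */
        uint64_t at_read  = at_start + 0x100000030ULL;

        uint32_t prev  = read_counter(at_start);        /* sampled, not programmed */
        uint32_t now   = read_counter(at_read);
        uint32_t delta = now - prev;                    /* modulo-2^32 delta */

        printf("reported delta: %u, actual events: %llu\n",
               delta, (unsigned long long)(at_read - at_start));

        return 0;
}

With a writable counter the driver can instead start from half the
maximum period, leaving a half-period of headroom before the counter
can wrap past its initial value.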

Signed-off-by: Shameer Kolothum 
---
 drivers/perf/arm_smmuv3_pmu.c | 137 +++---
 1 file changed, 130 insertions(+), 7 deletions(-)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index d927ef8..519545e 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -96,6 +96,8 @@
 
 #define SMMU_PA_SHIFT   12
 
+#define SMMU_PMU_OPT_EVCNTR_RDONLY (1 << 0)
+
 static int cpuhp_state_num;
 
 struct smmu_pmu {
@@ -111,10 +113,55 @@ struct smmu_pmu {
struct device *dev;
void __iomem *reg_base;
void __iomem *reloc_base;
+   u32 options;
u64 counter_present_mask;
u64 counter_mask;
 };
 
+struct erratum_acpi_oem_info {
+   char oem_id[ACPI_OEM_ID_SIZE + 1];
+   char oem_table_id[ACPI_OEM_TABLE_ID_SIZE + 1];
+   u32 oem_revision;
+};
+
+static struct erratum_acpi_oem_info hisi_162001800_oem_info[] = {
+   /*
+* Note that trailing spaces are required to properly match
+* the OEM table information.
+*/
+   {
+   .oem_id = "HISI  ",
+   .oem_table_id   = "HIP08   ",
+   .oem_revision   = 0,
+   },
+   { /* Sentinel indicating the end of the OEM array */ },
+};
+
+enum smmu_pmu_erratum_match_type {
+   se_match_acpi_oem,
+};
+
+void hisi_erratum_evcntr_rdonly(struct smmu_pmu *smmu_pmu)
+{
+   smmu_pmu->options |= SMMU_PMU_OPT_EVCNTR_RDONLY;
+}
+
+struct smmu_pmu_erratum_wa {
+   enum smmu_pmu_erratum_match_type match_type;
+   const void *id; /* Indicate the Erratum ID */
+   const char *desc_str;
+   void (*enable)(struct smmu_pmu *smmu_pmu);
+};
+
+static const struct smmu_pmu_erratum_wa smmu_pmu_wa[] = {
+   {
+   .match_type = se_match_acpi_oem,
+   .id = hisi_162001800_oem_info,
+   .desc_str = "HiSilicon erratum 162001800",
+   .enable = hisi_erratum_evcntr_rdonly,
+   },
+};
+
 #define to_smmu_pmu(p) (container_of(p, struct smmu_pmu, pmu))
 
 #define SMMU_PMU_EVENT_ATTR_EXTRACTOR(_name, _config, _start, _end)\
@@ -224,15 +271,20 @@ static void smmu_pmu_set_period(struct smmu_pmu *smmu_pmu,
u32 idx = hwc->idx;
u64 new;
 
-   /*
-* We limit the max period to half the max counter value of the counter
-* size, so that even in the case of extreme interrupt latency the
-* counter will (hopefully) not wrap past its initial value.
-*/
-   new = smmu_pmu->counter_mask >> 1;
+   if (smmu_pmu->options & SMMU_PMU_OPT_EVCNTR_RDONLY) {
+   new = smmu_pmu_counter_get_value(smmu_pmu, idx);
+   } else {
+   /*
+* We limit the max period to half the max counter value
+* of the counter size, so that even in the case of extreme
+* interrupt latency the counter will (hopefully) not wrap
+* past its initial value.
+*/
+   new = smmu_pmu->counter_mask >> 1;
+   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
+   }
 
 local64_set(&hwc->prev_count, new);
-   smmu_pmu_counter_set_value(smmu_pmu, idx, new);
 }
 
 static void smmu_pmu_get_event_filter(struct perf_event *event, u32 *span,
@@ -670,6 +722,69 @@ static void smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
   smmu_pmu->reloc_base + SMMU_PMCG_OVSCLR0);
 }
 
+typedef bool (*se_match_fn_t)(const struct smmu_pmu_erratum_wa *,
+ const void *);
+
+bool smmu_pmu_check_acpi_erratum(const struct smmu_pmu_erratum_wa *wa,
+   const void *arg)
+{
+   static const struct erratum_acpi_oem_info empty_oem_info = {};
+   const struct erratum_acpi_oem_info *info = wa->id;
+   const struct acpi_table_header *hdr = arg;
+
+   /* Iterate over the ACPI OEM info array, looking for a match */
+   while (memcmp(info, &empty_oem_info, sizeof(*info))) {
+   if (!memcmp(info->oem_id, hdr->oem_id, ACPI_OEM_ID_SIZE) &&
+   !memcmp(info->oem_table_id, hdr->oem_table_id, ACPI_OEM_TABLE_ID_SIZE) &&
+   info->oem_revision == hdr->oem_revision)
+ 


[PATCH v4 2/4] perf: add arm64 smmuv3 pmu driver

2018-10-16 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMUv3 PMU and adds it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
<phys_addr_page> is the physical page address of the SMMU PMCG
wrapped to a 4K boundary. For example, the PMCG at 0xff88840000 is
named smmuv3_pmcg_ff88840.

Filtering by stream id is done by specifying filtering parameters
with the event. options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against

Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a netperf

Applies filter pattern 0x42 to transaction events, which means events
matching stream IDs 0x42 and 0x43 are counted, as only the upper
StreamID bits are required to match the given filter. Further
filtering information is available in the SMMU documentation.
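
As a worked example of the span matching (the decode helper below is
illustrative, not the driver's code): with filter_span=1 the low bits
of the programmed value encode how many StreamID bits are ignored, a
single zero bit optionally preceded by ones, so 0x42 (0b1000010)
ignores only bit 0 and therefore counts stream IDs 0x42 and 0x43.

#include <stdio.h>
#include <stdint.h>

/*
 * Illustrative decode of a span-match filter value: the number of
 * ignored low bits Y is "trailing ones + 1", since the programmed
 * value has bit (Y - 1) clear and bits (Y - 2):0 set.
 */
static unsigned int span_ignored_bits(uint32_t filter)
{
        unsigned int ones = 0;

        while (ones < 31 && (filter & (1u << ones)))    /* count trailing ones */
                ones++;
        return ones + 1;                                /* plus the single zero bit */
}

static int span_match(uint32_t filter, uint32_t stream_id)
{
        unsigned int y = span_ignored_bits(filter);

        return (stream_id >> y) == (filter >> y);       /* upper bits must match */
}

int main(void)
{
        uint32_t filter = 0x42;         /* as in the example above */
        uint32_t id;

        for (id = 0x40; id <= 0x45; id++)
                printf("stream id 0x%x: %s\n", id,
                       span_match(filter, id) ? "counted" : "filtered out");
        return 0;
}

Programming 0x45 (0b1000101) instead would ignore the two low bits and
count stream IDs 0x44 through 0x47.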

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 778 ++
 3 files changed, 788 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 08ebaf7..c5deb8a 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMU_V3_PMU
+bool "ARM SMMUv3 Performance Monitors Extension"
+depends on (ARM64 && ACPI && ARM_SMMU_V3) || COMPILE_TEST
+  help
+  Provides support for the SMMU version 3 performance monitor unit (PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b3902bd..f10a932 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..e30b939
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,778 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as smmuv3_pmcg_ where
+ *  is the physical page address of the SMMU PMCG wrapped
+ * to 4K boundary. For example, the PMCG at 0xff8884 is named
+ * smmuv3_pmcg_ff88840
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ *
+ * To match a partial StreamID where the X most-significant bits must match
+ * but the Y least-significant bits might differ, STREAMID is programmed
+ * with a value that contains:
+ *  STREAMID[Y - 1] == 0.
+ *  STREAMID[Y - 2:0] == 1 (where Y > 1).
+ * The remainder of implemented bits of STREAMID (X bits, from bit Y upwards)
+ * contain  a value to match from the corresponding bits of event StreamID.
+ *
+ * Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
+ *filter_span=1,filter_stream_id=0x42/ -a netperf
+ * Applies filter pattern 0x42 to transaction events, which means events
+ * matching stream ids 0x42 and 0x43 are counted. Further filtering
+ * information is available in the SMMU documentation.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_SID_SPAN_SHIFT29
+#de


[PATCH v3 0/3] arm64 SMMUv3 PMU driver with IORT support

2018-09-21 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder [1].

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page>,
where <phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff88840000 is named smmuv3_pmcg_ff88840.

Usage example:
For common arch supported events:
perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a pwd

For IMP DEF events:
perf stat -e smmuv3_pmcg_ff88840/event=<id>/ -a pwd

Sanity tested on a HiSilicon platform. Further testing on supported
platforms is very much welcome.

v2 --> v3

-Addressed comments from Robin.
-Removed iort helper function to retrieve the PMCG reference smmu.
-PMCG devices are now named using the base address

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named PMUs
  to make the association visible to the user.
- Added MSI support for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html

Neil Leeder (2):
  acpi: arm64: add iort support for PMCG
  perf: add arm64 smmuv3 pmu driver

Shameer Kolothum (1):
  perf/smmuv3: Add MSI irq support

 drivers/acpi/arm64/iort.c |  78 -
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 794 ++
 4 files changed, 870 insertions(+), 12 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




[PATCH v3 3/3] perf/smmuv3: Add MSI irq support

2018-09-21 Thread Shameer Kolothum
This adds support for an MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
---
 drivers/perf/arm_smmuv3_pmu.c | 58 +++
 1 file changed, 58 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 2fa6c96..84f7907 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -57,6 +57,7 @@
 #define SMMU_PMCG_OVSSET0   0xCC0
 #define SMMU_PMCG_CFGR  0xE00
 #define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
+#define SMMU_PMCG_CFGR_MSI  BIT(21)
 #define SMMU_PMCG_CFGR_SIZE_MASK    GENMASK(13, 8)
 #define SMMU_PMCG_CFGR_NCTR_MASK    GENMASK(5, 0)
 #define SMMU_PMCG_CR                0xE04
@@ -66,6 +67,12 @@
 #define SMMU_PMCG_IRQ_CTRL  0xE50
 #define SMMU_PMCG_IRQ_CTRL_IRQEN    BIT(0)
 #define SMMU_PMCG_IRQ_CFG0  0xE58
+#define SMMU_PMCG_IRQ_CFG1  0xE60
+#define SMMU_PMCG_IRQ_CFG2  0xE64
+
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
 
 #define SMMU_DEFAULT_FILTER_SPAN    1
 #define SMMU_DEFAULT_FILTER_STREAM_ID   GENMASK(31, 0)
@@ -548,11 +555,62 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void *data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+  pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
 static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
 {
unsigned long flags = IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD;
int irq, ret = -ENXIO;
 
+   smmu_pmu_setup_msi(pmu);
+
irq = pmu->irq;
if (irq)
ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
-- 
2.7.4




[PATCH v3 1/3] acpi: arm64: add iort support for PMCG

2018-09-21 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMUv3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 78 +++
 1 file changed, 66 insertions(+), 12 deletions(-)

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 08f26db..b979c86 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node *node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1309,6 +1312,50 @@ static bool __init arm_smmu_is_coherent(struct acpi_iort_node *node)
return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
 }
 
+static void __init arm_smmu_common_dma_configure(struct device *dev,
+   enum dev_dma_attr attr)
+{
+   /* We expect the dma masks to be equivalent for all SMMUs set-ups */
+   dev->dma_mask = &dev->coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmcg_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmcg_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, &res[2]);
+}
+
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
@@ -1318,6 +1365,8 @@ struct iort_dev_config {
 struct acpi_iort_node *node);
void (*dev_set_proximity)(struct device *dev,
struct acpi_iort_node *node);
+   void (*dev_dma_configure)(struct device *dev,
+   enum dev_dma_attr attr);
 };
 
 static const struct iort_dev_config iort_arm_smmu_v3_cfg __initconst = {
@@ -1326,23 +1375,34 @@ static const struct iort_dev_config iort_arm_smmu_v3_cfg __initconst = {
.dev_count_resources = arm_smmu_v3_count_resources,
.dev_init_resources = arm_smmu_v3_init_resources,
.dev_set_proximity = arm_smmu_v3_set_proximity,
+   .dev_dma_configure = arm_smmu_common_dma_configure,
 };
 
 static const struct iort_dev_config iort_arm_smmu_cfg __initconst = {
.name = "arm-smmu",
.dev_is_coherent = arm_smmu_is_coherent,
.dev_count_resources = arm_smmu_count_resources,
-   .dev_init_resources = arm_smmu_init_resources
+   .dev_init_resources = arm_smmu_init_resources,
+   .dev_dma_configure = arm_smmu_common_dma_configure,
+};
+
+static const struct iort_dev_config iort_arm_smmu_v3_pmcg_cfg __initconst = {
+   .name = "arm-smmu-v3-pmu",
+   .dev_count_resources = arm_smmu_v3_pmcg_count_resources,
+   .dev_init_resources = arm_smmu_v3_pmcg_init_resources,
 };
 
 static __init const struct iort_dev_config *iort_get_dev_cfg(
struct acpi_iort_node *node)
 {
+
switch (node->type) {
case ACPI_IORT_NODE_SMMU_V3:
 return &iort_arm_smmu_v3_cfg;
case ACPI_IORT_NODE_SMMU:
   


[PATCH v3 2/3] perf: add arm64 smmuv3 pmu driver

2018-09-21 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMUv3 PMU and adds it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as smmuv3_pmcg_<phys_addr_page> where
<phys_addr_page> is the physical page address of the SMMU PMCG.
For example, the PMCG at 0xff88840000 is named smmuv3_pmcg_ff88840.

Filtering by stream id is done by specifying filtering parameters
with the event. options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against
Further filtering information is available in the SMMU documentation.

Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a pwd
Applies filter pattern 0x42 to transaction events.

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 736 ++
 3 files changed, 746 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 08ebaf7..34969dd 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMU_V3_PMU
+bool "ARM SMMUv3 Performance Monitors {Extension}"
+depends on ARM64 && ACPI && ARM_SMMU_V3
+  help
+  Provides support for the SMMU version 3 performance monitor unit (PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b3902bd..f10a932 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMU_V3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..2fa6c96
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,736 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as smmuv3_pmcg_ where
+ *  is the physical page address of the SMMU PMCG.
+ * For example, the PMCG at 0xff8884 is named smmuv3_pmcg_ff88840
+
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ * Further filtering information is available in the SMMU documentation.
+ *
+ * Example: perf stat -e smmuv3_pmcg_ff88840/transaction,filter_enable=1,
+ *   filter_span=1,filter_stream_id=0x42/ -a pwd
+ * Applies filter pattern 0x42 to transaction events.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_SID_SPAN_SHIFT29
+#define SMMU_PMCG_SMR0  0xA00
+#define SMMU_PMCG_SMR(n)(SMMU_PMCG_SMR0 + (n) * 4)
+#define SMMU_PMCG_CNTENSET0 0xC00
+#define SMMU_PMCG_CNTENCLR0 0xC20
+#define SMMU_PMCG_INTENSET0 0xC40
+#define SMMU_PMCG_INTENCLR0 0xC60
+#define SMMU_PMCG_OVSCLR0   0xC80
+#define SMMU_PMCG_OVSSET0   0xCC0
+#define SMMU_PMCG_CFGR  0xE00
+#define SMMU_PMCG_CFGR_RELOC_CTRS   BIT(20)
+#define SMMU_PMCG_CFGR_SIZE_MASKGENMASK(13, 8)
+#define SMMU_PMCG_CFGR_NCTR_MASKGENMASK(5, 0)
+#define SMMU_PMCG_CR0xE04
+#define SMMU_PMCG_CR_ENABLE BIT(0)
+#


[PATCH v2 3/4] perf: add arm64 smmuv3 pmu driver

2018-07-24 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMU v3 PMU and adds it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as arm_smmu_v3_x_pmcg_y, where x
denotes the associated smmuv3 dev id (if any) and y denotes the
pmu dev id.

Filtering by stream id is done by specifying filtering parameters
with the event. options are:
   filter_enable- 0 = no filtering, 1 = filtering enabled
   filter_span  - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against
Further filtering information is available in the SMMU documentation.

Example: perf stat -e arm_smmu_v3_0_pmcg_6/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a pwd
Applies filter pattern 0x42 to transaction events.

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 838 ++
 3 files changed, 848 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 08ebaf7..0b9cc1a 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMUV3_PMU
+bool "ARM SMMUv3 PMU"
+depends on ARM64 && ACPI
+  help
+  Provides support for the SMMU version 3 performance monitor unit (PMU)
+  on ARM-based systems.
+  Adds the SMMU PMU into the perf events subsystem for
+  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b3902bd..b3ae48d 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMUV3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..b3dc394
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,838 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as arm_smmu_v3.x_pmcg.y where x
+ * denotes the associated smmuv3 dev id and y denotes the pmu dev id.
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. options are:
+ *   filter_enable- 0 = no filtering, 1 = filtering enabled
+ *   filter_span  - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ * Further filtering information is available in the SMMU documentation.
+ *
+ * Example: perf stat -e arm_smmu_v3.0_pmcg.6/transaction,filter_enable=1,
+ *   filter_span=1,filter_stream_id=0x42/ -a pwd
+ * Applies filter pattern 0x42 to transaction events.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_EVTYPER_SEC_SID_SHIFT   30
+#define SMMU_PMCG_EVTYPER_SID_SPAN_SHIFT  29
+#define SMMU_PMCG_EVTYPER_EVENT_MASK  GENMASK(15, 0)
+#define SMMU_PMCG_SVR0  0x600
+#define SMMU_PMCG_SVR(n, stride)(SMMU_PMCG_SVR0 + (n) * (stride))
+#define SMMU_PMCG_SMR0  

[PATCH v2 4/4] perf/smmuv3: Add MSI irq support

2018-07-24 Thread Shameer Kolothum
This adds support for an MSI-based counter overflow interrupt.

Signed-off-by: Shameer Kolothum 
---
 drivers/perf/arm_smmuv3_pmu.c | 105 +-
 1 file changed, 84 insertions(+), 21 deletions(-)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index b3dc394..ca69813 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -94,6 +94,10 @@
 #define SMMU_PMCG_IRQ_CFG2  0xE64
 #define SMMU_PMCG_IRQ_STATUS0xE68
 
+/* MSI config fields */
+#define MSI_CFG0_ADDR_MASK  GENMASK_ULL(51, 2)
+#define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
+
 #define SMMU_COUNTER_RELOAD BIT(31)
 #define SMMU_DEFAULT_FILTER_SEC 0
 #define SMMU_DEFAULT_FILTER_SPAN1
@@ -657,14 +661,89 @@ static irqreturn_t smmu_pmu_handle_irq(int irq_num, void *data)
return IRQ_HANDLED;
 }
 
+static void smmu_pmu_free_msis(void *data)
+{
+   struct device *dev = data;
+
+   platform_msi_domain_free_irqs(dev);
+}
+
+static void smmu_pmu_write_msi_msg(struct msi_desc *desc, struct msi_msg *msg)
+{
+   phys_addr_t doorbell;
+   struct device *dev = msi_desc_to_dev(desc);
+   struct smmu_pmu *pmu = dev_get_drvdata(dev);
+
+   doorbell = (((u64)msg->address_hi) << 32) | msg->address_lo;
+   doorbell &= MSI_CFG0_ADDR_MASK;
+
+   writeq_relaxed(doorbell, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+   writel_relaxed(msg->data, pmu->reg_base + SMMU_PMCG_IRQ_CFG1);
+   writel_relaxed(MSI_CFG2_MEMATTR_DEVICE_nGnRE,
+   pmu->reg_base + SMMU_PMCG_IRQ_CFG2);
+}
+
+static void smmu_pmu_setup_msi(struct smmu_pmu *pmu)
+{
+   struct msi_desc *desc;
+   struct device *dev = pmu->dev;
+   int ret;
+
+   /* Clear MSI address reg */
+   writeq_relaxed(0, pmu->reg_base + SMMU_PMCG_IRQ_CFG0);
+
+   /* MSI supported or not */
+   if (!(readl(pmu->reg_base + SMMU_PMCG_CFGR) & SMMU_PMCG_CFGR_MSI))
+   return;
+
+   ret = platform_msi_domain_alloc_irqs(dev, 1, smmu_pmu_write_msi_msg);
+   if (ret) {
+   dev_warn(dev, "failed to allocate MSIs\n");
+   return;
+   }
+
+   desc = first_msi_entry(dev);
+   if (desc)
+   pmu->irq = desc->irq;
+
+   /* Add callback to free MSIs on teardown */
+   devm_add_action(dev, smmu_pmu_free_msis, dev);
+}
+
+static int smmu_pmu_setup_irq(struct smmu_pmu *pmu)
+{
+   int irq, ret = -ENXIO;
+
+   smmu_pmu_setup_msi(pmu);
+
+   irq = pmu->irq;
+   if (irq)
+   ret = devm_request_irq(pmu->dev, irq, smmu_pmu_handle_irq,
+  IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD,
+  "smmu-v3-pmu", pmu);
+   return ret;
+}
+
 static int smmu_pmu_reset(struct smmu_pmu *smmu_pmu)
 {
+   int ret;
+
/* Disable counter and interrupt */
writeq(smmu_pmu->counter_present_mask,
smmu_pmu->reg_base + SMMU_PMCG_CNTENCLR0);
writeq(smmu_pmu->counter_present_mask,
smmu_pmu->reg_base + SMMU_PMCG_INTENCLR0);
 
+   ret = smmu_pmu_setup_irq(smmu_pmu);
+   if (ret) {
+   dev_err(smmu_pmu->dev, "failed to setup irqs\n");
+   return ret;
+   }
+
+   /* Pick one CPU to be the preferred one to use */
+   smmu_pmu->on_cpu = smp_processor_id();
+   WARN_ON(irq_set_affinity(smmu_pmu->irq, cpumask_of(smmu_pmu->on_cpu)));
+
smmu_pmu_disable(&smmu_pmu->pmu);
return 0;
 }
@@ -738,26 +817,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
}
 
irq = platform_get_irq(pdev, 0);
-   if (irq < 0) {
-   dev_err(dev, "Failed to get valid irq for smmu @%pa\n",
-   _resource_0->start);
-   return irq;
-   }
-
-   err = devm_request_irq(dev, irq, smmu_pmu_handle_irq,
-  IRQF_NOBALANCING | IRQF_SHARED | IRQF_NO_THREAD,
-  "smmu-pmu", smmu_pmu);
-   if (err) {
-   dev_err(dev,
-   "Unable to request IRQ%d for SMMU PMU counters\n", irq);
-   return err;
-   }
-
-   smmu_pmu->irq = irq;
-
-   /* Pick one CPU to be the preferred one to use */
-   smmu_pmu->on_cpu = smp_processor_id();
-   WARN_ON(irq_set_affinity(smmu_pmu->irq, cpumask_of(smmu_pmu->on_cpu)));
+   if (irq > 0)
+   smmu_pmu->irq = irq;
 
smmu_pmu->num_counters = get_num_counters(smmu_pmu);
smmu_pmu->counter_present_mask = GENMASK(smmu_pmu->num_counters - 1, 0);
@@ -765,7 +826,9 @@ static int smmu_pmu_probe(struct platform_device *pdev)
SMMU_PMCG_CFGR_SIZE_MASK) >> SMMU_PMCG_CFGR_SIZE_SH

[PATCH v2 3/4] perf: add arm64 smmuv3 pmu driver

2018-07-24 Thread Shameer Kolothum
From: Neil Leeder 

Adds a new driver to support the SMMU v3 PMU and add it into the
perf events framework.

Each SMMU node may have multiple PMUs associated with it, each of
which may support different events.

SMMUv3 PMCG devices are named as arm_smmu_v3_x_pmcg_y, where x
denotes the associated smmuv3 dev id (if any) and y denotes the
pmu dev id.

Filtering by stream id is done by specifying filtering parameters
with the event. The options are:
   filter_enable    - 0 = no filtering, 1 = filtering enabled
   filter_span      - 0 = exact match, 1 = pattern match
   filter_stream_id - pattern to filter against
Further filtering information is available in the SMMU documentation.

Example: perf stat -e arm_smmu_v3_0_pmcg_6/transaction,filter_enable=1,
   filter_span=1,filter_stream_id=0x42/ -a pwd
Applies filter pattern 0x42 to transaction events.

SMMU events are not attributable to a CPU, so task mode and sampling
are not supported.

Signed-off-by: Neil Leeder 
Signed-off-by: Shameer Kolothum 
---
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 838 ++
 3 files changed, 848 insertions(+)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c
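
For illustration only (not part of this patch): the filter options above
end up programmed into the per-counter EVTYPER and SMR registers whose
offsets are #defined in the new driver. A minimal sketch of that mapping,
assuming a SMMU_PMCG_SMR(n) helper analogous to SMMU_PMCG_EVTYPER(n) and
the driver's struct smmu_pmu with its reg_base mapping; names not present
in the patch are assumptions:

static void smmu_pmu_apply_filter(struct smmu_pmu *smmu_pmu, int idx,
				  u16 event, bool span, u32 stream_id)
{
	/* event number in bits [15:0], SID_SPAN selects pattern matching */
	u32 evtyper = (event & SMMU_PMCG_EVTYPER_EVENT_MASK) |
		      (span ? BIT(SMMU_PMCG_EVTYPER_SID_SPAN_SHIFT) : 0);

	writel(evtyper, smmu_pmu->reg_base + SMMU_PMCG_EVTYPER(idx));
	/* filter_stream_id becomes the stream match value for this counter */
	writel(stream_id, smmu_pmu->reg_base + SMMU_PMCG_SMR(idx));
}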

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 08ebaf7..0b9cc1a 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -52,6 +52,15 @@ config ARM_PMU_ACPI
depends on ARM_PMU && ACPI
def_bool y
 
+config ARM_SMMUV3_PMU
+	bool "ARM SMMUv3 PMU"
+	depends on ARM64 && ACPI
+	help
+	  Provides support for the SMMU version 3 performance monitor unit
+	  (PMU) on ARM-based systems.
+	  Adds the SMMU PMU into the perf events subsystem for
+	  monitoring SMMU performance events.
+
 config ARM_DSU_PMU
tristate "ARM DynamIQ Shared Unit (DSU) PMU"
depends on ARM64
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index b3902bd..b3ae48d 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -4,6 +4,7 @@ obj-$(CONFIG_ARM_CCN) += arm-ccn.o
 obj-$(CONFIG_ARM_DSU_PMU) += arm_dsu_pmu.o
 obj-$(CONFIG_ARM_PMU) += arm_pmu.o arm_pmu_platform.o
 obj-$(CONFIG_ARM_PMU_ACPI) += arm_pmu_acpi.o
+obj-$(CONFIG_ARM_SMMUV3_PMU) += arm_smmuv3_pmu.o
 obj-$(CONFIG_HISI_PMU) += hisilicon/
 obj-$(CONFIG_QCOM_L2_PMU)  += qcom_l2_pmu.o
 obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o
diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
new file mode 100644
index 000..b3dc394
--- /dev/null
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -0,0 +1,838 @@
+// SPDX-License-Identifier: GPL-2.0+
+/* Copyright (c) 2017 The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+/*
+ * This driver adds support for perf events to use the Performance
+ * Monitor Counter Groups (PMCG) associated with an SMMUv3 node
+ * to monitor that node.
+ *
+ * SMMUv3 PMCG devices are named as arm_smmu_v3.x_pmcg.y where x
+ * denotes the associated smmuv3 dev id and y denotes the pmu dev id.
+ *
+ * Filtering by stream id is done by specifying filtering parameters
+ * with the event. The options are:
+ *   filter_enable    - 0 = no filtering, 1 = filtering enabled
+ *   filter_span      - 0 = exact match, 1 = pattern match
+ *   filter_stream_id - pattern to filter against
+ * Further filtering information is available in the SMMU documentation.
+ *
+ * Example: perf stat -e arm_smmu_v3.0_pmcg.6/transaction,filter_enable=1,
+ *   filter_span=1,filter_stream_id=0x42/ -a pwd
+ * Applies filter pattern 0x42 to transaction events.
+ *
+ * SMMU events are not attributable to a CPU, so task mode and sampling
+ * are not supported.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define SMMU_PMCG_EVCNTR0   0x0
+#define SMMU_PMCG_EVCNTR(n, stride) (SMMU_PMCG_EVCNTR0 + (n) * (stride))
+#define SMMU_PMCG_EVTYPER0  0x400
+#define SMMU_PMCG_EVTYPER(n)(SMMU_PMCG_EVTYPER0 + (n) * 4)
+#define SMMU_PMCG_EVTYPER_SEC_SID_SHIFT   30
+#define SMMU_PMCG_EVTYPER_SID_SPAN_SHIFT  29
+#define SMMU_PMCG_EVTYPER_EVENT_MASK  GENMASK(15, 0)
+#define SMMU_PMCG_SVR0  0x600
+#define SMMU_PMCG_SVR(n, stride)(SMMU_PMCG_SVR0 + (n) * (stride))
+#define SMMU_PMCG_SMR0  


[PATCH v2 1/4] acpi: arm64: add iort support for PMCG

2018-07-24 Thread Shameer Kolothum
From: Neil Leeder 

Add support for the SMMU Performance Monitor Counter Group
information from ACPI. This is in preparation for its use
in the SMMU v3 PMU driver.

Signed-off-by: Neil Leeder 
Signed-off-by: Hanjun Guo 
Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 95 +--
 1 file changed, 83 insertions(+), 12 deletions(-)
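
For illustration only (not part of this patch): once the PMCG node is
exposed as a platform device, the PMU driver in patch 3/4 can pick up
the two 4K register pages and the optional overflow interrupt with the
standard platform device helpers. The function name below is made up for
this sketch; the resource ordering follows arm_smmu_v3_pmu_init_resources()
in this patch:

static int smmu_pmcg_get_resources(struct platform_device *pdev)
{
	struct resource *page0 = platform_get_resource(pdev, IORESOURCE_MEM, 0);
	struct resource *page1 = platform_get_resource(pdev, IORESOURCE_MEM, 1);
	int irq = platform_get_irq(pdev, 0);	/* overflow GSIV, optional */

	if (!page0 || !page1)
		return -ENODEV;

	/* irq < 0 only means the PMCG has no wired overflow interrupt */
	return irq < 0 ? 0 : irq;
}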

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 7a3a541..ac4d0d6 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -356,7 +356,8 @@ static struct acpi_iort_node *iort_node_get_id(struct 
acpi_iort_node *node,
if (map->flags & ACPI_IORT_ID_SINGLE_MAPPING) {
if (node->type == ACPI_IORT_NODE_NAMED_COMPONENT ||
node->type == ACPI_IORT_NODE_PCI_ROOT_COMPLEX ||
-   node->type == ACPI_IORT_NODE_SMMU_V3) {
+   node->type == ACPI_IORT_NODE_SMMU_V3 ||
+   node->type == ACPI_IORT_NODE_PMCG) {
*id_out = map->output_base;
return parent;
}
@@ -394,6 +395,8 @@ static int iort_get_id_mapping_index(struct acpi_iort_node 
*node)
}
 
return smmu->id_mapping_index;
+   case ACPI_IORT_NODE_PMCG:
+   return 0;
default:
return -EINVAL;
}
@@ -1287,6 +1290,63 @@ static bool __init arm_smmu_is_coherent(struct 
acpi_iort_node *node)
return smmu->flags & ACPI_IORT_SMMU_COHERENT_WALK;
 }
 
+static void __init arm_smmu_common_dma_configure(struct device *dev,
+   enum dev_dma_attr attr)
+{
+   /* We expect the dma masks to be equivalent for all SMMUs set-ups */
+   dev->dma_mask = &dev->coherent_dma_mask;
+
+   /* Configure DMA for the page table walker */
+   acpi_dma_configure(dev, attr);
+}
+
+static int __init arm_smmu_v3_pmu_count_resources(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   /*
+* There are always 2 memory resources.
+* If the overflow_gsiv is present then add that for a total of 3.
+*/
+   return pmcg->overflow_gsiv > 0 ? 3 : 2;
+}
+
+static void __init arm_smmu_v3_pmu_init_resources(struct resource *res,
+  struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+
+   res[0].start = pmcg->page0_base_address;
+   res[0].end = pmcg->page0_base_address + SZ_4K - 1;
+   res[0].flags = IORESOURCE_MEM;
+   res[1].start = pmcg->page1_base_address;
+   res[1].end = pmcg->page1_base_address + SZ_4K - 1;
+   res[1].flags = IORESOURCE_MEM;
+
+   if (pmcg->overflow_gsiv)
+   acpi_iort_register_irq(pmcg->overflow_gsiv, "overflow",
+  ACPI_EDGE_SENSITIVE, &res[2]);
+}
+
+static struct acpi_iort_node *iort_find_pmcg_ref(struct acpi_iort_node *node)
+{
+   struct acpi_iort_pmcg *pmcg;
+   struct acpi_iort_node *ref_node = NULL;
+
+   /* Retrieve PMCG specific data */
+   pmcg = (struct acpi_iort_pmcg *)node->node_data;
+   if (pmcg->node_reference)
+   ref_node = ACPI_ADD_PTR(struct acpi_iort_node,
+   iort_table,  pmcg->node_reference);
+   return ref_node;
+}
+
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
@@ -1296,6 +1356,8 @@ struct iort_dev_config {
 struct acpi_iort_node *node);
void (*dev_set_proximity)(struct device *dev,
struct acpi_iort_node *node);
+   void (*dev_dma_configure)(struct device *dev,
+   enum dev_dma_attr attr);
 };
 
 static const struct iort_dev_config iort_arm_smmu_v3_cfg __initconst = {
@@ -1304,23 +1366,38 @@ static const struct iort_dev_config 
iort_arm_smmu_v3_cfg __initconst = {
.dev_count_resources = arm_smmu_v3_count_resources,
.dev_init_resources = arm_smmu_v3_init_resources,
.dev_set_proximity = arm_smmu_v3_set_proximity,
+   .dev_dma_configure = arm_smmu_common_dma_configure
 };
 
 static const struct iort_dev_config iort_arm_smmu_cfg __initconst = {
.name = "arm-smmu",
.dev_is_coherent = arm_smmu_is_coherent,
.dev_count_resources = arm_smmu_count_resources,
-   .dev_init_resources = arm_smmu_init_resources
+   .dev_init_resources = arm_smmu_init_resources,
+   .dev_dma_configure = arm_smmu_common_dma_configure
+};
+
+static const struct iort_dev_config iort_


[PATCH v2 2/4] acpi: arm64: iort helper to find the associated smmu of pmcg node

2018-07-24 Thread Shameer Kolothum
This adds a helper to retrieve the smmuv3 dev (if any) associated
with the PMCG node. This will be used in a subsequent SMMUv3 PMU
driver patch to name the pmu device.

Signed-off-by: Shameer Kolothum 
---
 drivers/acpi/arm64/iort.c | 84 ---
 include/linux/acpi_iort.h |  4 +++
 2 files changed, 69 insertions(+), 19 deletions(-)
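
For illustration only: a hedged sketch of how the PMU driver is expected
to consume this helper when naming the PMU device. Only
iort_find_pmcg_ref_smmu() comes from this patch; the name format and the
use of the platform device id below are assumptions made for the sketch:

static char *smmu_pmu_make_name(struct platform_device *pmcg_pdev, int pmu_id)
{
	struct device *smmu = iort_find_pmcg_ref_smmu(&pmcg_pdev->dev);

	if (smmu)	/* e.g. "arm_smmu_v3_0_pmcg_6" */
		return kasprintf(GFP_KERNEL, "arm_smmu_v3_%d_pmcg_%d",
				 to_platform_device(smmu)->id, pmu_id);

	/* PMCG not associated with any SMMUv3 node */
	return kasprintf(GFP_KERNEL, "arm_smmu_v3_pmcg_%d", pmu_id);
}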

diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index ac4d0d6..7940080 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -42,6 +42,7 @@ struct iort_fwnode {
struct list_head list;
struct acpi_iort_node *iort_node;
struct fwnode_handle *fwnode;
+   struct platform_device *pdev;
 };
 static LIST_HEAD(iort_fwnode_list);
 static DEFINE_SPINLOCK(iort_fwnode_lock);
@@ -52,12 +53,14 @@ static DEFINE_SPINLOCK(iort_fwnode_lock);
  *
  * @node: IORT table node associated with the IOMMU
  * @fwnode: fwnode associated with the IORT node
+ * @pdev: platform dev associated with the IORT node if any
  *
  * Returns: 0 on success
  *  <0 on failure
  */
 static inline int iort_set_fwnode(struct acpi_iort_node *iort_node,
- struct fwnode_handle *fwnode)
+ struct fwnode_handle *fwnode,
+ struct platform_device *pdev)
 {
struct iort_fwnode *np;
 
@@ -69,6 +72,7 @@ static inline int iort_set_fwnode(struct acpi_iort_node 
*iort_node,
INIT_LIST_HEAD(&np->list);
np->iort_node = iort_node;
np->fwnode = fwnode;
+   np->pdev = pdev;
 
spin_lock(&iort_fwnode_lock);
list_add_tail(&np->list, &iort_fwnode_list);
@@ -78,6 +82,31 @@ static inline int iort_set_fwnode(struct acpi_iort_node 
*iort_node,
 }
 
 /**
+ * iort_get_pdev() - Retrieve pdev associated with an IORT node
+ *
+ * @node: IORT table node to be looked-up
+ *
+ * Returns: platform dev pointer on success, NULL on failure
+ */
+static inline struct platform_device *iort_get_pdev(
+   struct acpi_iort_node *node)
+{
+   struct iort_fwnode *curr;
+   struct platform_device *pdev = NULL;
+
+   spin_lock(&iort_fwnode_lock);
+   list_for_each_entry(curr, &iort_fwnode_list, list) {
+   if (curr->iort_node == node) {
+   pdev = curr->pdev;
+   break;
+   }
+   }
+   spin_unlock(&iort_fwnode_lock);
+
+   return pdev;
+}
+
+/**
  * iort_get_fwnode() - Retrieve fwnode associated with an IORT node
  *
  * @node: IORT table node to be looked-up
@@ -1347,6 +1376,32 @@ static struct acpi_iort_node *iort_find_pmcg_ref(struct 
acpi_iort_node *node)
return ref_node;
 }
 
+/**
+ * iort_find_pmcg_ref_smmu - helper to retrieve SMMUv3 associated with PMCG
+ * @dev: PMCG device
+ *
+ * Returns: smmu dev associated with the PMCG on success, NULL on failure
+ */
+struct device *iort_find_pmcg_ref_smmu(struct device *dev)
+{
+   struct acpi_iort_node *node;
+   struct acpi_iort_node *ref_node = NULL;
+   struct platform_device *pdev = NULL;
+
+   node = iort_get_iort_node(dev->fwnode);
+   if (!node || node->type != ACPI_IORT_NODE_PMCG)
+   return NULL;
+
+   ref_node = iort_find_pmcg_ref(node);
+   if (ref_node && ref_node->type == ACPI_IORT_NODE_SMMU_V3)
+   pdev = iort_get_pdev(ref_node);
+
+   if (pdev)
+   return &pdev->dev;
+
+   return NULL;
+}
+
 struct iort_dev_config {
const char *name;
int (*dev_init)(struct acpi_iort_node *node);
@@ -1453,13 +1508,14 @@ static int __init iort_add_platform_device(struct 
acpi_iort_node *node,
if (ret)
goto dev_put;
 
-   fwnode = iort_get_fwnode(node);
-
+   fwnode = acpi_alloc_fwnode_static();
if (!fwnode) {
ret = -ENODEV;
goto dev_put;
}
 
+   iort_set_fwnode(node, fwnode, pdev);
+
pdev->dev.fwnode = fwnode;
 
if (ops->dev_dma_configure) {
@@ -1472,12 +1528,14 @@ static int __init iort_add_platform_device(struct 
acpi_iort_node *node,
 
ret = platform_device_add(pdev);
if (ret)
-   goto dma_deconfigure;
+   goto out;
 
return 0;
 
-dma_deconfigure:
+out:
acpi_dma_deconfigure(&pdev->dev);
+   iort_delete_fwnode(node);
+   acpi_free_fwnode_static(fwnode);
 dev_put:
platform_device_put(pdev);
 
@@ -1519,8 +1577,7 @@ static void __init iort_init_platform_devices(void)
 {
struct acpi_iort_node *iort_node, *iort_end;
struct acpi_table_iort *iort;
-   struct fwnode_handle *fwnode;
-   int i, ret;
+   int i;
bool acs_enabled = false;
const struct iort_dev_config *ops;
 
@@ -1547,18 +1604,7 @@ static void __init iort_init_platform_devices(void)
 
ops = iort_get_dev_cfg(iort_node);
if (ops) {
-   


[PATCH v2 0/4] arm64 SMMUv3 PMU driver with IORT support

2018-07-24 Thread Shameer Kolothum
This adds a driver for the SMMUv3 PMU into the perf framework.
It includes an IORT update to support PM Counter Groups.

This is based on the initial work done by Neil Leeder [1].

SMMUv3 PMCG devices are named as arm_smmu_v3_x_pmcg_y where x
denotes the associated smmuv3 dev id(if any) and y denotes the
pmu dev id.

Usage example:
For common arch supported events:
perf stat -e arm_smmu_v3_0_pmcg_6/transaction,filter_enable=1,
 filter_span=1,filter_stream_id=0x42/ -a pwd

For IMP DEF events:
perf stat -e arm_smmu_v3.0_pmcg.6/event=id/ -a pwd

Sanity tested on HiSilicon platform. Further testing on supported
platforms are very much welcome.

v1 --> v2

- Addressed comments from Robin.
- Added a helper to retrieve the associated smmu dev and named the PMUs
  to make the association visible to the user.
- Added MSI support for overflow irq

[1]https://www.spinics.net/lists/arm-kernel/msg598591.html

Neil Leeder (2):
  acpi: arm64: add iort support for PMCG
  perf: add arm64 smmuv3 pmu driver

Shameer Kolothum (2):
  acpi: arm64: iort helper to find the associated smmu of pmcg node
  perf/smmuv3: Add MSI irq support

 drivers/acpi/arm64/iort.c | 179 +++--
 drivers/perf/Kconfig  |   9 +
 drivers/perf/Makefile |   1 +
 drivers/perf/arm_smmuv3_pmu.c | 901 ++
 include/linux/acpi_iort.h |   4 +
 5 files changed, 1063 insertions(+), 31 deletions(-)
 create mode 100644 drivers/perf/arm_smmuv3_pmu.c

-- 
2.7.4




[PATCH v6 4/7] iommu/dma: Move PCI window region reservation back into dma specific path.

2018-04-18 Thread Shameer Kolothum
This pretty much reverts commit 273df9635385 ("iommu/dma: Make PCI
window reservation generic") by moving the PCI window region
reservation back into the dma specific path so that these regions
don't get exposed via the IOMMU API interface. With this change,
the vfio interface will report only iommu specific reserved regions
to the user space.

Cc: Joerg Roedel <j...@8bytes.org>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
Reviewed-by: Robin Murphy <robin.mur...@arm.com>
---
 drivers/iommu/dma-iommu.c | 54 ++-
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f05f3cf..ddcbbdb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
  * @list: Reserved region list from iommu_get_resv_regions()
  *
  * IOMMU drivers can use this to implement their .get_resv_regions callback
- * for general non-IOMMU-specific reservations. Currently, this covers host
- * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI
- * based ARM platforms that may require HW MSI reservation.
+ * for general non-IOMMU-specific reservations. Currently, this covers GICv3
+ * ITS region reservation on ACPI based ARM platforms that may require HW MSI
+ * reservation.
  */
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
-   struct pci_host_bridge *bridge;
-   struct resource_entry *window;
-
-   if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&
-   iort_iommu_msi_get_resv_regions(dev, list) < 0)
-   return;
-
-   if (!dev_is_pci(dev))
-   return;
-
-   bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
-   resource_list_for_each_entry(window, &bridge->windows) {
-   struct iommu_resv_region *region;
-   phys_addr_t start;
-   size_t length;
-
-   if (resource_type(window->res) != IORESOURCE_MEM)
-   continue;
 
-   start = window->res->start - window->offset;
-   length = window->res->end - window->res->start + 1;
-   region = iommu_alloc_resv_region(start, length, 0,
-   IOMMU_RESV_RESERVED);
-   if (!region)
-   return;
+   if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
+   iort_iommu_msi_get_resv_regions(dev, list);
 
-   list_add_tail(&region->list, list);
-   }
 }
 EXPORT_SYMBOL(iommu_dma_get_resv_regions);
 
@@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct 
iommu_dma_cookie *cookie,
return 0;
 }
 
+static void iova_reserve_pci_windows(struct pci_dev *dev,
+   struct iova_domain *iovad)
+{
+   struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
+   struct resource_entry *window;
+   unsigned long lo, hi;
+
+   resource_list_for_each_entry(window, &bridge->windows) {
+   if (resource_type(window->res) != IORESOURCE_MEM)
+   continue;
+
+   lo = iova_pfn(iovad, window->res->start - window->offset);
+   hi = iova_pfn(iovad, window->res->end - window->offset);
+   reserve_iova(iovad, lo, hi);
+   }
+}
+
 static int iova_reserve_iommu_regions(struct device *dev,
struct iommu_domain *domain)
 {
@@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev,
LIST_HEAD(resv_regions);
int ret = 0;
 
+   if (dev_is_pci(dev))
+   iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+
iommu_get_resv_regions(dev, &resv_regions);
list_for_each_entry(region, &resv_regions, list) {
unsigned long lo, hi;
-- 
2.7.4




[PATCH v6 1/7] vfio/type1: Introduce iova list and add iommu aperture validity check

2018-04-18 Thread Shameer Kolothum
This introduces an iova list that is valid for dma mappings. Make
sure the new iommu aperture window doesn't conflict with the current
one or with any existing dma mappings during attach.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio_iommu_type1.c | 183 +++-
 1 file changed, 180 insertions(+), 3 deletions(-)
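
To make the aperture handling below easier to follow, here is a small
self-contained model (plain userspace C, not the kernel code): when a
new, possibly smaller aperture [start, end] is applied, ranges that fall
completely outside it are dropped and the boundary ranges are clamped,
which is what vfio_iommu_aper_resize() does to the linked list:

#include <stdio.h>

struct range { unsigned long long start, end; };

static int aper_resize(struct range *r, int n,
		       unsigned long long start, unsigned long long end)
{
	int i, out = 0;

	for (i = 0; i < n; i++) {
		if (r[i].end < start || r[i].start > end)
			continue;		/* fully outside: drop */
		r[out].start = r[i].start < start ? start : r[i].start;
		r[out].end = r[i].end > end ? end : r[i].end;
		out++;
	}
	return out;				/* remaining valid ranges */
}

int main(void)
{
	struct range r[] = { { 0x0, 0xFFFF }, { 0x20000, 0x3FFFF } };
	int i, n = aper_resize(r, 2, 0x1000, 0x2FFFF);

	for (i = 0; i < n; i++)
		printf("0x%llx - 0x%llx\n", r[i].start, r[i].end);
	/* prints 0x1000 - 0xFFFF and 0x20000 - 0x2FFFF */
	return 0;
}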

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 5c212bf..775946d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
struct list_head    domain_list;
+   struct list_head    iova_list;
struct vfio_domain  *external_domain; /* domain for external user */
struct mutex        lock;
struct rb_root  dma_list;
@@ -92,6 +93,12 @@ struct vfio_group {
struct list_headnext;
 };
 
+struct vfio_iova {
+   struct list_head    list;
+   dma_addr_t  start;
+   dma_addr_t  end;
+};
+
 /*
  * Guest RAM pinning working set or DMA target
  */
@@ -1313,6 +1320,149 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group 
*group, phys_addr_t *base)
return ret;
 }
 
+/*
+ * This is a helper function to insert an address range to iova list.
+ * The list starts with a single entry corresponding to the IOMMU
+ * domain geometry to which the device group is attached. The list
+ * aperture gets modified when a new domain is added to the container
+ * if the new aperture doesn't conflict with the current one or with
+ * any existing dma mappings. The list is also modified to exclude
+ * any reserved regions associated with the device group.
+ */
+static int vfio_iommu_iova_insert(struct list_head *head,
+ dma_addr_t start, dma_addr_t end)
+{
+   struct vfio_iova *region;
+
+   region = kmalloc(sizeof(*region), GFP_KERNEL);
+   if (!region)
+   return -ENOMEM;
+
+   INIT_LIST_HEAD(&region->list);
+   region->start = start;
+   region->end = end;
+
+   list_add_tail(&region->list, head);
+   return 0;
+}
+
+/*
+ * Check the new iommu aperture conflicts with existing aper or with any
+ * existing dma mappings.
+ */
+static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
+dma_addr_t start, dma_addr_t end)
+{
+   struct vfio_iova *first, *last;
+   struct list_head *iova = &iommu->iova_list;
+
+   if (list_empty(iova))
+   return false;
+
+   /* Disjoint sets, return conflict */
+   first = list_first_entry(iova, struct vfio_iova, list);
+   last = list_last_entry(iova, struct vfio_iova, list);
+   if ((start > last->end) || (end < first->start))
+   return true;
+
+   /* Check for any existing dma mappings outside the new start */
+   if (start > first->start) {
+   if (vfio_find_dma(iommu, first->start, start - first->start))
+   return true;
+   }
+
+   /* Check for any existing dma mappings outside the new end */
+   if (end < last->end) {
+   if (vfio_find_dma(iommu, end + 1, last->end - end))
+   return true;
+   }
+
+   return false;
+}
+
+/*
+ * Resize iommu iova aperture window. This is called only if the new
+ * aperture has no conflict with existing aperture and dma mappings.
+ */
+static int vfio_iommu_aper_resize(struct list_head *iova,
+ dma_addr_t start,
+ dma_addr_t end)
+{
+   struct vfio_iova *node, *next;
+
+   if (list_empty(iova))
+   return vfio_iommu_iova_insert(iova, start, end);
+
+   /* Adjust iova list start */
+   list_for_each_entry_safe(node, next, iova, list) {
+   if (start < node->start)
+   break;
+   if ((start >= node->start) && (start < node->end)) {
+   node->start = start;
+   break;
+   }
+   /* Delete nodes before new start */
+   list_del(&node->list);
+   kfree(node);
+   }
+
+   /* Adjust iova list end */
+   list_for_each_entry_safe(node, next, iova, list) {
+   if (end > node->end)
+   continue;
+   if ((end > node->start) && (end <= node->end)) {
+   node->end = end;
+   continue;
+   }
+   /* Delete nodes after new end */
+   list_del(&node->list);
+   kfree(node);
+   }
+
+   return 0;
+}
+
+static void vfio_iommu_iova_free(struct list_head *iova)
+{
+   struct vfio_iova *n, *next;
+
+   list_for_each_entry_safe(n, next, iova, list) {
+   

[PATCH v6 0/7] vfio/type1: Add support for valid iova list management

2018-04-18 Thread Shameer Kolothum
This series introduces an iova list associated with a vfio
iommu. The list is kept up to date, taking into account iommu
apertures and reserved regions. The series also adds checks for any
conflict with existing dma mappings whenever a new device group is
attached to the domain.

User-space can retrieve valid iova ranges using VFIO_IOMMU_GET_INFO
ioctl capability chains. Any dma map request outside the valid iova
range will be rejected.
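
For reference, a hedged userspace sketch of consuming that capability
chain. The VFIO_IOMMU_INFO_CAPS flag, the cap_offset field and the
IOVA-range capability itself are introduced by patch 6/7, so take the
exact names and payload layout from the uapi header added there; the
rest uses the long-standing vfio_info_cap_header chaining:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void dump_iommu_caps(int container_fd)
{
	struct vfio_iommu_type1_info *info;
	struct vfio_info_cap_header *hdr;
	size_t argsz = sizeof(*info);
	void *tmp;

	info = calloc(1, argsz);
	if (!info)
		return;
	info->argsz = argsz;

	/* First call reports how much capability data follows */
	if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info))
		goto out;
	if (info->argsz > argsz) {
		argsz = info->argsz;
		tmp = realloc(info, argsz);
		if (!tmp)
			goto out;
		info = tmp;
		memset(info, 0, argsz);
		info->argsz = argsz;
		if (ioctl(container_fd, VFIO_IOMMU_GET_INFO, info))
			goto out;
	}

	if (!(info->flags & VFIO_IOMMU_INFO_CAPS) || !info->cap_offset)
		goto out;

	hdr = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
	for (;;) {
		printf("iommu info capability id %u version %u\n",
		       hdr->id, hdr->version);
		/* the IOVA-range capability carries the valid {start, end} pairs */
		if (!hdr->next)
			break;
		hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
	}
out:
	free(info);
}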

v5 --> v6

 -Rebased to 4.17-rc1
 -Changed the ordering such that the previous patch #7 ("iommu/dma: Move
  PCI window region reservation back...") is now patch #4. This
  will avoid the bisection issues pointed out by Alex.
 -Added Robin's Reviewed-by tag for patch #4

v4 --> v5
Rebased to next-20180315.
 
 -Incorporated the corner case bug fix suggested by Alex to patch #5.
 -Based on suggestions by Alex and Robin, added patch#7. This
  moves the PCI window  reservation back in to DMA specific path.
  This is to fix the issue reported by Eric[1].

Note:
The patch #7 has dependency with [2][3]

1. https://patchwork.kernel.org/patch/10232043/
2. https://patchwork.kernel.org/patch/10216553/
3. https://patchwork.kernel.org/patch/10216555/

v3 --> v4
 Addressed comments received for v3.
 -dma_addr_t instead of phys_addr_t
 -LIST_HEAD() usage.
 -Free up iova_copy list in case of error.
 -updated logic in filling the iova caps info(patch #5)

RFCv2 --> v3
 Removed RFC tag.
 Addressed comments from Alex and Eric:
 - Added comments to make iova list management logic more clear.
 - Use of iova list copy so that original is not altered in
   case of failure.

RFCv1 --> RFCv2
 Addressed comments from Alex:
-Introduced IOVA list management and added checks for conflicts with 
 existing dma map entries during attach/detach.

Shameer Kolothum (7):
  vfio/type1: Introduce iova list and add iommu aperture validity check
  vfio/type1: Check reserve region conflict and update iova list
  vfio/type1: Update iova list on detach
  iommu/dma: Move PCI window region reservation back into dma specific
path.
  vfio/type1: check dma map request is within a valid iova range
  vfio/type1: Add IOVA range capability support
  vfio/type1: remove duplicate retrieval of reserved regions

 drivers/iommu/dma-iommu.c   |  54 ++---
 drivers/vfio/vfio_iommu_type1.c | 497 +++-
 include/uapi/linux/vfio.h   |  23 ++
 3 files changed, 533 insertions(+), 41 deletions(-)

-- 
2.7.4




[PATCH v6 3/7] vfio/type1: Update iova list on detach

2018-04-18 Thread Shameer Kolothum
Get a copy of iova list on _group_detach and try to update the list.
On success replace the current one with the copy. Leave the list as
it is if update fails.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 91 +
 1 file changed, 91 insertions(+)
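
Condensed sketch of the flow the hunks below add to
vfio_iommu_type1_detach_group() (illustrative only; the aperture-expand
step and locking are omitted, and the helpers come from the earlier
patches in this series):

static void detach_update_iova_list(struct vfio_iommu *iommu)
{
	LIST_HEAD(iova_copy);

	if (vfio_iommu_iova_get_copy(iommu, &iova_copy))
		return;				/* keep the current list as-is */

	if (vfio_iommu_resv_refresh(iommu, &iova_copy))
		vfio_iommu_iova_free(&iova_copy);	/* update failed, drop copy */
	else
		vfio_iommu_iova_insert_copy(iommu, &iova_copy);	/* swap in */
}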

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index a0a79e1..6fd6841 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1776,12 +1776,88 @@ static void vfio_sanity_check_pfn_list(struct 
vfio_iommu *iommu)
WARN_ON(iommu->notifier.head);
 }
 
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+  struct list_head *iova_copy)
+{
+   struct vfio_domain *domain;
+   struct iommu_domain_geometry geo;
+   struct vfio_iova *node;
+   dma_addr_t start = 0;
+   dma_addr_t end = (dma_addr_t)~0;
+
+   list_for_each_entry(domain, &iommu->domain_list, next) {
+   iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+  &geo);
+   if (geo.aperture_start > start)
+   start = geo.aperture_start;
+   if (geo.aperture_end < end)
+   end = geo.aperture_end;
+   }
+
+   /* Modify aperture limits. The new aper is either same or bigger */
+   node = list_first_entry(iova_copy, struct vfio_iova, list);
+   node->start = start;
+   node = list_last_entry(iova_copy, struct vfio_iova, list);
+   node->end = end;
+}
+
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+  struct list_head *iova_copy)
+{
+   struct vfio_domain *d;
+   struct vfio_group *g;
+   struct vfio_iova *node;
+   dma_addr_t start, end;
+   LIST_HEAD(resv_regions);
+   int ret;
+
+   list_for_each_entry(d, &iommu->domain_list, next) {
+   list_for_each_entry(g, &d->group_list, next)
+   iommu_get_group_resv_regions(g->iommu_group,
+  &resv_regions);
+   }
+
+   if (list_empty(&resv_regions))
+   return 0;
+
+   node = list_first_entry(iova_copy, struct vfio_iova, list);
+   start = node->start;
+   node = list_last_entry(iova_copy, struct vfio_iova, list);
+   end = node->end;
+
+   /* purge the iova list and create new one */
+   vfio_iommu_iova_free(iova_copy);
+
+   ret = vfio_iommu_aper_resize(iova_copy, start, end);
+   if (ret)
+   goto done;
+
+   /* Exclude current reserved regions from iova ranges */
+   ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
+done:
+   vfio_iommu_resv_free(&resv_regions);
+   return ret;
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
  struct iommu_group *iommu_group)
 {
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
+   bool iova_copy_fail;
+   LIST_HEAD(iova_copy);
 
mutex_lock(&iommu->lock);
 
@@ -1804,6 +1880,12 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
}
}
 
+   /*
+* Get a copy of iova list. If success, use copy to update the
+* list and to replace the current one.
+*/
+   iova_copy_fail = !!vfio_iommu_iova_get_copy(iommu, &iova_copy);
+
list_for_each_entry(domain, &iommu->domain_list, next) {
group = find_iommu_group(domain, iommu_group);
if (!group)
@@ -1829,10 +1911,19 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
iommu_domain_free(domain->domain);
list_del(&domain->next);
kfree(domain);
+   if (!iova_copy_fail && !list_empty(&iommu->domain_list))
+   vfio_iommu_aper_expand(iommu, &iova_copy);
}
break;
}
 
+   if (!iova_copy_fail && !list_empty(&iommu->domain_list)) {
+   if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
+   vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+   else
+   vfio_iommu_iova_free(&iova_copy);
+   }
+
 detach_group_done:
mutex_unlock(&iommu->lock);
 }
-- 
2.7.4




[PATCH v6 5/7] vfio/type1: check dma map request is within a valid iova range

2018-04-18 Thread Shameer Kolothum
This checks and rejects any dma map request outside the valid iova
range.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 6fd6841..bf33281 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1091,6 +1091,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma,
return ret;
 }
 
+/*
+ * Check dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+   dma_addr_t start, dma_addr_t end)
+{
+   struct list_head *iova = >iova_list;
+   struct vfio_iova *node;
+
+   list_for_each_entry(node, iova, list) {
+   if ((start >= node->start) && (end <= node->end))
+   return true;
+   }
+
+   return false;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
   struct vfio_iommu_type1_dma_map *map)
 {
@@ -1129,6 +1146,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
 
+   if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
ret = -ENOMEM;
-- 
2.7.4
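
A minimal user-space sketch (not part of the patch) of the behaviour this
change enforces. It assumes 'container' is an open VFIO container fd with a
group already attached and 'buf' is a page-aligned, page-sized allocation;
the IOVA value passed by the caller is purely illustrative.

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int map_one_page(int container, void *buf, unsigned long long iova)
{
	struct vfio_iommu_type1_dma_map map;

	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (unsigned long long)(uintptr_t)buf;
	map.iova  = iova;
	map.size  = 4096;

	/*
	 * With this patch applied, the kernel walks iommu->iova_list and
	 * fails the ioctl with errno == EINVAL if [iova, iova + size - 1]
	 * is not fully contained in one of the valid ranges, e.g. if it
	 * overlaps a reserved MSI window.
	 */
	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}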




[PATCH v6 7/7] vfio/type1: remove duplicate retrieval of reserved regions

2018-04-18 Thread Shameer Kolothum
As we already have the reserved regions list, just pass it into
vfio_iommu_has_sw_msi().

Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 44d0f13d..13c631a 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1313,15 +1313,13 @@ static struct vfio_group *find_iommu_group(struct 
vfio_domain *domain,
return NULL;
 }
 
-static bool vfio_iommu_has_sw_msi(struct iommu_group *group, phys_addr_t *base)
+static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
+   phys_addr_t *base)
 {
-   struct list_head group_resv_regions;
-   struct iommu_resv_region *region, *next;
+   struct iommu_resv_region *region;
bool ret = false;
 
-   INIT_LIST_HEAD(_resv_regions);
-   iommu_get_group_resv_regions(group, _resv_regions);
-   list_for_each_entry(region, _resv_regions, list) {
+   list_for_each_entry(region, group_resv_regions, list) {
/*
 * The presence of any 'real' MSI regions should take
 * precedence over the software-managed one if the
@@ -1337,8 +1335,7 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group 
*group, phys_addr_t *base)
ret = true;
}
}
-   list_for_each_entry_safe(region, next, _resv_regions, list)
-   kfree(region);
+
return ret;
 }
 
@@ -1673,7 +1670,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (ret)
goto out_detach;
 
-   resv_msi = vfio_iommu_has_sw_msi(iommu_group, _msi_base);
+   resv_msi = vfio_iommu_has_sw_msi(_resv_regions, _msi_base);
 
INIT_LIST_HEAD(>group_list);
list_add(>next, >group_list);
-- 
2.7.4




[PATCH v6 2/7] vfio/type1: Check reserve region conflict and update iova list

2018-04-18 Thread Shameer Kolothum
This retrieves the reserved regions associated with the device group
and checks for conflicts with any existing dma mappings. It also
updates the iova list to exclude the reserved regions.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio_iommu_type1.c | 90 +
 1 file changed, 90 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 775946d..a0a79e1 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1422,6 +1422,82 @@ static int vfio_iommu_aper_resize(struct list_head *iova,
return 0;
 }
 
+/*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
+   struct list_head *resv_regions)
+{
+   struct iommu_resv_region *region;
+
+   /* Check for conflict with existing dma mappings */
+   list_for_each_entry(region, resv_regions, list) {
+   if (vfio_find_dma(iommu, region->start, region->length))
+   return true;
+   }
+
+   return false;
+}
+
+/*
+ * Check iova region overlap with reserved regions and
+ * exclude them from the iommu iova range
+ */
+static int vfio_iommu_resv_exclude(struct list_head *iova,
+   struct list_head *resv_regions)
+{
+   struct iommu_resv_region *resv;
+   struct vfio_iova *n, *next;
+
+   list_for_each_entry(resv, resv_regions, list) {
+   phys_addr_t start, end;
+
+   start = resv->start;
+   end = resv->start + resv->length - 1;
+
+   list_for_each_entry_safe(n, next, iova, list) {
+   int ret = 0;
+
+   /* No overlap */
+   if ((start > n->end) || (end < n->start))
+   continue;
+   /*
+* Insert a new node if the current node overlaps with the
+* reserved region, to exclude that part from the valid iova
+* range. Note that the new node is inserted before the
+* current node and finally the current node is deleted,
+* keeping the list updated and sorted.
+*/
+   if (start > n->start)
+   ret = vfio_iommu_iova_insert(>list,
+   n->start, start - 1);
+   if (!ret && end < n->end)
+   ret = vfio_iommu_iova_insert(>list,
+   end + 1, n->end);
+   if (ret)
+   return ret;
+
+   list_del(>list);
+   kfree(n);
+   }
+   }
+
+   if (list_empty(iova))
+   return -EINVAL;
+
+   return 0;
+}
+
+static void vfio_iommu_resv_free(struct list_head *resv_regions)
+{
+   struct iommu_resv_region *n, *next;
+
+   list_for_each_entry_safe(n, next, resv_regions, list) {
+   list_del(>list);
+   kfree(n);
+   }
+}
+
 static void vfio_iommu_iova_free(struct list_head *iova)
 {
struct vfio_iova *n, *next;
@@ -1475,6 +1551,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
phys_addr_t resv_msi_base;
struct iommu_domain_geometry geo;
LIST_HEAD(iova_copy);
+   LIST_HEAD(group_resv_regions);
 
mutex_lock(>lock);
 
@@ -1553,6 +1630,13 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
goto out_detach;
}
 
+   iommu_get_group_resv_regions(iommu_group, _resv_regions);
+
+   if (vfio_iommu_resv_conflict(iommu, _resv_regions)) {
+   ret = -EINVAL;
+   goto out_detach;
+   }
+
/* Get a copy of the current iova list and work on it */
ret = vfio_iommu_iova_get_copy(iommu, _copy);
if (ret)
@@ -1563,6 +1647,10 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
if (ret)
goto out_detach;
 
+   ret = vfio_iommu_resv_exclude(_copy, _resv_regions);
+   if (ret)
+   goto out_detach;
+
resv_msi = vfio_iommu_has_sw_msi(iommu_group, _msi_base);
 
INIT_LIST_HEAD(>group_list);
@@ -1623,6 +1711,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
/* Delete the old one and insert new iova list */
vfio_iommu_iova_insert_copy(iommu, _copy);
mutex_unlock(>lock);
+   vfio_iommu_resv_free(_resv_regions);
 
return 0;
 
@@ -1631,6 +1720,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 out_domain:
iommu_domain_free(domain->domain);
vfio_iommu_iova_free(_copy);
+   vfio_iommu_resv_free(_resv_regions);
 out_free:
kfree(domain);
kfree(group);
-- 
2.7.4
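
The node-splitting arithmetic in vfio_iommu_resv_exclude() is easier to see
with a concrete case. The standalone sketch below is plain C, not kernel
code: it applies the same per-node logic to a single valid range, and the
reserved window used in main() merely mirrors a typical SW MSI doorbell
region; the exact addresses are illustrative.

#include <stdio.h>

struct range { unsigned long long start, end; };

/*
 * Punch the reserved hole 'resv' out of the valid range 'node'.
 * Returns how many pieces survive (0, 1 or 2) and writes them to 'out',
 * which is what the kernel loop does per overlapping iova node.
 */
static int exclude_resv(struct range node, struct range resv,
			struct range out[2])
{
	int n = 0;

	/* No overlap: the node is kept untouched */
	if (resv.start > node.end || resv.end < node.start) {
		out[n++] = node;
		return n;
	}

	/* Piece below the reserved region, if any */
	if (resv.start > node.start)
		out[n++] = (struct range){ node.start, resv.start - 1 };

	/* Piece above the reserved region, if any */
	if (resv.end < node.end)
		out[n++] = (struct range){ resv.end + 1, node.end };

	return n;
}

int main(void)
{
	struct range node = { 0x0, 0xffffffffULL };
	struct range resv = { 0x8000000, 0x80fffff };
	struct range out[2];
	int i, n = exclude_resv(node, resv, out);

	for (i = 0; i < n; i++)
		printf("valid: 0x%llx - 0x%llx\n", out[i].start, out[i].end);
	return 0;
}

Running this prints two surviving ranges, 0x0-0x7ffffff and
0x8100000-0xffffffff, which is exactly the shape the iova list takes after
the reserved region is excluded.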




[PATCH v6 6/7] vfio/type1: Add IOVA range capability support

2018-04-18 Thread Shameer Kolothum
This allows user space to retrieve the supported IOVA range(s),
excluding any reserved regions. The implementation is based on
capability chains added to the VFIO_IOMMU_GET_INFO ioctl.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio_iommu_type1.c | 96 +
 include/uapi/linux/vfio.h   | 23 ++
 2 files changed, 119 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index bf33281..44d0f13d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -2038,6 +2038,68 @@ static int vfio_domains_have_iommu_cache(struct 
vfio_iommu *iommu)
return ret;
 }
 
+static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
+struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
+size_t size)
+{
+   struct vfio_info_cap_header *header;
+   struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
+
+   header = vfio_info_cap_add(caps, size,
+   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+   if (IS_ERR(header))
+   return PTR_ERR(header);
+
+   iova_cap = container_of(header,
+   struct vfio_iommu_type1_info_cap_iova_range, header);
+   iova_cap->nr_iovas = cap_iovas->nr_iovas;
+   memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
+   cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
+   return 0;
+}
+
+static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+   struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
+   struct vfio_iova *iova;
+   size_t size;
+   int iovas = 0, i = 0, ret;
+
+   mutex_lock(>lock);
+
+   list_for_each_entry(iova, >iova_list, list)
+   iovas++;
+
+   if (!iovas) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
+
+   cap_iovas = kzalloc(size, GFP_KERNEL);
+   if (!cap_iovas) {
+   ret = -ENOMEM;
+   goto out_unlock;
+   }
+
+   cap_iovas->nr_iovas = iovas;
+
+   list_for_each_entry(iova, >iova_list, list) {
+   cap_iovas->iova_ranges[i].start = iova->start;
+   cap_iovas->iova_ranges[i].end = iova->end;
+   i++;
+   }
+
+   ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
+
+   kfree(cap_iovas);
+out_unlock:
+   mutex_unlock(>lock);
+   return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -2059,19 +2121,53 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
}
} else if (cmd == VFIO_IOMMU_GET_INFO) {
struct vfio_iommu_type1_info info;
+   struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+   unsigned long capsz;
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 
+   /* For backward compatibility, cannot require this */
+   capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
if (copy_from_user(, (void __user *)arg, minsz))
return -EFAULT;
 
if (info.argsz < minsz)
return -EINVAL;
 
+   if (info.argsz >= capsz) {
+   minsz = capsz;
+   info.cap_offset = 0; /* output, no-recopy necessary */
+   }
+
info.flags = VFIO_IOMMU_INFO_PGSIZES;
 
info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
+   ret = vfio_iommu_iova_build_caps(iommu, );
+   if (ret)
+   return ret;
+
+   if (caps.size) {
+   info.flags |= VFIO_IOMMU_INFO_CAPS;
+
+   if (info.argsz < sizeof(info) + caps.size) {
+   info.argsz = sizeof(info) + caps.size;
+   } else {
+   vfio_info_cap_shift(, sizeof(info));
+   if (copy_to_user((void __user *)arg +
+   sizeof(info), caps.buf,
+   caps.size)) {
+   kfree(caps.buf);
+   return -EFAULT;
+   }
+   info.cap_offset = sizeof(info);
+   }
+
+   kfree(caps.buf);
+   }
+
return copy_to_user((void __user *)arg, , minsz) ?
-EFAULT : 0;
 
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
inde

[PATCH v5 2/7] vfio/type1: Check reserve region conflict and update iova list

2018-03-15 Thread Shameer Kolothum
This retrieves the reserved regions associated with the device group
and checks for conflicts with any existing dma mappings. It also
updates the iova list to exclude the reserved regions.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio_iommu_type1.c | 90 +
 1 file changed, 90 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 1123c74..cfe2bb2 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1313,6 +1313,82 @@ static int vfio_iommu_aper_resize(struct list_head *iova,
return 0;
 }
 
+/*
+ * Check reserved region conflicts with existing dma mappings
+ */
+static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
+   struct list_head *resv_regions)
+{
+   struct iommu_resv_region *region;
+
+   /* Check for conflict with existing dma mappings */
+   list_for_each_entry(region, resv_regions, list) {
+   if (vfio_find_dma(iommu, region->start, region->length))
+   return true;
+   }
+
+   return false;
+}
+
+/*
+ * Check iova region overlap with reserved regions and
+ * exclude them from the iommu iova range
+ */
+static int vfio_iommu_resv_exclude(struct list_head *iova,
+   struct list_head *resv_regions)
+{
+   struct iommu_resv_region *resv;
+   struct vfio_iova *n, *next;
+
+   list_for_each_entry(resv, resv_regions, list) {
+   phys_addr_t start, end;
+
+   start = resv->start;
+   end = resv->start + resv->length - 1;
+
+   list_for_each_entry_safe(n, next, iova, list) {
+   int ret = 0;
+
+   /* No overlap */
+   if ((start > n->end) || (end < n->start))
+   continue;
+   /*
+* Insert a new node if the current node overlaps with the
+* reserved region, to exclude that part from the valid iova
+* range. Note that the new node is inserted before the
+* current node and finally the current node is deleted,
+* keeping the list updated and sorted.
+*/
+   if (start > n->start)
+   ret = vfio_iommu_iova_insert(>list,
+   n->start, start - 1);
+   if (!ret && end < n->end)
+   ret = vfio_iommu_iova_insert(>list,
+   end + 1, n->end);
+   if (ret)
+   return ret;
+
+   list_del(>list);
+   kfree(n);
+   }
+   }
+
+   if (list_empty(iova))
+   return -EINVAL;
+
+   return 0;
+}
+
+static void vfio_iommu_resv_free(struct list_head *resv_regions)
+{
+   struct iommu_resv_region *n, *next;
+
+   list_for_each_entry_safe(n, next, resv_regions, list) {
+   list_del(>list);
+   kfree(n);
+   }
+}
+
 static void vfio_iommu_iova_free(struct list_head *iova)
 {
struct vfio_iova *n, *next;
@@ -1366,6 +1442,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
phys_addr_t resv_msi_base;
struct iommu_domain_geometry geo;
LIST_HEAD(iova_copy);
+   LIST_HEAD(group_resv_regions);
 
mutex_lock(>lock);
 
@@ -1444,6 +1521,13 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
goto out_detach;
}
 
+   iommu_get_group_resv_regions(iommu_group, _resv_regions);
+
+   if (vfio_iommu_resv_conflict(iommu, _resv_regions)) {
+   ret = -EINVAL;
+   goto out_detach;
+   }
+
/* Get a copy of the current iova list and work on it */
ret = vfio_iommu_iova_get_copy(iommu, _copy);
if (ret)
@@ -1454,6 +1538,10 @@ static int vfio_iommu_type1_attach_group(void 
*iommu_data,
if (ret)
goto out_detach;
 
+   ret = vfio_iommu_resv_exclude(_copy, _resv_regions);
+   if (ret)
+   goto out_detach;
+
resv_msi = vfio_iommu_has_sw_msi(iommu_group, _msi_base);
 
INIT_LIST_HEAD(>group_list);
@@ -1514,6 +1602,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
/* Delete the old one and insert new iova list */
vfio_iommu_iova_insert_copy(iommu, _copy);
mutex_unlock(>lock);
+   vfio_iommu_resv_free(_resv_regions);
 
return 0;
 
@@ -1522,6 +1611,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 out_domain:
iommu_domain_free(domain->domain);
vfio_iommu_iova_free(_copy);
+   vfio_iommu_resv_free(_resv_regions);
 out_free:
kfree(domain);
kfree(group);
-- 
2.7.4




[PATCH v5 4/7] vfio/type1: check dma map request is within a valid iova range

2018-03-15 Thread Shameer Kolothum
This checks and rejects any dma map request outside the valid iova
range.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 25e6920..d59db31 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -982,6 +982,23 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, 
struct vfio_dma *dma,
return ret;
 }
 
+/*
+ * Check dma map request is within a valid iova range
+ */
+static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
+   dma_addr_t start, dma_addr_t end)
+{
+   struct list_head *iova = >iova_list;
+   struct vfio_iova *node;
+
+   list_for_each_entry(node, iova, list) {
+   if ((start >= node->start) && (end <= node->end))
+   return true;
+   }
+
+   return false;
+}
+
 static int vfio_dma_do_map(struct vfio_iommu *iommu,
   struct vfio_iommu_type1_dma_map *map)
 {
@@ -1020,6 +1037,11 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
goto out_unlock;
}
 
+   if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
dma = kzalloc(sizeof(*dma), GFP_KERNEL);
if (!dma) {
ret = -ENOMEM;
-- 
2.7.4




[PATCH v5 1/7] vfio/type1: Introduce iova list and add iommu aperture validity check

2018-03-15 Thread Shameer Kolothum
This introduces an iova list that is valid for dma mappings. Make
sure the new iommu aperture window doesn't conflict with the current
one or with any existing dma mappings during attach.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio_iommu_type1.c | 183 +++-
 1 file changed, 180 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 45657e2..1123c74 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -60,6 +60,7 @@ MODULE_PARM_DESC(disable_hugepages,
 
 struct vfio_iommu {
struct list_headdomain_list;
+   struct list_headiova_list;
struct vfio_domain  *external_domain; /* domain for external user */
struct mutexlock;
struct rb_root  dma_list;
@@ -92,6 +93,12 @@ struct vfio_group {
struct list_headnext;
 };
 
+struct vfio_iova {
+   struct list_headlist;
+   dma_addr_t  start;
+   dma_addr_t  end;
+};
+
 /*
  * Guest RAM pinning working set or DMA target
  */
@@ -1204,6 +1211,149 @@ static bool vfio_iommu_has_sw_msi(struct iommu_group 
*group, phys_addr_t *base)
return ret;
 }
 
+/*
+ * This is a helper function to insert an address range to iova list.
+ * The list starts with a single entry corresponding to the IOMMU
+ * domain geometry to which the device group is attached. The list
+ * aperture gets modified when a new domain is added to the container
+ * if the new aperture doesn't conflict with the current one or with
+ * any existing dma mappings. The list is also modified to exclude
+ * any reserved regions associated with the device group.
+ */
+static int vfio_iommu_iova_insert(struct list_head *head,
+ dma_addr_t start, dma_addr_t end)
+{
+   struct vfio_iova *region;
+
+   region = kmalloc(sizeof(*region), GFP_KERNEL);
+   if (!region)
+   return -ENOMEM;
+
+   INIT_LIST_HEAD(>list);
+   region->start = start;
+   region->end = end;
+
+   list_add_tail(>list, head);
+   return 0;
+}
+
+/*
+ * Check the new iommu aperture conflicts with existing aper or with any
+ * existing dma mappings.
+ */
+static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
+dma_addr_t start, dma_addr_t end)
+{
+   struct vfio_iova *first, *last;
+   struct list_head *iova = >iova_list;
+
+   if (list_empty(iova))
+   return false;
+
+   /* Disjoint sets, return conflict */
+   first = list_first_entry(iova, struct vfio_iova, list);
+   last = list_last_entry(iova, struct vfio_iova, list);
+   if ((start > last->end) || (end < first->start))
+   return true;
+
+   /* Check for any existing dma mappings outside the new start */
+   if (start > first->start) {
+   if (vfio_find_dma(iommu, first->start, start - first->start))
+   return true;
+   }
+
+   /* Check for any existing dma mappings outside the new end */
+   if (end < last->end) {
+   if (vfio_find_dma(iommu, end + 1, last->end - end))
+   return true;
+   }
+
+   return false;
+}
+
+/*
+ * Resize iommu iova aperture window. This is called only if the new
+ * aperture has no conflict with existing aperture and dma mappings.
+ */
+static int vfio_iommu_aper_resize(struct list_head *iova,
+ dma_addr_t start,
+ dma_addr_t end)
+{
+   struct vfio_iova *node, *next;
+
+   if (list_empty(iova))
+   return vfio_iommu_iova_insert(iova, start, end);
+
+   /* Adjust iova list start */
+   list_for_each_entry_safe(node, next, iova, list) {
+   if (start < node->start)
+   break;
+   if ((start >= node->start) && (start < node->end)) {
+   node->start = start;
+   break;
+   }
+   /* Delete nodes before new start */
+   list_del(>list);
+   kfree(node);
+   }
+
+   /* Adjust iova list end */
+   list_for_each_entry_safe(node, next, iova, list) {
+   if (end > node->end)
+   continue;
+   if ((end > node->start) && (end <= node->end)) {
+   node->end = end;
+   continue;
+   }
+   /* Delete nodes after new end */
+   list_del(>list);
+   kfree(node);
+   }
+
+   return 0;
+}
+
+static void vfio_iommu_iova_free(struct list_head *iova)
+{
+   struct vfio_iova *n, *next;
+
+   list_for_each_entry_safe(n, next, iova, list) {
+   

[PATCH v5 7/7] iommu/dma: Move PCI window region reservation back into dma specific path.

2018-03-15 Thread Shameer Kolothum
This pretty much reverts commit 273df9635385 ("iommu/dma: Make PCI
window reservation generic") by moving the PCI window region
reservation back into the dma-specific path so that these regions
don't get exposed via the IOMMU API interface. With this change,
the vfio interface will report only iommu-specific reserved regions
to user space.

Cc: Robin Murphy <robin.mur...@arm.com>
Cc: Joerg Roedel <j...@8bytes.org>
Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/iommu/dma-iommu.c | 54 ++-
 1 file changed, 25 insertions(+), 29 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f05f3cf..ddcbbdb 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -167,40 +167,16 @@ EXPORT_SYMBOL(iommu_put_dma_cookie);
  * @list: Reserved region list from iommu_get_resv_regions()
  *
  * IOMMU drivers can use this to implement their .get_resv_regions callback
- * for general non-IOMMU-specific reservations. Currently, this covers host
- * bridge windows for PCI devices and GICv3 ITS region reservation on ACPI
- * based ARM platforms that may require HW MSI reservation.
+ * for general non-IOMMU-specific reservations. Currently, this covers GICv3
+ * ITS region reservation on ACPI based ARM platforms that may require HW MSI
+ * reservation.
  */
 void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list)
 {
-   struct pci_host_bridge *bridge;
-   struct resource_entry *window;
-
-   if (!is_of_node(dev->iommu_fwspec->iommu_fwnode) &&
-   iort_iommu_msi_get_resv_regions(dev, list) < 0)
-   return;
-
-   if (!dev_is_pci(dev))
-   return;
-
-   bridge = pci_find_host_bridge(to_pci_dev(dev)->bus);
-   resource_list_for_each_entry(window, >windows) {
-   struct iommu_resv_region *region;
-   phys_addr_t start;
-   size_t length;
-
-   if (resource_type(window->res) != IORESOURCE_MEM)
-   continue;
 
-   start = window->res->start - window->offset;
-   length = window->res->end - window->res->start + 1;
-   region = iommu_alloc_resv_region(start, length, 0,
-   IOMMU_RESV_RESERVED);
-   if (!region)
-   return;
+   if (!is_of_node(dev->iommu_fwspec->iommu_fwnode))
+   iort_iommu_msi_get_resv_regions(dev, list);
 
-   list_add_tail(>list, list);
-   }
 }
 EXPORT_SYMBOL(iommu_dma_get_resv_regions);
 
@@ -229,6 +205,23 @@ static int cookie_init_hw_msi_region(struct 
iommu_dma_cookie *cookie,
return 0;
 }
 
+static void iova_reserve_pci_windows(struct pci_dev *dev,
+   struct iova_domain *iovad)
+{
+   struct pci_host_bridge *bridge = pci_find_host_bridge(dev->bus);
+   struct resource_entry *window;
+   unsigned long lo, hi;
+
+   resource_list_for_each_entry(window, >windows) {
+   if (resource_type(window->res) != IORESOURCE_MEM)
+   continue;
+
+   lo = iova_pfn(iovad, window->res->start - window->offset);
+   hi = iova_pfn(iovad, window->res->end - window->offset);
+   reserve_iova(iovad, lo, hi);
+   }
+}
+
 static int iova_reserve_iommu_regions(struct device *dev,
struct iommu_domain *domain)
 {
@@ -238,6 +231,9 @@ static int iova_reserve_iommu_regions(struct device *dev,
LIST_HEAD(resv_regions);
int ret = 0;
 
+   if (dev_is_pci(dev))
+   iova_reserve_pci_windows(to_pci_dev(dev), iovad);
+
iommu_get_resv_regions(dev, _regions);
list_for_each_entry(region, _regions, list) {
unsigned long lo, hi;
-- 
2.7.4




[PATCH v5 3/7] vfio/type1: Update iova list on detach

2018-03-15 Thread Shameer Kolothum
Get a copy of the iova list on _group_detach and try to update it.
On success, replace the current list with the copy. Leave the list
as it is if the update fails.

Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 91 +
 1 file changed, 91 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index cfe2bb2..25e6920 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1667,12 +1667,88 @@ static void vfio_sanity_check_pfn_list(struct 
vfio_iommu *iommu)
WARN_ON(iommu->notifier.head);
 }
 
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+  struct list_head *iova_copy)
+{
+   struct vfio_domain *domain;
+   struct iommu_domain_geometry geo;
+   struct vfio_iova *node;
+   dma_addr_t start = 0;
+   dma_addr_t end = (dma_addr_t)~0;
+
+   list_for_each_entry(domain, >domain_list, next) {
+   iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+ );
+   if (geo.aperture_start > start)
+   start = geo.aperture_start;
+   if (geo.aperture_end < end)
+   end = geo.aperture_end;
+   }
+
+   /* Modify aperture limits. The new aper is either same or bigger */
+   node = list_first_entry(iova_copy, struct vfio_iova, list);
+   node->start = start;
+   node = list_last_entry(iova_copy, struct vfio_iova, list);
+   node->end = end;
+}
+
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+  struct list_head *iova_copy)
+{
+   struct vfio_domain *d;
+   struct vfio_group *g;
+   struct vfio_iova *node;
+   dma_addr_t start, end;
+   LIST_HEAD(resv_regions);
+   int ret;
+
+   list_for_each_entry(d, >domain_list, next) {
+   list_for_each_entry(g, >group_list, next)
+   iommu_get_group_resv_regions(g->iommu_group,
+_regions);
+   }
+
+   if (list_empty(_regions))
+   return 0;
+
+   node = list_first_entry(iova_copy, struct vfio_iova, list);
+   start = node->start;
+   node = list_last_entry(iova_copy, struct vfio_iova, list);
+   end = node->end;
+
+   /* purge the iova list and create new one */
+   vfio_iommu_iova_free(iova_copy);
+
+   ret = vfio_iommu_aper_resize(iova_copy, start, end);
+   if (ret)
+   goto done;
+
+   /* Exclude current reserved regions from iova ranges */
+   ret = vfio_iommu_resv_exclude(iova_copy, _regions);
+done:
+   vfio_iommu_resv_free(_regions);
+   return ret;
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
  struct iommu_group *iommu_group)
 {
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
+   bool iova_copy_fail;
+   LIST_HEAD(iova_copy);
 
mutex_lock(>lock);
 
@@ -1695,6 +1771,12 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
}
}
 
+   /*
+* Get a copy of the iova list. On success, use the copy to update
+* the list and then replace the current one.
+*/
+   iova_copy_fail = !!vfio_iommu_iova_get_copy(iommu, _copy);
+
list_for_each_entry(domain, >domain_list, next) {
group = find_iommu_group(domain, iommu_group);
if (!group)
@@ -1720,10 +1802,19 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
iommu_domain_free(domain->domain);
list_del(>next);
kfree(domain);
+   if (!iova_copy_fail && !list_empty(>domain_list))
+   vfio_iommu_aper_expand(iommu, _copy);
}
break;
}
 
+   if (!iova_copy_fail && !list_empty(>domain_list)) {
+   if (!vfio_iommu_resv_refresh(iommu, _copy))
+   vfio_iommu_iova_insert_copy(iommu, _copy);
+   else
+   vfio_iommu_iova_free(_copy);
+   }
+
 detach_group_done:
mutex_unlock(>lock);
 }
-- 
2.7.4




[PATCH v5 3/7] vfio/type1: Update iova list on detach

2018-03-15 Thread Shameer Kolothum
Get a copy of iova list on _group_detach and try to update the list.
On success replace the current one with the copy. Leave the list as
it is if update fails.

Signed-off-by: Shameer Kolothum 
---
 drivers/vfio/vfio_iommu_type1.c | 91 +
 1 file changed, 91 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index cfe2bb2..25e6920 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1667,12 +1667,88 @@ static void vfio_sanity_check_pfn_list(struct 
vfio_iommu *iommu)
WARN_ON(iommu->notifier.head);
 }
 
+/*
+ * Called when a domain is removed in detach. It is possible that
+ * the removed domain decided the iova aperture window. Modify the
+ * iova aperture with the smallest window among existing domains.
+ */
+static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
+  struct list_head *iova_copy)
+{
+   struct vfio_domain *domain;
+   struct iommu_domain_geometry geo;
+   struct vfio_iova *node;
+   dma_addr_t start = 0;
+   dma_addr_t end = (dma_addr_t)~0;
+
+   list_for_each_entry(domain, >domain_list, next) {
+   iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
+ );
+   if (geo.aperture_start > start)
+   start = geo.aperture_start;
+   if (geo.aperture_end < end)
+   end = geo.aperture_end;
+   }
+
+   /* Modify aperture limits. The new aper is either same or bigger */
+   node = list_first_entry(iova_copy, struct vfio_iova, list);
+   node->start = start;
+   node = list_last_entry(iova_copy, struct vfio_iova, list);
+   node->end = end;
+}
+
+/*
+ * Called when a group is detached. The reserved regions for that
+ * group can be part of valid iova now. But since reserved regions
+ * may be duplicated among groups, populate the iova valid regions
+ * list again.
+ */
+static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
+  struct list_head *iova_copy)
+{
+   struct vfio_domain *d;
+   struct vfio_group *g;
+   struct vfio_iova *node;
+   dma_addr_t start, end;
+   LIST_HEAD(resv_regions);
+   int ret;
+
+   list_for_each_entry(d, >domain_list, next) {
+   list_for_each_entry(g, >group_list, next)
+   iommu_get_group_resv_regions(g->iommu_group,
+_regions);
+   }
+
+   if (list_empty(_regions))
+   return 0;
+
+   node = list_first_entry(iova_copy, struct vfio_iova, list);
+   start = node->start;
+   node = list_last_entry(iova_copy, struct vfio_iova, list);
+   end = node->end;
+
+   /* purge the iova list and create new one */
+   vfio_iommu_iova_free(iova_copy);
+
+   ret = vfio_iommu_aper_resize(iova_copy, start, end);
+   if (ret)
+   goto done;
+
+   /* Exclude current reserved regions from iova ranges */
+   ret = vfio_iommu_resv_exclude(iova_copy, _regions);
+done:
+   vfio_iommu_resv_free(_regions);
+   return ret;
+}
+
 static void vfio_iommu_type1_detach_group(void *iommu_data,
  struct iommu_group *iommu_group)
 {
struct vfio_iommu *iommu = iommu_data;
struct vfio_domain *domain;
struct vfio_group *group;
+   bool iova_copy_fail;
+   LIST_HEAD(iova_copy);
 
mutex_lock(>lock);
 
@@ -1695,6 +1771,12 @@ static void vfio_iommu_type1_detach_group(void 
*iommu_data,
}
}
 
+   /*
+* Get a copy of the iova list. If successful, use the copy to
+* update the list and then replace the current one.
+*/
+   iova_copy_fail = !!vfio_iommu_iova_get_copy(iommu, &iova_copy);
+
	list_for_each_entry(domain, &iommu->domain_list, next) {
group = find_iommu_group(domain, iommu_group);
if (!group)
@@ -1720,10 +1802,19 @@ static void vfio_iommu_type1_detach_group(void *iommu_data,
iommu_domain_free(domain->domain);
list_del(&domain->next);
kfree(domain);
+   if (!iova_copy_fail && !list_empty(&iommu->domain_list))
+   vfio_iommu_aper_expand(iommu, &iova_copy);
}
break;
}
 
+   if (!iova_copy_fail && !list_empty(&iommu->domain_list)) {
+   if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
+   vfio_iommu_iova_insert_copy(iommu, &iova_copy);
+   else
+   vfio_iommu_iova_free(&iova_copy);
+   }
+
 detach_group_done:
mutex_unlock(&iommu->lock);
 }
-- 
2.7.4




[PATCH v5 0/7] vfio/type1: Add support for valid iova list management

2018-03-15 Thread Shameer Kolothum
This series introduces an iova list associated with a vfio iommu.
The list is kept up to date, taking into account the iommu apertures
and reserved regions. The series also adds checks for any conflict
with existing dma mappings whenever a new device group is attached to
the domain.

User-space can retrieve the valid iova ranges using VFIO_IOMMU_GET_INFO
ioctl capability chains. Any dma map request outside the valid iova
ranges will be rejected.
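
To make that last point concrete, here is a minimal user-space sketch
(illustrative only, not part of the series; 'container' is assumed to be
an already configured type1 container fd, and the iova/size values are
placeholders):

#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

/* Try to map 'size' bytes of anonymous memory at IO virtual address 'iova'. */
static int try_dma_map(int container, unsigned long long iova, size_t size)
{
	struct vfio_iommu_type1_dma_map map;
	void *buf;

	buf = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return -1;

	memset(&map, 0, sizeof(map));
	map.argsz = sizeof(map);
	map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
	map.vaddr = (unsigned long long)(unsigned long)buf;
	map.iova  = iova;	/* rejected if outside the valid iova ranges */
	map.size  = size;

	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}

With this series applied, the VFIO_IOMMU_MAP_DMA call above is expected
to fail when [iova, iova + size) overlaps a reserved region or falls
outside the aperture.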


v4 --> v5
Rebased to next-20180315.
 
 -Incorporated the corner-case bug fix suggested by Alex into patch #5.
 -Based on suggestions by Alex and Robin, added patch #7. This
  moves the PCI window reservation back into the DMA-specific path,
  fixing the issue reported by Eric [1].

Note:
The patch #7 has dependency with [2][3]

1. https://patchwork.kernel.org/patch/10232043/
2. https://patchwork.kernel.org/patch/10216553/
3. https://patchwork.kernel.org/patch/10216555/

v3 --> v4
 Addressed comments received for v3:
 -Use dma_addr_t instead of phys_addr_t.
 -Use LIST_HEAD().
 -Free up the iova_copy list in case of error.
 -Updated the logic for filling in the iova caps info (patch #5).

RFCv2 --> v3
 Removed RFC tag.
 Addressed comments from Alex and Eric:
 - Added comments to make the iova list management logic clearer.
 - Work on a copy of the iova list so that the original is not
   altered in case of failure.

RFCv1 --> RFCv2
 Addressed comments from Alex:
-Introduced IOVA list management and added checks for conflicts with 
 existing dma map entries during attach/detach.

Shameer Kolothum (2):
  vfio/type1: Add IOVA range capability support
  iommu/dma: Move PCI window region reservation back into dma specific
path.

Shameerali Kolothum Thodi (5):
  vfio/type1: Introduce iova list and add iommu aperture validity check
  vfio/type1: Check reserve region conflict and update iova list
  vfio/type1: Update iova list on detach
  vfio/type1: check dma map request is within a valid iova range
  vfio/type1: remove duplicate retrieval of reserved regions

 drivers/iommu/dma-iommu.c   |  54 ++---
 drivers/vfio/vfio_iommu_type1.c | 497 +++-
 include/uapi/linux/vfio.h   |  23 ++
 3 files changed, 533 insertions(+), 41 deletions(-)

-- 
2.7.4




[PATCH v5 5/7] vfio/type1: Add IOVA range capability support

2018-03-15 Thread Shameer Kolothum
This allows user-space to retrieve the supported IOVA
range(s), excluding any reserved regions. The implementation
is based on capability chains added to the VFIO_IOMMU_GET_INFO ioctl.
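
For context, here is a user-space sketch of how such a chain could be
walked to read the IOVA ranges. It is illustrative only and makes a few
assumptions: 'container' is an already configured type1 container fd,
and the capability header is the standard vfio_info_cap_header whose
'next' field is an offset from the start of the info buffer:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Print the valid IOVA ranges reported by VFIO_IOMMU_GET_INFO. */
static void dump_iova_ranges(int container)
{
	struct vfio_iommu_type1_info *info;
	struct vfio_info_cap_header *hdr;
	__u32 argsz = sizeof(*info);

	info = calloc(1, argsz);
	if (!info)
		return;
	info->argsz = argsz;

	/* First call: learn how big the buffer needs to be. */
	if (ioctl(container, VFIO_IOMMU_GET_INFO, info) ||
	    !(info->flags & VFIO_IOMMU_INFO_CAPS) || info->argsz <= argsz)
		goto out;

	/* Second call: fetch the capability chain as well. */
	argsz = info->argsz;
	free(info);
	info = calloc(1, argsz);
	if (!info)
		return;
	info->argsz = argsz;
	if (ioctl(container, VFIO_IOMMU_GET_INFO, info) || !info->cap_offset)
		goto out;

	/* Walk the chain; 'next' is an offset from the start of 'info'. */
	hdr = (struct vfio_info_cap_header *)((char *)info + info->cap_offset);
	for (;;) {
		if (hdr->id == VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE) {
			struct vfio_iommu_type1_info_cap_iova_range *cap =
				(struct vfio_iommu_type1_info_cap_iova_range *)hdr;
			__u32 i;

			for (i = 0; i < cap->nr_iovas; i++)
				printf("iova range: 0x%llx - 0x%llx\n",
				       (unsigned long long)cap->iova_ranges[i].start,
				       (unsigned long long)cap->iova_ranges[i].end);
		}
		if (!hdr->next)
			break;
		hdr = (struct vfio_info_cap_header *)((char *)info + hdr->next);
	}
out:
	free(info);
}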

Signed-off-by: Shameer Kolothum <shameerali.kolothum.th...@huawei.com>
---
 drivers/vfio/vfio_iommu_type1.c | 96 +
 include/uapi/linux/vfio.h   | 23 ++
 2 files changed, 119 insertions(+)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d59db31..90f195d 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -1929,6 +1929,68 @@ static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
return ret;
 }
 
+static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
+struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
+size_t size)
+{
+   struct vfio_info_cap_header *header;
+   struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
+
+   header = vfio_info_cap_add(caps, size,
+   VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
+   if (IS_ERR(header))
+   return PTR_ERR(header);
+
+   iova_cap = container_of(header,
+   struct vfio_iommu_type1_info_cap_iova_range, header);
+   iova_cap->nr_iovas = cap_iovas->nr_iovas;
+   memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
+   cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
+   return 0;
+}
+
+static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
+   struct vfio_info_cap *caps)
+{
+   struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
+   struct vfio_iova *iova;
+   size_t size;
+   int iovas = 0, i = 0, ret;
+
+   mutex_lock(&iommu->lock);
+
+   list_for_each_entry(iova, &iommu->iova_list, list)
+   iovas++;
+
+   if (!iovas) {
+   ret = -EINVAL;
+   goto out_unlock;
+   }
+
+   size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
+
+   cap_iovas = kzalloc(size, GFP_KERNEL);
+   if (!cap_iovas) {
+   ret = -ENOMEM;
+   goto out_unlock;
+   }
+
+   cap_iovas->nr_iovas = iovas;
+
+   list_for_each_entry(iova, &iommu->iova_list, list) {
+   cap_iovas->iova_ranges[i].start = iova->start;
+   cap_iovas->iova_ranges[i].end = iova->end;
+   i++;
+   }
+
+   ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
+
+   kfree(cap_iovas);
+out_unlock:
+   mutex_unlock(&iommu->lock);
+   return ret;
+}
+
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -1950,19 +2012,53 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
}
} else if (cmd == VFIO_IOMMU_GET_INFO) {
struct vfio_iommu_type1_info info;
+   struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
+   unsigned long capsz;
+   int ret;
 
minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
 
+   /* For backward compatibility, cannot require this */
+   capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
+
if (copy_from_user(&info, (void __user *)arg, minsz))
return -EFAULT;
 
if (info.argsz < minsz)
return -EINVAL;
 
+   if (info.argsz >= capsz) {
+   minsz = capsz;
+   info.cap_offset = 0; /* output, no-recopy necessary */
+   }
+
info.flags = VFIO_IOMMU_INFO_PGSIZES;
 
info.iova_pgsizes = vfio_pgsize_bitmap(iommu);
 
+   ret = vfio_iommu_iova_build_caps(iommu, &caps);
+   if (ret)
+   return ret;
+
+   if (caps.size) {
+   info.flags |= VFIO_IOMMU_INFO_CAPS;
+
+   if (info.argsz < sizeof(info) + caps.size) {
+   info.argsz = sizeof(info) + caps.size;
+   } else {
+   vfio_info_cap_shift(&caps, sizeof(info));
+   if (copy_to_user((void __user *)arg +
+   sizeof(info), caps.buf,
+   caps.size)) {
+   kfree(caps.buf);
+   return -EFAULT;
+   }
+   info.cap_offset = sizeof(info);
+   }
+
+   kfree(caps.buf);
+   }
+
return copy_to_user((void __user *)arg, &info, minsz) ?
-EFAULT : 0;
 
diff --git a/include/uapi/
