Re: [PATCH v3] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Marc Zyngier
On 24/03/18 00:42, Peng Hao wrote:
> Add LPI debug info to vgic-state.
> The printed info looks like this:
> SPI  287  0 0100   0 160  -1
> LPI 8192  2 00010000   0 160  -1
> 
> Signed-off-by: Peng Hao 
> ---
>  virt/kvm/arm/vgic/vgic-debug.c | 59 ++
>  virt/kvm/arm/vgic/vgic-its.c   | 16 ++--
>  virt/kvm/arm/vgic/vgic.h   |  1 +
>  3 files changed, 63 insertions(+), 13 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
> index 10b3817..ddac6bd 100644
> --- a/virt/kvm/arm/vgic/vgic-debug.c
> +++ b/virt/kvm/arm/vgic/vgic-debug.c
> @@ -36,9 +36,12 @@
>  struct vgic_state_iter {
>   int nr_cpus;
>   int nr_spis;
> + int nr_lpis;
>   int dist_id;
>   int vcpu_id;
>   int intid;
> + int lpi_print_count;
> + struct vgic_irq **lpi_irqs;
>  };
>  
>  static void iter_next(struct vgic_state_iter *iter)
> @@ -52,6 +55,39 @@ static void iter_next(struct vgic_state_iter *iter)
>   if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
>   ++iter->vcpu_id < iter->nr_cpus)
>   iter->intid = 0;
> +
> + if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
> + if (iter->lpi_print_count < iter->nr_lpis)
> + 			iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
> + iter->lpi_print_count++;
> + }
> +}
> +
> +static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
> +{
> + u32 *intids;
> + int i, irq_count;
> + struct vgic_irq *irq = NULL, **lpi_irqs;
> +
> + iter->nr_lpis = 0;
> + irq_count = vgic_copy_lpi_list(kvm, NULL, &intids);
> + if (irq_count < 0)
> + return;
> +
> + lpi_irqs = kmalloc_array(irq_count, sizeof(irq), GFP_KERNEL);
> + if (!lpi_irqs) {
> + kfree(intids);
> + return;
> + }
> +
> + for (i = 0; i < irq_count; i++) {
> + irq = vgic_get_irq(kvm, NULL, intids[i]);
> + if (!irq)
> + continue;
> + lpi_irqs[iter->nr_lpis++] = irq;
> + }
> + iter->lpi_irqs = lpi_irqs;
> + kfree(intids);

You are still completely missing the point. Why are you allocating this
array of pointers while you have a perfectly sensible array of intids,
allowing you to treat all the irqs uniformly?
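
For illustration, a minimal sketch of that uniform treatment (hypothetical
and untested; the lpi_intids/lpi_idx fields are invented here): keep only
the intid array returned by vgic_copy_lpi_list() in the iterator and step
through it in iter_next():

	/* in struct vgic_state_iter: u32 *lpi_intids; int lpi_idx; */
	static void iter_next_lpis(struct vgic_state_iter *iter)
	{
		/* past the SPI range: walk the LPI intid list uniformly */
		if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis &&
		    iter->lpi_idx < iter->nr_lpis)
			iter->intid = iter->lpi_intids[iter->lpi_idx++];
	}

The print path can then resolve each intid with vgic_get_irq()/vgic_put_irq(),
exactly as it already does for SGIs, PPIs and SPIs.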

>  }
>  
>  static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
> @@ -64,6 +100,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
>   iter->nr_cpus = nr_cpus;
>   iter->nr_spis = kvm->arch.vgic.nr_spis;
>  
> + if (vgic_supports_direct_msis(kvm) && !pos)
> + vgic_debug_get_lpis(kvm, iter);

Again: What is the point of this?

>   /* Fast forward to the right position if needed */
>   while (pos--)
>   iter_next(iter);
> @@ -73,7 +111,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
>  {
>   return iter->dist_id > 0 &&
>   iter->vcpu_id == iter->nr_cpus &&
> - (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
> + (iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
> + ((iter->nr_lpis == 0) ||
> + (iter->lpi_print_count == iter->nr_lpis + 1));
>  }
>  
>  static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
> @@ -130,6 +170,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
>  
> 	mutex_lock(&kvm->lock);
>   iter = kvm->arch.vgic.iter;
> + kfree(iter->lpi_irqs);
>   kfree(iter);
>   kvm->arch.vgic.iter = NULL;
> 	mutex_unlock(&kvm->lock);
> @@ -154,7 +195,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>struct kvm_vcpu *vcpu)
>  {
>   int id = 0;
> - char *hdr = "SPI ";
> + char *hdr = "Global";
>  
>   if (vcpu) {
>   hdr = "VCPU";
> @@ -162,7 +203,10 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>   }
>  
>   seq_printf(s, "\n");
> - seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
> + if (vcpu)
> + 	seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
> + else
> + 	seq_printf(s, "%s TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr);
> 	seq_printf(s, "---\n");
>  }
>  
> @@ -174,8 +218,10 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
>   type = "SGI";
>   else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
>   type = "PPI";
> - else
> + else if (irq->intid < VGIC_MAX_SPI)
>   type = "SPI";
> + else if (irq->intid >= VGIC_MIN_LPI)
> + type = "LPI";
>  
>   if (irq->intid ==0 || irq->intid == 

[RFC PATCH v2 15/15] khwasan: update kasan documentation

2018-03-23 Thread Andrey Konovalov
This patch updates KASAN documentation to reflect the addition of KHWASAN.

Signed-off-by: Andrey Konovalov 
---
 Documentation/dev-tools/kasan.rst | 212 +-
 1 file changed, 122 insertions(+), 90 deletions(-)

diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst
index f7a18f274357..a817f4c4285c 100644
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -8,11 +8,18 @@ KernelAddressSANitizer (KASAN) is a dynamic memory error detector. It provides
 a fast and comprehensive solution for finding use-after-free and out-of-bounds
 bugs.
 
-KASAN uses compile-time instrumentation for checking every memory access,
-therefore you will need a GCC version 4.9.2 or later. GCC 5.0 or later is
-required for detection of out-of-bounds accesses to stack or global variables.
+KASAN has two modes: classic KASAN (a classic version, similar to user space
+ASan) and KHWASAN (a version based on memory tagging, similar to user space
+HWASan).
 
-Currently KASAN is supported only for the x86_64 and arm64 architectures.
+KASAN uses compile-time instrumentation to insert validity checks before every
+memory access, and therefore requires a compiler version that supports that.
+For classic KASAN you need GCC version 4.9.2 or later. GCC 5.0 or later is
+required for detection of out-of-bounds accesses on stack and global variables.
+TODO: compiler requirements for KHWASAN
+
+Currently classic KASAN is supported for the x86_64, arm64 and xtensa
+architectures, and KHWASAN is supported only for arm64.
 
 Usage
 -
@@ -21,12 +28,14 @@ To enable KASAN configure kernel with::
 
  CONFIG_KASAN = y
 
-and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
-inline are compiler instrumentation types. The former produces smaller binary
-the latter is 1.1 - 2 times faster. Inline instrumentation requires a GCC
+and choose between CONFIG_KASAN_CLASSIC (to enable classic KASAN) and
+CONFIG_KASAN_TAGS (to enable KHWASAN). You also need to choose between
+CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and inline are compiler
+instrumentation types. The former produces a smaller binary, the latter is
+1.1 - 2 times faster. For classic KASAN inline instrumentation requires GCC
 version 5.0 or later.
 
-KASAN works with both SLUB and SLAB memory allocators.
+Both KASAN modes work with both SLUB and SLAB memory allocators.
 For better bug detection and nicer reporting, enable CONFIG_STACKTRACE.
 
 To disable instrumentation for specific files or directories, add a line
@@ -43,85 +52,80 @@ similar to the following to the respective kernel Makefile:
 Error reports
~~~~~~~~~~~~~
 
-A typical out of bounds access report looks like this::
+A typical out-of-bounds access classic KASAN report looks like this::
 
 ==================================================================
-BUG: AddressSanitizer: out of bounds access in kmalloc_oob_right+0x65/0x75 [test_kasan] at addr ffff8800693bc5d3
-Write of size 1 by task modprobe/1689
-=============================================================================
-BUG kmalloc-128 (Not tainted): kasan error
------------------------------------------------------------------------------
-
-Disabling lock debugging due to kernel taint
-INFO: Allocated in kmalloc_oob_right+0x3d/0x75 [test_kasan] age=0 cpu=0 pid=1689
- __slab_alloc+0x4b4/0x4f0
- kmem_cache_alloc_trace+0x10b/0x190
- kmalloc_oob_right+0x3d/0x75 [test_kasan]
- init_module+0x9/0x47 [test_kasan]
- do_one_initcall+0x99/0x200
- load_module+0x2cb3/0x3b20
- SyS_finit_module+0x76/0x80
- system_call_fastpath+0x12/0x17
-INFO: Slab 0xffffea0001a4ef00 objects=17 used=7 fp=0xffff8800693bd728 flags=0x100000000004080
-INFO: Object 0xffff8800693bc558 @offset=1368 fp=0xffff8800693bc720
-
-Bytes b4 ffff8800693bc548: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a  ........ZZZZZZZZ
-Object ffff8800693bc558: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc568: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc578: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc588: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc598: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc5a8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc5b8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
-Object ffff8800693bc5c8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
-Redzone ffff8800693bc5d8: cc cc cc cc cc cc cc cc                          ........
-Padding ffff8800693bc718: 5a 5a 5a 5a 5a 5a 5a 5a                          ZZZZZZZZ
-CPU: 0 PID:

[RFC PATCH v2 14/15] khwasan, arm64: add brk handler for inline instrumentation

2018-03-23 Thread Andrey Konovalov
KHWASAN inline instrumentation mode (which embeds checks of shadow memory
into the generated code, instead of inserting a callback) generates a brk
instruction when a tag mismatch is detected.

This commit adds a KHWASAN brk handler that decodes the immediate value
passed to the brk instruction (to extract information about the memory
access that triggered the mismatch), reads the register values (x0 contains
the guilty address) and reports the bug.
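
For example, with the encoding used in this patch (KHWASAN_BRK_IMM == 0x900
plus access info in the low bits), a hypothetical brk #0x912 arrives as
esr == 0xf2000912 and decodes as:

	bool recover = esr & KHWASAN_ESR_RECOVER;	/* 0x12 & 0x20 == 0: no */
	bool write = esr & KHWASAN_ESR_WRITE;		/* 0x12 & 0x10 != 0: yes */
	size_t size = KHWASAN_ESR_SIZE(esr);		/* 1 << (0x12 & 0xf) == 4 */

i.e. a 4-byte write with recovery disabled.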

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/include/asm/brk-imm.h |  2 ++
 arch/arm64/kernel/traps.c| 61 
 2 files changed, 63 insertions(+)

diff --git a/arch/arm64/include/asm/brk-imm.h b/arch/arm64/include/asm/brk-imm.h
index ed693c5bcec0..e4a7013321dc 100644
--- a/arch/arm64/include/asm/brk-imm.h
+++ b/arch/arm64/include/asm/brk-imm.h
@@ -16,10 +16,12 @@
  * 0x400: for dynamic BRK instruction
  * 0x401: for compile time BRK instruction
  * 0x800: kernel-mode BUG() and WARN() traps
+ * 0x9xx: KHWASAN trap (allowed values 0x900 - 0x9ff)
  */
 #define FAULT_BRK_IMM  0x100
 #define KGDB_DYN_DBG_BRK_IMM   0x400
 #define KGDB_COMPILED_DBG_BRK_IMM  0x401
#define BUG_BRK_IMM		0x800
+#define KHWASAN_BRK_IMM		0x900
 
 #endif
diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c
index eb2d15147e8d..9d9ca4eb6d2f 100644
--- a/arch/arm64/kernel/traps.c
+++ b/arch/arm64/kernel/traps.c
@@ -35,6 +35,7 @@
 #include 
 #include 
 #include 
+#include <linux/kasan.h>
 
 #include 
 #include 
@@ -771,6 +772,59 @@ static struct break_hook bug_break_hook = {
.fn = bug_handler,
 };
 
+#ifdef CONFIG_KASAN_TAGS
+
+#define KHWASAN_ESR_RECOVER0x20
+#define KHWASAN_ESR_WRITE  0x10
+#define KHWASAN_ESR_SIZE_MASK  0x0f
+#define KHWASAN_ESR_SIZE(esr)  (1 << ((esr) & KHWASAN_ESR_SIZE_MASK))
+
+static int khwasan_handler(struct pt_regs *regs, unsigned int esr)
+{
+   bool recover = esr & KHWASAN_ESR_RECOVER;
+   bool write = esr & KHWASAN_ESR_WRITE;
+   size_t size = KHWASAN_ESR_SIZE(esr);
+   u64 addr = regs->regs[0];
+   u64 pc = regs->pc;
+
+   if (user_mode(regs))
+   return DBG_HOOK_ERROR;
+
+   khwasan_report(addr, size, write, pc);
+
+	/*
+	 * The instrumentation allows us to control whether we can proceed after
+	 * a crash was detected. This is done by passing the -recover flag to
+	 * the compiler. Disabling recovery allows us to generate more compact
+	 * code.
+	 *
+	 * Unfortunately disabling recovery doesn't work for the kernel right
+	 * now. KHWASAN reporting is disabled in some contexts (for example when
+	 * the allocator accesses slab object metadata; the same is true for
+	 * KASAN; this is controlled by current->kasan_depth). All these
+	 * accesses are detected by the tool, even though the reports for them
+	 * are not printed.
+	 *
+	 * This is something that might be fixed at some point in the future.
+	 */
+   if (!recover)
+   die("Oops - KHWASAN", regs, 0);
+
+   /* If thread survives, skip over the BUG instruction and continue: */
+   arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE);
+   return DBG_HOOK_HANDLED;
+}
+
+#define KHWASAN_ESR_VAL (0xf2000000 | KHWASAN_BRK_IMM)
+#define KHWASAN_ESR_MASK 0xffffff00
+
+static struct break_hook khwasan_break_hook = {
+   .esr_val = KHWASAN_ESR_VAL,
+   .esr_mask = KHWASAN_ESR_MASK,
+   .fn = khwasan_handler,
+};
+#endif
+
 /*
  * Initial handler for AArch64 BRK exceptions
  * This handler only used until debug_traps_init().
@@ -778,6 +832,10 @@ static struct break_hook bug_break_hook = {
 int __init early_brk64(unsigned long addr, unsigned int esr,
struct pt_regs *regs)
 {
+#ifdef CONFIG_KASAN_TAGS
+   if ((esr & KHWASAN_ESR_MASK) == KHWASAN_ESR_VAL)
+   return khwasan_handler(regs, esr) != DBG_HOOK_HANDLED;
+#endif
return bug_handler(regs, esr) != DBG_HOOK_HANDLED;
 }
 
@@ -785,4 +843,7 @@ int __init early_brk64(unsigned long addr, unsigned int esr,
 void __init trap_init(void)
 {
	register_break_hook(&bug_break_hook);
+#ifdef CONFIG_KASAN_TAGS
+	register_break_hook(&khwasan_break_hook);
+#endif
 }
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 13/15] khwasan: add hooks implementation

2018-03-23 Thread Andrey Konovalov
This commit adds KHWASAN hooks implementation.

1. When a new slab cache is created, KHWASAN rounds up the size of the
   objects in this cache to KASAN_SHADOW_SCALE_SIZE (== 16).

2. On each kmalloc KHWASAN generates a random tag, sets the shadow memory
   that corresponds to this object to this tag, and embeds this tag value
   into the top byte of the returned pointer.

3. On each kfree KHWASAN poisons the shadow memory with a random tag to
   allow detection of use-after-free bugs.

The rest of the logic of the hook implementation is very much similar to
the one provided by KASAN. KHWASAN saves allocation and free stack metadata
to the slab object the same way KASAN does.
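
Put together, the allocation/free flow described above amounts to roughly
the following (an illustrative sketch only, simplified from the hooks in
this patch; rounded_size stands for the object size rounded up to 16):

	/* allocation: pick a random tag, mirror it in the shadow and pointer */
	u8 tag = khwasan_random_tag();
	kasan_poison_shadow(object, rounded_size, tag);	/* memory side */
	object = set_tag(object, tag);			/* pointer side */

	/* free: retag the shadow so stale tagged pointers mismatch */
	kasan_poison_shadow(object, rounded_size, KHWASAN_TAG_INVALID);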

Signed-off-by: Andrey Konovalov 
---
 mm/kasan/khwasan.c | 200 -
 1 file changed, 197 insertions(+), 3 deletions(-)

diff --git a/mm/kasan/khwasan.c b/mm/kasan/khwasan.c
index da4b17997c71..e8bed5a078c7 100644
--- a/mm/kasan/khwasan.c
+++ b/mm/kasan/khwasan.c
@@ -90,69 +90,260 @@ void *khwasan_reset_tag(const void *addr)
return reset_tag(addr);
 }
 
+void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+   void *shadow_start, *shadow_end;
+
+   /* Perform shadow offset calculation based on untagged address */
+   address = reset_tag(address);
+
+   shadow_start = kasan_mem_to_shadow(address);
+   shadow_end = kasan_mem_to_shadow(address + size);
+
+   memset(shadow_start, value, shadow_end - shadow_start);
+}
+
 void kasan_unpoison_shadow(const void *address, size_t size)
 {
+   /* KHWASAN only allows 16-byte granularity */
+   size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+   kasan_poison_shadow(address, size, get_tag(address));
 }
 
 void check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
 {
+   u8 tag;
+   u8 *shadow_first, *shadow_last, *shadow;
+   void *untagged_addr;
+
+   tag = get_tag((const void *)addr);
+
+	/* Ignore accesses for pointers tagged with 0xff (native kernel
+	 * pointer tag) to suppress false positives caused by kmap.
+	 *
+	 * Some kernel code was written to account for archs that don't keep
+	 * high memory mapped all the time, but rather map and unmap particular
+	 * pages when needed. Instead of storing a pointer to the kernel memory,
+	 * this code saves the address of the page structure and offset within
+	 * that page for later use. Those pages are then mapped and unmapped
+	 * with kmap/kunmap when necessary and virt_to_page is used to get the
+	 * virtual address of the page. For arm64 (that keeps the high memory
+	 * mapped all the time), kmap is turned into a page_address call.
+	 *
+	 * The issue is that with use of the page_address + virt_to_page
+	 * sequence the top byte value of the original pointer gets lost (gets
+	 * set to 0xff).
+	 */
+   if (tag == 0xff)
+   return;
+
+   untagged_addr = reset_tag((const void *)addr);
+   shadow_first = kasan_mem_to_shadow(untagged_addr);
+   shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
+
+   for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
+   if (*shadow != tag) {
+   khwasan_report(addr, size, write, ret_ip);
+   return;
+   }
+   }
 }
 
 void kasan_free_pages(struct page *page, unsigned int order)
 {
+   if (likely(!PageHighMem(page)))
+   kasan_poison_shadow(page_address(page),
+   PAGE_SIZE << order,
+   KHWASAN_TAG_INVALID);
 }
 
 void kasan_cache_create(struct kmem_cache *cache, size_t *size,
slab_flags_t *flags)
 {
+   int orig_size = *size;
+
+   cache->kasan_info.alloc_meta_offset = *size;
+   *size += sizeof(struct kasan_alloc_meta);
+
+   if (*size % KASAN_SHADOW_SCALE_SIZE != 0)
+   *size = round_up(*size, KASAN_SHADOW_SCALE_SIZE);
+
+   if (*size > KMALLOC_MAX_SIZE) {
+   *size = orig_size;
+   return;
+   }
+
+   cache->align = round_up(cache->align, KASAN_SHADOW_SCALE_SIZE);
+
+   *flags |= SLAB_KASAN;
 }
 
 void kasan_poison_slab(struct page *page)
 {
+   kasan_poison_shadow(page_address(page),
+   PAGE_SIZE << compound_order(page),
+   KHWASAN_TAG_INVALID);
 }
 
 void kasan_poison_object_data(struct kmem_cache *cache, void *object)
 {
+   kasan_poison_shadow(object,
+   round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+   KHWASAN_TAG_INVALID);
 }
 
 void *kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
 {
+   if (!READ_ONCE(khwasan_enabled))
+   return object;
+   object = kasan_kmalloc(cache, object, 

[RFC PATCH v2 11/15] khwasan, mm: perform untagged pointers comparison in krealloc

2018-03-23 Thread Andrey Konovalov
The krealloc function checks whether the same buffer was reused or a new one
was allocated by comparing kernel pointers. KHWASAN changes the memory tag on the
krealloc'ed chunk of memory and therefore also changes the pointer tag of
the returned pointer. Therefore we need to perform comparison on untagged
(with tags reset) pointers to check whether it's the same memory region or
not.
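
For example (the tag values here are hypothetical), when krealloc() reuses
the buffer, KHWASAN retags it and the returned pointer differs only in its
top byte:

	void *p = kmalloc(64, GFP_KERNEL);	 /* e.g. tag 0x3a in bits 63:56 */
	void *ret = krealloc(p, 48, GFP_KERNEL); /* same memory, e.g. tag 0x7c */

Without resetting the tags, p != ret would make the same buffer look like a
new one, and kfree(p) would free memory that is still in use.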

Signed-off-by: Andrey Konovalov 
---
 mm/slab_common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/slab_common.c b/mm/slab_common.c
index a33e61315ca6..5911f2194cf7 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1494,7 +1494,7 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
}
 
ret = __do_krealloc(p, new_size, flags);
-   if (ret && p != ret)
+   if (ret && khwasan_reset_tag(p) != khwasan_reset_tag(ret))
kfree(p);
 
return ret;
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 12/15] khwasan: add bug reporting routines

2018-03-23 Thread Andrey Konovalov
This commit adds routines that print KHWASAN error reports. Those are
quite similar to KASAN's, with the following differences:

1. The way KHWASAN finds the first bad shadow cell (with a mismatching
   tag). KHWASAN compares memory tags from the shadow memory to the pointer
   tag.

2. KHWASAN reports all bugs with the "KASAN: invalid-access" header. This
   is done so that various external tools that already parse the kernel logs
   looking for KASAN reports don't need to be changed.

Signed-off-by: Andrey Konovalov 
---
 include/linux/kasan.h |  3 ++
 mm/kasan/report.c | 88 ++-
 2 files changed, 82 insertions(+), 9 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index a2869464a8be..54e7c437dc8f 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -161,6 +161,9 @@ void *khwasan_set_tag(const void *addr, u8 tag);
 u8 khwasan_get_tag(const void *addr);
 void *khwasan_reset_tag(const void *ptr);
 
+void khwasan_report(unsigned long addr, size_t size, bool write,
+   unsigned long ip);
+
 #else /* CONFIG_KASAN_TAGS */
 
 static inline void khwasan_init(void) { }
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5c169aa688fd..ed17168a083e 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -51,10 +51,9 @@ static const void *find_first_bad_addr(const void *addr, size_t size)
return first_bad_addr;
 }
 
-static bool addr_has_shadow(struct kasan_access_info *info)
+static bool addr_has_shadow(const void *addr)
 {
-   return (info->access_addr >=
-   kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+   return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
 }
 
 static const char *get_shadow_bug_type(struct kasan_access_info *info)
@@ -127,15 +126,14 @@ static const char *get_wild_bug_type(struct kasan_access_info *info)
 
 static const char *get_bug_type(struct kasan_access_info *info)
 {
-   if (addr_has_shadow(info))
+   if (addr_has_shadow(info->access_addr))
return get_shadow_bug_type(info);
return get_wild_bug_type(info);
 }
 
-static void print_error_description(struct kasan_access_info *info)
+static void print_error_description(struct kasan_access_info *info,
+   const char *bug_type)
 {
-   const char *bug_type = get_bug_type(info);
-
pr_err("BUG: KASAN: %s in %pS\n",
bug_type, (void *)info->ip);
pr_err("%s of size %zu at addr %px by task %s/%d\n",
@@ -345,10 +343,10 @@ static void kasan_report_error(struct kasan_access_info *info)
 
	kasan_start_report(&flags);
 
-   print_error_description(info);
+   print_error_description(info, get_bug_type(info));
pr_err("\n");
 
-   if (!addr_has_shadow(info)) {
+   if (!addr_has_shadow(info->access_addr)) {
dump_stack();
} else {
print_address_description((void *)info->access_addr);
@@ -412,6 +410,78 @@ void kasan_report(unsigned long addr, size_t size,
	kasan_report_error(&info);
 }
 
+static inline void khwasan_print_tags(const void *addr)
+{
+   u8 addr_tag = get_tag(addr);
+   void *untagged_addr = reset_tag(addr);
+   u8 *shadow = (u8 *)kasan_mem_to_shadow(untagged_addr);
+
+   pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
+}
+
+static const void *khwasan_find_first_bad_addr(const void *addr, size_t size)
+{
+   u8 tag = get_tag((void *)addr);
+   void *untagged_addr = reset_tag((void *)addr);
+   u8 *shadow = (u8 *)kasan_mem_to_shadow(untagged_addr);
+   const void *first_bad_addr = untagged_addr;
+
+   while (*shadow == tag && first_bad_addr < untagged_addr + size) {
+   first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
+   shadow = (u8 *)kasan_mem_to_shadow(first_bad_addr);
+   }
+   return first_bad_addr;
+}
+
+void khwasan_report(unsigned long addr, size_t size, bool write,
+   unsigned long ip)
+{
+   struct kasan_access_info info;
+   unsigned long flags;
+   void *untagged_addr = reset_tag((void *)addr);
+
+   if (likely(!kasan_report_enabled()))
+   return;
+
+   disable_trace_on_warning();
+
+   info.access_addr = (void *)addr;
+   info.first_bad_addr = khwasan_find_first_bad_addr((void *)addr, size);
+   info.access_size = size;
+   info.is_write = write;
+   info.ip = ip;
+
+	kasan_start_report(&flags);
+
+	print_error_description(&info, "invalid-access");
+   khwasan_print_tags((void *)addr);
+   pr_err("\n");
+
+   if (!addr_has_shadow(untagged_addr)) {
+   dump_stack();
+   } else {
+   print_address_description(untagged_addr);
+   pr_err("\n");
+   print_shadow_for_address(info.first_bad_addr);
+   }
+
+	kasan_end_report(&flags);
+}
+
+void khwasan_report_invalid_free(void *object, 

[RFC PATCH v2 10/15] khwasan, arm64: enable top byte ignore for the kernel

2018-03-23 Thread Andrey Konovalov
KHWASAN uses the Top Byte Ignore feature of arm64 CPUs to store a pointer
tag in the top byte of each pointer. This commit enables the TCR_TBI1 bit,
which enables Top Byte Ignore for the kernel, when KHWASAN is used.

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/include/asm/pgtable-hwdef.h | 1 +
 arch/arm64/mm/proc.S   | 9 -
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index cdfe3e657a9e..ae6b6405eacc 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -289,6 +289,7 @@
 #define TCR_A1 (UL(1) << 22)
 #define TCR_ASID16 (UL(1) << 36)
 #define TCR_TBI0   (UL(1) << 37)
+#define TCR_TBI1   (UL(1) << 38)
 #define TCR_HA (UL(1) << 39)
 #define TCR_HD (UL(1) << 40)
 
diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S
index c0af47617299..d64ce2ea40ec 100644
--- a/arch/arm64/mm/proc.S
+++ b/arch/arm64/mm/proc.S
@@ -41,6 +41,12 @@
 /* PTWs cacheable, inner/outer WBWA */
#define TCR_CACHE_FLAGS	TCR_IRGN_WBWA | TCR_ORGN_WBWA
 
+#ifdef CONFIG_KASAN_TAGS
+#define KASAN_TCR_FLAGS TCR_TBI1
+#else
+#define KASAN_TCR_FLAGS 0
+#endif
+
 #define MAIR(attr, mt) ((attr) << ((mt) * 8))
 
 /*
@@ -432,7 +438,8 @@ ENTRY(__cpu_setup)
 * both user and kernel.
 */
ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \
-   TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1
+   TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1 | \
+   KASAN_TCR_FLAGS
tcr_set_idmap_t0sz  x10, x9
 
/*
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 09/15] khwasan, kvm: untag pointers in kern_hyp_va

2018-03-23 Thread Andrey Konovalov
kern_hyp_va, which converts a kernel VA into a HYP VA, relies on the top byte
of kernel pointers being 0xff. Untag pointers passed to it when KHWASAN is
enabled.

Also fix create_hyp_mappings() and create_hyp_io_mappings(), to use the
untagged kernel pointers for address computations.

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/include/asm/kvm_mmu.h |  8 
 virt/kvm/arm/mmu.c   | 20 +++-
 2 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index 7faed6e48b46..5149ff83b4c4 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -97,6 +97,9 @@
  * Should be completely invisible on any viable CPU.
  */
 .macro kern_hyp_va reg
+#ifdef CONFIG_KASAN_TAGS
+   orr \reg, \reg, #KASAN_PTR_TAG_MASK
+#endif
 alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
 alternative_else_nop_endif
@@ -115,6 +118,11 @@ alternative_else_nop_endif
 
 static inline unsigned long __kern_hyp_va(unsigned long v)
 {
+#ifdef CONFIG_KASAN_TAGS
+   asm volatile("orr %0, %0, %1"
+: "+r" (v)
+: "i" (KASAN_PTR_TAG_MASK));
+#endif
asm volatile(ALTERNATIVE("and %0, %0, %1",
 "nop",
 ARM64_HAS_VIRT_HOST_EXTN)
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index b960acdd0c05..3dba9b60e0a0 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include <linux/kasan.h>
 #include 
 #include 
 #include 
@@ -683,9 +684,13 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
 int create_hyp_mappings(void *from, void *to, pgprot_t prot)
 {
phys_addr_t phys_addr;
-   unsigned long virt_addr;
-   unsigned long start = kern_hyp_va((unsigned long)from);
-   unsigned long end = kern_hyp_va((unsigned long)to);
+   unsigned long virt_addr, start, end;
+
+   from = khwasan_reset_tag(from);
+   to = khwasan_reset_tag(to);
+
+   start = kern_hyp_va((unsigned long)from);
+   end = kern_hyp_va((unsigned long)to);
 
if (is_kernel_in_hyp_mode())
return 0;
@@ -719,8 +724,13 @@ int create_hyp_mappings(void *from, void *to, pgprot_t prot)
  */
 int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 {
-   unsigned long start = kern_hyp_va((unsigned long)from);
-   unsigned long end = kern_hyp_va((unsigned long)to);
+   unsigned long start, end;
+
+   from = khwasan_reset_tag(from);
+   to = khwasan_reset_tag(to);
+
+   start = kern_hyp_va((unsigned long)from);
+   end = kern_hyp_va((unsigned long)to);
 
if (is_kernel_in_hyp_mode())
return 0;
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 08/15] khwasan: add tag related helper functions

2018-03-23 Thread Andrey Konovalov
This commit adds a few helper functions that are meant to be used to
work with tags embedded in the top byte of kernel pointers: to set, to
get, or to reset (set to 0xff) the top byte.
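
A quick illustration of how the helpers compose (tag values hypothetical;
assumes the allocator hooks added later in the series already tag pointers):

	void *p = kmalloc(32, GFP_KERNEL);	  /* top byte: e.g. 0x5b */
	u8 tag = khwasan_get_tag(p);		  /* 0x5b */
	void *untagged = khwasan_reset_tag(p);	  /* top byte back to 0xff */
	void *q = khwasan_set_tag(untagged, tag); /* q == p again */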

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/mm/kasan_init.c |  2 ++
 include/linux/kasan.h  | 23 +
 mm/kasan/kasan.h   | 29 ++
 mm/kasan/khwasan.c | 51 ++
 4 files changed, 105 insertions(+)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index d4bceba60010..7fd9aee88069 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -247,6 +247,8 @@ void __init kasan_init(void)
memset(kasan_zero_page, KASAN_SHADOW_INIT, PAGE_SIZE);
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
+   khwasan_init();
+
/* At this point kasan is fully initialized. Enable error messages */
init_task.kasan_depth = 0;
pr_info("KernelAddressSanitizer initialized\n");
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 700734dff218..a2869464a8be 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -155,6 +155,29 @@ static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 
 #define KASAN_SHADOW_INIT 0xFF
 
+void khwasan_init(void);
+
+void *khwasan_set_tag(const void *addr, u8 tag);
+u8 khwasan_get_tag(const void *addr);
+void *khwasan_reset_tag(const void *ptr);
+
+#else /* CONFIG_KASAN_TAGS */
+
+static inline void khwasan_init(void) { }
+
+static inline void *khwasan_set_tag(const void *addr, u8 tag)
+{
+   return (void *)addr;
+}
+static inline u8 khwasan_get_tag(const void *addr)
+{
+   return 0xFF;
+}
+static inline void *khwasan_reset_tag(const void *ptr)
+{
+   return (void *)ptr;
+}
+
 #endif /* CONFIG_KASAN_TAGS */
 
 #endif /* LINUX_KASAN_H */
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index 2be31754278e..c715b44c4780 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -113,6 +113,35 @@ void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
 void kasan_report_invalid_free(void *object, unsigned long ip);
 
+#define KHWASAN_TAG_KERNEL	0xFF /* native kernel pointers tag */
+#define KHWASAN_TAG_INVALID	0xFE /* redzone or free memory tag */
+#define KHWASAN_TAG_MAX		0xFD /* maximum value for random tags */
+
+#define KHWASAN_TAG_SHIFT 56
+#define KHWASAN_TAG_MASK (0xFFUL << KHWASAN_TAG_SHIFT)
+
+static inline void *set_tag(const void *addr, u8 tag)
+{
+   u64 a = (u64)addr;
+
+   a &= ~KHWASAN_TAG_MASK;
+   a |= ((u64)tag << KHWASAN_TAG_SHIFT);
+
+   return (void *)a;
+}
+
+static inline u8 get_tag(const void *addr)
+{
+   return (u8)((u64)addr >> KHWASAN_TAG_SHIFT);
+}
+
+static inline void *reset_tag(const void *addr)
+{
+   return set_tag(addr, KHWASAN_TAG_KERNEL);
+}
+
+void khwasan_report_invalid_free(void *object, unsigned long ip);
+
 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
 void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
 void quarantine_reduce(void);
diff --git a/mm/kasan/khwasan.c b/mm/kasan/khwasan.c
index 24d75245e9d0..da4b17997c71 100644
--- a/mm/kasan/khwasan.c
+++ b/mm/kasan/khwasan.c
@@ -39,6 +39,57 @@
 #include "kasan.h"
 #include "../slab.h"
 
+int khwasan_enabled;
+
+static DEFINE_PER_CPU(u32, prng_state);
+
+void khwasan_init(void)
+{
+   int cpu;
+
+   for_each_possible_cpu(cpu) {
+   per_cpu(prng_state, cpu) = get_random_u32();
+   }
+   WRITE_ONCE(khwasan_enabled, 1);
+}
+
+/*
+ * If a preemption happens between this_cpu_read and this_cpu_write, the only
+ * side effect is that we'll give the same tag to a few objects allocated in
+ * different contexts. Since KHWASAN is meant to be used as a probabilistic
+ * bug-detection debug feature, this doesn't have a significant negative
+ * impact.
+ *
+ * Ideally the tags would use strong randomness to prevent any attempts to
+ * predict them during explicit exploit attempts. But strong randomness is
+ * expensive, and we made an intentional trade-off to use a PRNG. This
+ * non-atomic RMW sequence has in fact a positive effect, since interrupts
+ * that randomly skew the PRNG at unpredictable points do only good.
+ */
+static inline u8 khwasan_random_tag(void)
+{
+   u32 state = this_cpu_read(prng_state);
+
+   state = 1664525 * state + 1013904223;
+   this_cpu_write(prng_state, state);
+
+   return (u8)state % (KHWASAN_TAG_MAX + 1);
+}
+
+void *khwasan_set_tag(const void *addr, u8 tag)
+{
+   return set_tag(addr, tag);
+}
+
+u8 khwasan_get_tag(const void *addr)
+{
+   return get_tag(addr);
+}
+
+void *khwasan_reset_tag(const void *addr)
+{
+   return reset_tag(addr);
+}
+
 void kasan_unpoison_shadow(const void *address, size_t size)
 {
 }
-- 
2.17.0.rc0.231.g781580f067-goog


[RFC PATCH v2 05/15] khwasan: initialize shadow to 0xff

2018-03-23 Thread Andrey Konovalov
A KHWASAN shadow memory cell contains a memory tag that corresponds to
the tag in the top byte of the pointer that points to that memory. The
native top byte value of kernel pointers is 0xff, so with KHWASAN we
need to initialize shadow memory to 0xff. This commit does that.
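
As a sketch of the mapping this relies on (1:16 scale; helper names as in
this series, the exact shadow offset is configuration-specific):

	/* one shadow byte covers 16 bytes; a "clean" cell holds 0xff */
	u8 *shadow = (u8 *)kasan_mem_to_shadow(reset_tag(addr));
	bool match = (*shadow == get_tag(addr));

With the shadow initialized to 0xff, any access through a native (untagged)
kernel pointer matches by construction.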

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/mm/kasan_init.c | 11 ++-
 include/linux/kasan.h  |  8 
 mm/kasan/common.c  |  7 +++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c
index dabfc1ecda3d..d4bceba60010 100644
--- a/arch/arm64/mm/kasan_init.c
+++ b/arch/arm64/mm/kasan_init.c
@@ -90,6 +90,10 @@ static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr,
do {
phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page)
  : kasan_alloc_zeroed_page(node);
+#if KASAN_SHADOW_INIT != 0
+   if (!early)
+   memset(__va(page_phys), KASAN_SHADOW_INIT, PAGE_SIZE);
+#endif
next = addr + PAGE_SIZE;
set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL));
	} while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep)));
@@ -139,6 +143,11 @@ asmlinkage void __init kasan_early_init(void)
KASAN_SHADOW_END - (1UL << (64 - KASAN_SHADOW_SCALE_SHIFT)));
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE));
BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE));
+
+#if KASAN_SHADOW_INIT != 0
+   memset(kasan_zero_page, KASAN_SHADOW_INIT, PAGE_SIZE);
+#endif
+
kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE,
   true);
 }
@@ -235,7 +244,7 @@ void __init kasan_init(void)
		set_pte(&kasan_zero_pte[i],
pfn_pte(sym_to_pfn(kasan_zero_page), PAGE_KERNEL_RO));
 
-   memset(kasan_zero_page, 0, PAGE_SIZE);
+   memset(kasan_zero_page, KASAN_SHADOW_INIT, PAGE_SIZE);
cpu_replace_ttbr1(lm_alias(swapper_pg_dir));
 
/* At this point kasan is fully initialized. Enable error messages */
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 3c45e273a936..700734dff218 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -139,6 +139,8 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; }
 
 #ifdef CONFIG_KASAN_CLASSIC
 
+#define KASAN_SHADOW_INIT 0
+
 void kasan_cache_shrink(struct kmem_cache *cache);
 void kasan_cache_shutdown(struct kmem_cache *cache);
 
@@ -149,4 +151,10 @@ static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 
 #endif /* CONFIG_KASAN_CLASSIC */
 
+#ifdef CONFIG_KASAN_TAGS
+
+#define KASAN_SHADOW_INIT 0xFF
+
+#endif /* CONFIG_KASAN_TAGS */
+
 #endif /* LINUX_KASAN_H */
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 08f6c8cb9f84..f4ccb9425655 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -253,6 +253,9 @@ int kasan_module_alloc(void *addr, size_t size)
__builtin_return_address(0));
 
if (ret) {
+#if KASAN_SHADOW_INIT != 0
+   __memset(ret, KASAN_SHADOW_INIT, shadow_size);
+#endif
find_vm_area(addr)->flags |= VM_KASAN;
kmemleak_ignore(ret);
return 0;
@@ -297,6 +300,10 @@ static int __meminit kasan_mem_notifier(struct notifier_block *nb,
if (!ret)
return NOTIFY_BAD;
 
+#if KASAN_SHADOW_INIT != 0
+   __memset(ret, KASAN_SHADOW_INIT, shadow_end - shadow_start);
+#endif
+
kmemleak_ignore(ret);
return NOTIFY_OK;
}
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 06/15] khwasan, arm64: untag virt address in __kimg_to_phys

2018-03-23 Thread Andrey Konovalov
__kimg_to_phys (which is used by virt_to_phys) assumes that the top byte
of the address is 0xff, which isn't always the case with KHWASAN enabled.
The solution is to reset the tag in __kimg_to_phys.

__lm_to_phys doesn't require any fixups, as it zeroes out the top byte
with the current implementation.

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/include/asm/memory.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h
index febd54ff3354..c13b89257352 100644
--- a/arch/arm64/include/asm/memory.h
+++ b/arch/arm64/include/asm/memory.h
@@ -98,6 +98,10 @@
 #define KASAN_THREAD_SHIFT 0
 #endif
 
+#ifdef CONFIG_KASAN_TAGS
+#define KASAN_PTR_TAG_MASK (UL(0xff) << 56)
+#endif
+
 #define MIN_THREAD_SHIFT   (14 + KASAN_THREAD_SHIFT)
 
 /*
@@ -231,7 +235,12 @@ static inline unsigned long kaslr_offset(void)
 #define __is_lm_address(addr)  (!!((addr) & BIT(VA_BITS - 1)))
 
 #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET)
+
+#ifdef CONFIG_KASAN_TAGS
+#define __kimg_to_phys(addr)   (((addr) | KASAN_PTR_TAG_MASK) - kimage_voffset)
+#else
 #define __kimg_to_phys(addr)   ((addr) - kimage_voffset)
+#endif
 
 #define __virt_to_phys_nodebug(x) ({   \
phys_addr_t __x = (phys_addr_t)(x); \
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 07/15] khwasan, arm64: fix up fault handling logic

2018-03-23 Thread Andrey Konovalov
show_pte in arm64 fault handling relies on the fact that the top byte of
a kernel pointer is 0xff, which isn't always the case with KHWASAN enabled.
Reset the top byte.

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/mm/fault.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index bff11553eb05..234613777f2a 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -32,6 +32,7 @@
 #include 
 #include 
 #include 
+#include <linux/kasan.h>
 
 #include 
 #include 
@@ -133,6 +134,8 @@ void show_pte(unsigned long addr)
pgd_t *pgdp;
pgd_t pgd;
 
+   addr = (unsigned long)khwasan_reset_tag((void *)addr);
+
if (addr < TASK_SIZE) {
/* TTBR0 */
mm = current->active_mm;
-- 
2.17.0.rc0.231.g781580f067-goog



[RFC PATCH v2 03/15] khwasan: add CONFIG_KASAN_CLASSIC and CONFIG_KASAN_TAGS

2018-03-23 Thread Andrey Konovalov
This commit splits the current CONFIG_KASAN config option into two:
1. CONFIG_KASAN_CLASSIC, that enables the classic KASAN version (the one
   that exists now);
2. CONFIG_KASAN_TAGS, that enables KHWASAN.

With CONFIG_KASAN_TAGS enabled, compiler options are changed to instrument
kernel files with -fsanitize=hwaddress (except the ones for which
KASAN_SANITIZE := n is set).

Both CONFIG_KASAN_CLASSIC and CONFIG_KASAN_TAGS support both
CONFIG_KASAN_INLINE and CONFIG_KASAN_OUTLINE instrumentation modes.

This commit also adds empty placeholder (for now) KHWASAN implementation
of KASAN hooks (which KHWASAN reuses) and placeholder implementation
of KHWASAN specific hooks inserted by the compiler.

Signed-off-by: Andrey Konovalov 
---
 arch/arm64/Kconfig |   1 +
 include/linux/compiler-clang.h |   9 ++-
 include/linux/compiler-gcc.h   |   4 ++
 include/linux/compiler.h   |   3 +-
 include/linux/kasan.h  |  16 +++--
 lib/Kconfig.kasan  |  68 +-
 mm/kasan/Makefile  |   6 +-
 mm/kasan/khwasan.c | 127 +
 mm/slub.c  |   2 +-
 scripts/Makefile.kasan |  32 -
 10 files changed, 242 insertions(+), 26 deletions(-)
 create mode 100644 mm/kasan/khwasan.c

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7381eeb7ef8e..759871510f87 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -88,6 +88,7 @@ config ARM64
select HAVE_ARCH_HUGE_VMAP
select HAVE_ARCH_JUMP_LABEL
select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
+   select HAVE_ARCH_KASAN_TAGS if !(ARM64_16K_PAGES && ARM64_VA_BITS_48)
select HAVE_ARCH_KGDB
select HAVE_ARCH_MMAP_RND_BITS
select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT
diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h
index c8f4eea6a5f3..fccebb963ee3 100644
--- a/include/linux/compiler-clang.h
+++ b/include/linux/compiler-clang.h
@@ -24,15 +24,20 @@
 #define KASAN_ABI_VERSION 5
 
 /* emulate gcc's __SANITIZE_ADDRESS__ flag */
-#if __has_feature(address_sanitizer)
+#if __has_feature(address_sanitizer) || __has_feature(hwaddress_sanitizer)
 #define __SANITIZE_ADDRESS__
 #endif
 
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_CLASSIC
 #undef __no_sanitize_address
 #define __no_sanitize_address __attribute__((no_sanitize("kernel-address")))
 #endif
 
+#ifdef CONFIG_KASAN_TAGS
+#undef __no_sanitize_hwaddress
+#define __no_sanitize_hwaddress __attribute__((no_sanitize("hwaddress")))
+#endif
+
 /* Clang doesn't have a way to turn it off per-function, yet. */
 #ifdef __noretpoline
 #undef __noretpoline
diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index e2c7f4369eff..e9bc985c1227 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -344,6 +344,10 @@
 #define __no_sanitize_address
 #endif
 
+#if !defined(__no_sanitize_hwaddress)
+#define __no_sanitize_hwaddress	/* gcc doesn't support KHWASAN */
+#endif
+
 /*
  * A trick to suppress uninitialized variable warning without generating any
  * code
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index ab4711c63601..6142bae513e8 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -195,7 +195,8 @@ void __read_once_size(const volatile void *p, void *res, int size)
  * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
  * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
  */
-# define __no_kasan_or_inline __no_sanitize_address __maybe_unused
+# define __no_kasan_or_inline __no_sanitize_address __no_sanitize_hwaddress \
+ __maybe_unused
 #else
 # define __no_kasan_or_inline __always_inline
 #endif
diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index 3bfebcf7ad2b..3c45e273a936 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -45,8 +45,6 @@ void kasan_free_pages(struct page *page, unsigned int order);
 
 void kasan_cache_create(struct kmem_cache *cache, size_t *size,
slab_flags_t *flags);
-void kasan_cache_shrink(struct kmem_cache *cache);
-void kasan_cache_shutdown(struct kmem_cache *cache);
 
 void kasan_poison_slab(struct page *page);
 void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
@@ -94,8 +92,6 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {}
 static inline void kasan_cache_create(struct kmem_cache *cache,
  size_t *size,
  slab_flags_t *flags) {}
-static inline void kasan_cache_shrink(struct kmem_cache *cache) {}
-static inline void kasan_cache_shutdown(struct kmem_cache *cache) {}
 
 static inline void kasan_poison_slab(struct page *page) {}
 static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
@@ -141,4 +137,16 @@ static inline size_t 

[RFC PATCH v2 00/15] khwasan: kernel hardware assisted address sanitizer

2018-03-23 Thread Andrey Konovalov
Hi! This is the 2nd RFC version of the patchset.

This patchset adds a new mode to KASAN [1], which is called KHWASAN
(Kernel HardWare assisted Address SANitizer). There's still some work to
do and there are a few TODOs in the code, so I'm publishing this as an RFC
to collect some initial feedback.

The plan is to implement HWASan [2] for the kernel with the expectation
that it will have performance comparable to KASAN while consuming much
less memory, trading that off for somewhat imprecise bug detection and
being supported only for arm64.

The overall idea of the approach used by KHWASAN is the following:

1. By using the Top Byte Ignore arm64 CPU feature, we can store pointer
   tags in the top byte of each kernel pointer.

2. Using shadow memory, we can store memory tags for each chunk of kernel
   memory.

3. On each memory allocation, we can generate a random tag, embed it into
   the returned pointer and set the memory tags that correspond to this
   chunk of memory to the same value.

4. By using compiler instrumentation, before each memory access we can add
   a check that the pointer tag matches the tag of the memory that is being
   accessed.

5. On a tag mismatch we report an error.
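
In pseudocode, the inserted check boils down to something like this (a
sketch, not actual compiler output; helper names follow the patches in
this series):

	u8 ptr_tag = get_tag(addr);	/* tag from bits 63:56 of the pointer */
	u8 *shadow = (u8 *)kasan_mem_to_shadow(reset_tag(addr));
	if (ptr_tag != 0xff && *shadow != ptr_tag)	/* 0xff: native, unchecked */
		khwasan_report((unsigned long)addr, size, write, _RET_IP_);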

[1] https://www.kernel.org/doc/html/latest/dev-tools/kasan.html

[2] http://clang.llvm.org/docs/HardwareAssistedAddressSanitizerDesign.html


== Technical details

KHWASAN is implemented in a very similar way to KASAN. This patchset
essentially does the following:

1. TCR_TBI1 is set to enable Top Byte Ignore.

2. Shadow memory is used (with a different scale, 1:16, so each shadow
   byte corresponds to 16 bytes of kernel memory) to store memory tags.

3. All slab objects are aligned to shadow scale, which is 16 bytes.

4. All pointers returned from the slab allocator are tagged with a random
   tag and the corresponding shadow memory is poisoned with the same value.

5. Compiler instrumentation is used to insert tag checks. Either by
   calling callbacks or by inlining them (CONFIG_KASAN_OUTLINE and
   CONFIG_KASAN_INLINE flags are reused).

6. When a tag mismatch is detected in callback instrumentation mode
   KHWASAN simply prints a bug report. In case of inline instrumentation,
   clang inserts a brk instruction, and KHWASAN has its own brk handler,
   which reports the bug.

7. The memory in between slab objects is marked with a reserved tag, and
   acts as a redzone.

8. When a slab object is freed it's marked with a reserved tag.

Bug detection is imprecise for two reasons:

1. We won't catch some small out-of-bounds accesses that fall into the
   same shadow cell as the last byte of a slab object.

2. We only have 1 byte to store tags, which means we have a 1/256
   probability of a tag match for an incorrect access (actually even
   slightly less due to reserved tag values).


== Benchmarks

The following numbers were collected on Odroid C2 board. Both KASAN and
KHWASAN were used in inline instrumentation mode. These are the numbers
I got with the current prototype and they might change.

Boot time [1]:
* ~4.5 sec for clean kernel
* ~5.0 sec for KASAN
* ~5.1 sec for KHWASAN

Slab memory usage after boot [2]:
* ~32 kb for clean kernel
* ~95 kb + 1/8th shadow ~= 107 kb for KASAN
* ~38 kb + 1/16th shadow ~= 40 kb for KHWASAN

Network performance [3]:
* 11.9 Gbits/sec for clean kernel
* 3.08 Gbits/sec for KASAN
* 3.02 Gbits/sec for KHWASAN

Note, that KHWASAN (compared to KASAN) doesn't require quarantine.

[1] Time before the ext4 driver is initialized.
[2] Measured as `cat /proc/meminfo | grep Slab`.
[3] Measured as `iperf -s & iperf -c 127.0.0.1 -t 30`.


== Some notes

A few notes:

1. The patchset can be found here:
   https://github.com/xairy/kasan-prototype/tree/khwasan

2. Building requires a recent LLVM version (r325711 or later) with this
   patch applied: https://reviews.llvm.org/D44827.

3. Stack instrumentation is not supported yet (in progress).

4. There are still a few TODOs in the code, that need to be addressed.


== Changes

Changes in RFC v2:
- Removed explicit casts to u8 * for kasan_mem_to_shadow() calls.
- Introduced KASAN_TCR_FLAGS for setting the TCR_TBI1 flag.
- Added a comment regarding the non-atomic RMW sequence in
  khwasan_random_tag().
- Made all tag related functions accept const void *.
- Untagged pointers in __kimg_to_phys, which is used by virt_to_phys.
- Untagged pointers in show_pte in fault handling logic.
- Untagged pointers passed to KVM.
- Added two reserved tag values: 0xFF and 0xFE.
- Used the reserved tag 0xFF to disable validity checking (to resolve the
  issue with pointer tag being lost after page_address + kmap usage).
- Used the reserved tag 0xFE to mark redzones and freed objects.
- Added mnemonics for esr manipulation in KHWASAN brk handler.
- Added a comment about the -recover flag.
- Some minor cleanups and fixes.
- Rebased onto 3215b9d5 (4.16-rc6+).
- Tested on real hardware (Odroid C2 board).
- Added better 

[RFC PATCH v2 01/15] khwasan, mm: change kasan hooks signatures

2018-03-23 Thread Andrey Konovalov
KHWASAN will change the value of the top byte of pointers returned from the
kernel allocation functions (such as kmalloc). This patch updates KASAN
hooks signatures and their usage in SLAB and SLUB code to reflect that.

Signed-off-by: Andrey Konovalov 
---
 include/linux/kasan.h | 34 +++---
 mm/kasan/kasan.c  | 24 ++--
 mm/slab.c | 12 ++--
 mm/slab.h |  2 +-
 mm/slab_common.c  |  4 ++--
 mm/slub.c | 16 
 6 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/include/linux/kasan.h b/include/linux/kasan.h
index adc13474a53b..3bfebcf7ad2b 100644
--- a/include/linux/kasan.h
+++ b/include/linux/kasan.h
@@ -53,14 +53,14 @@ void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
 void kasan_poison_object_data(struct kmem_cache *cache, void *object);
 void kasan_init_slab_obj(struct kmem_cache *cache, const void *object);
 
-void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
+void *kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags);
 void kasan_kfree_large(void *ptr, unsigned long ip);
 void kasan_poison_kfree(void *ptr, unsigned long ip);
-void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
+void *kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size,
  gfp_t flags);
-void kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
+void *kasan_krealloc(const void *object, size_t new_size, gfp_t flags);
 
-void kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
+void *kasan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags);
 bool kasan_slab_free(struct kmem_cache *s, void *object, unsigned long ip);
 
 struct kasan_cache {
@@ -105,16 +105,28 @@ static inline void kasan_poison_object_data(struct kmem_cache *cache,
 static inline void kasan_init_slab_obj(struct kmem_cache *cache,
const void *object) {}
 
-static inline void kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags) {}
+static inline void *kasan_kmalloc_large(void *ptr, size_t size, gfp_t flags)
+{
+   return ptr;
+}
 static inline void kasan_kfree_large(void *ptr, unsigned long ip) {}
 static inline void kasan_poison_kfree(void *ptr, unsigned long ip) {}
-static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
-   size_t size, gfp_t flags) {}
-static inline void kasan_krealloc(const void *object, size_t new_size,
-gfp_t flags) {}
+static inline void *kasan_kmalloc(struct kmem_cache *s, const void *object,
+   size_t size, gfp_t flags)
+{
+   return (void *)object;
+}
+static inline void *kasan_krealloc(const void *object, size_t new_size,
+gfp_t flags)
+{
+   return (void *)object;
+}
 
-static inline void kasan_slab_alloc(struct kmem_cache *s, void *object,
-  gfp_t flags) {}
+static inline void *kasan_slab_alloc(struct kmem_cache *s, void *object,
+  gfp_t flags)
+{
+   return object;
+}
 static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
   unsigned long ip)
 {
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index e13d911251e7..d8cb63bd1ecc 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -484,9 +484,9 @@ void kasan_init_slab_obj(struct kmem_cache *cache, const void *object)
__memset(alloc_info, 0, sizeof(*alloc_info));
 }
 
-void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
+void *kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
 {
-   kasan_kmalloc(cache, object, cache->object_size, flags);
+   return kasan_kmalloc(cache, object, cache->object_size, flags);
 }
 
 static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
@@ -527,7 +527,7 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
return __kasan_slab_free(cache, object, ip, true);
 }
 
-void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
+void *kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
   gfp_t flags)
 {
unsigned long redzone_start;
@@ -537,7 +537,7 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
quarantine_reduce();
 
if (unlikely(object == NULL))
-   return;
+   return NULL;
 
redzone_start = round_up((unsigned long)(object + size),
KASAN_SHADOW_SCALE_SIZE);
@@ -550,10 +550,12 @@ void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
 
if (cache->flags & SLAB_KASAN)
		set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+
+   return 

[RFC PATCH v2 02/15] khwasan: move common kasan and khwasan code to common.c

2018-03-23 Thread Andrey Konovalov
KHWASAN will reuse a significant part of KASAN code, so move the common
parts to common.c without any functional changes.

Signed-off-by: Andrey Konovalov 
---
 mm/kasan/Makefile |   5 +-
 mm/kasan/common.c | 318 ++
 mm/kasan/kasan.c  | 288 +
 mm/kasan/kasan.h  |   4 +
 4 files changed, 330 insertions(+), 285 deletions(-)
 create mode 100644 mm/kasan/common.c

diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 3289db38bc87..a6df14bffb6b 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,11 +1,14 @@
 # SPDX-License-Identifier: GPL-2.0
 KASAN_SANITIZE := n
+UBSAN_SANITIZE_common.o := n
 UBSAN_SANITIZE_kasan.o := n
 KCOV_INSTRUMENT := n
 
 CFLAGS_REMOVE_kasan.o = -pg
 # Function splitter causes unnecessary splits in __asan_load1/__asan_store1
 # see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+
+CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
 
-obj-y := kasan.o report.o kasan_init.o quarantine.o
+obj-y := common.o kasan.o report.o kasan_init.o quarantine.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
new file mode 100644
index ..08f6c8cb9f84
--- /dev/null
+++ b/mm/kasan/common.c
@@ -0,0 +1,318 @@
+/*
+ * This file contains common KASAN and KHWASAN code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin 
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ *Andrey Konovalov 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "kasan.h"
+#include "../slab.h"
+
+void kasan_enable_current(void)
+{
+   current->kasan_depth++;
+}
+
+void kasan_disable_current(void)
+{
+   current->kasan_depth--;
+}
+
+static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
+{
+   void *base = task_stack_page(task);
+   size_t size = sp - base;
+
+   kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+   __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
+{
+   /*
+* Calculate the task stack base address.  Avoid using 'current'
+* because this function is called by early resume code which hasn't
+* yet set up the percpu register (%gs).
+*/
+   void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+   kasan_unpoison_shadow(base, watermark - base);
+}
+
+/*
+ * Clear all poison for the region between the current SP and a provided
+ * watermark value, as is sometimes required prior to hand-crafted asm function
+ * returns in the middle of functions.
+ */
+void kasan_unpoison_stack_above_sp_to(const void *watermark)
+{
+   const void *sp = __builtin_frame_address(0);
+   size_t size = watermark - sp;
+
+   if (WARN_ON(sp > watermark))
+   return;
+   kasan_unpoison_shadow(sp, size);
+}
+
+void kasan_check_read(const volatile void *p, unsigned int size)
+{
+   check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_read);
+
+void kasan_check_write(const volatile void *p, unsigned int size)
+{
+   check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+   check_memory_region((unsigned long)addr, len, true, _RET_IP_);
+
+   return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+   check_memory_region((unsigned long)src, len, false, _RET_IP_);
+   check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+   return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+   check_memory_region((unsigned long)src, len, false, _RET_IP_);
+   check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+   return __memcpy(dest, src, len);
+}
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+   if (likely(!PageHighMem(page)))
+   kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+   return 

[PATCH v3] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Peng Hao
Add LPI debug info to vgic-state.
The printed info looks like this:
SPI  287  0 0100   0 160  -1
LPI 8192  2 00010000   0 160  -1

Signed-off-by: Peng Hao 
---
 virt/kvm/arm/vgic/vgic-debug.c | 59 ++
 virt/kvm/arm/vgic/vgic-its.c   | 16 ++--
 virt/kvm/arm/vgic/vgic.h   |  1 +
 3 files changed, 63 insertions(+), 13 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
index 10b3817..ddac6bd 100644
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -36,9 +36,12 @@
 struct vgic_state_iter {
int nr_cpus;
int nr_spis;
+   int nr_lpis;
int dist_id;
int vcpu_id;
int intid;
+   int lpi_print_count;
+   struct vgic_irq **lpi_irqs;
 };
 
 static void iter_next(struct vgic_state_iter *iter)
@@ -52,6 +55,39 @@ static void iter_next(struct vgic_state_iter *iter)
if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
++iter->vcpu_id < iter->nr_cpus)
iter->intid = 0;
+
+   if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
+   if (iter->lpi_print_count < iter->nr_lpis)
+   iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
+   iter->lpi_print_count++;
+   }
+}
+
+static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
+{
+   u32 *intids;
+   int i, irq_count;
+   struct vgic_irq *irq = NULL, **lpi_irqs;
+
+   iter->nr_lpis = 0;
+   irq_count = vgic_copy_lpi_list(kvm, NULL, &intids);
+   if (irq_count < 0)
+   return;
+
+   lpi_irqs = kmalloc_array(irq_count, sizeof(irq), GFP_KERNEL);
+   if (!lpi_irqs) {
+   kfree(intids);
+   return;
+   }
+
+   for (i = 0; i < irq_count; i++) {
+   irq = vgic_get_irq(kvm, NULL, intids[i]);
+   if (!irq)
+   continue;
+   lpi_irqs[iter->nr_lpis++] = irq;
+   }
+   iter->lpi_irqs = lpi_irqs;
+   kfree(intids);
 }
 
 static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
@@ -64,6 +100,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
iter->nr_cpus = nr_cpus;
iter->nr_spis = kvm->arch.vgic.nr_spis;
 
+   if (vgic_supports_direct_msis(kvm) && !pos)
+   vgic_debug_get_lpis(kvm, iter);
/* Fast forward to the right position if needed */
while (pos--)
iter_next(iter);
@@ -73,7 +111,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
 {
return iter->dist_id > 0 &&
iter->vcpu_id == iter->nr_cpus &&
-   (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
+   (iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
+   ((iter->nr_lpis == 0) ||
+   (iter->lpi_print_count == iter->nr_lpis + 1));
 }
 
 static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
@@ -130,6 +170,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
 
	mutex_lock(&kvm->lock);
iter = kvm->arch.vgic.iter;
+   kfree(iter->lpi_irqs);
kfree(iter);
kvm->arch.vgic.iter = NULL;
	mutex_unlock(&kvm->lock);
@@ -154,7 +195,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
 struct kvm_vcpu *vcpu)
 {
int id = 0;
-   char *hdr = "SPI ";
+   char *hdr = "Global";
 
if (vcpu) {
hdr = "VCPU";
@@ -162,7 +203,10 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
}
 
seq_printf(s, "\n");
-   seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+   if (vcpu)
+           seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+   else
+           seq_printf(s, "%s TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr);
	seq_printf(s, "---\n");
 }
 
@@ -174,8 +218,10 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
type = "SGI";
else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
type = "PPI";
-   else
+   else if (irq->intid < VGIC_MAX_SPI)
type = "SPI";
+   else if (irq->intid >= VGIC_MIN_LPI)
+   type = "LPI";
 
	if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
print_header(s, irq, vcpu);
@@ -220,7 +266,9 @@ static int vgic_debug_show(struct seq_file *s, void *v)
if (!kvm->arch.vgic.initialized)
return 0;
 
-   if (iter->vcpu_id < iter->nr_cpus) {
+   if (iter->intid >= VGIC_MIN_LPI)
+   irq = iter->lpi_irqs[iter->lpi_print_count - 1];
+   else if (iter->vcpu_id < 

Re: [PATCH] KVM: arm/arm64: vgic-its: Fix potential overrun in vgic_copy_lpi_list

2018-03-23 Thread Andre Przywara
Hi,

On 23/03/18 15:21, Marc Zyngier wrote:
> vgic_copy_lpi_list() parses the LPI list and picks LPIs targetting

  targeting

> a given vcpu. We allocate the array containing the intids before taking
> the lpi_list_lock, which means we can have an array size that is not
> equal to the number of LPIs.
> 
> This is particularily obvious when looking at the path coming from

  particularly

kudos go to Thunderbird's spell checker for underlining those ;-)

> vgic_enable_lpis, which is not a command, and thus can run in parallel
> with commands:
> 
> vcpu 0:                                    vcpu 1:
> vgic_enable_lpis
>   its_sync_lpi_pending_table
>     vgic_copy_lpi_list
>       intids = kmalloc_array(irq_count)
>                                            MAPI(lpi targeting vcpu 0)
>       list_for_each_entry(lpi_list_head)
>         intids[i++] = irq->intid;
> 
> At that stage, we will happily overrun the intids array. Boo. An easy
> fix is to break once the array is full. The MAPI command will update
> the config anyway, and we won't miss a thing. We also make sure that
> lpi_list_count is read exactly once, so that further updates of that
> value will not affect the array bound check.
> 
> Cc: sta...@vger.kernel.org
> Fixes: ccb1d791ab9e ("KVM: arm64: vgic-its: Fix pending table sync")
> Signed-off-by: Marc Zyngier 
> ---
>  virt/kvm/arm/vgic/vgic-its.c | 15 +--
>  1 file changed, 9 insertions(+), 6 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
> index 465095355666..44ee2f336440 100644
> --- a/virt/kvm/arm/vgic/vgic-its.c
> +++ b/virt/kvm/arm/vgic/vgic-its.c
> @@ -316,21 +316,24 @@ static int vgic_copy_lpi_list(struct kvm_vcpu *vcpu, u32 **intid_ptr)
>   struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>   struct vgic_irq *irq;
>   u32 *intids;
> - int irq_count = dist->lpi_list_count, i = 0;
> + int irq_count, i = 0;
>  
>   /*
> -  * We use the current value of the list length, which may change
> -  * after the kmalloc. We don't care, because the guest shouldn't
> -  * change anything while the command handling is still running,
> -  * and in the worst case we would miss a new IRQ, which one wouldn't
> -  * expect to be covered by this command anyway.
> +  * There is an obvious race between allocating the array and LPIs
> +  * being mapped/unmapped. If we ended up here as a result of a
> +  * command, we're safe (locks are held, preventing another
> +  * command). If coming from another path (such as enabling LPIs),
> +  * we must be carefull not to overrun the array.

  careful

But enough of Friday afternoon nits. Since I can't find anything else:

Reviewed-by: Andre Przywara 

Thanks!
Andre

>*/
> + irq_count = READ_ONCE(dist->lpi_list_count);
>   intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
>   if (!intids)
>   return -ENOMEM;
>  
>   spin_lock(&dist->lpi_list_lock);
>   list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
> + if (i == irq_count)
> + break;
>   /* We don't need to "get" the IRQ, as we hold the list lock. */
>   if (irq->target_vcpu != vcpu)
>   continue;
> 
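
For reference, the shape of this fix is a general pattern: read the
shared count exactly once, size the buffer from that snapshot, and then
treat the snapshot as a hard bound while walking the list under the
lock. A minimal userspace sketch of the same pattern (illustrative
names only, not the kernel code):

#include <stdlib.h>

struct node { int id; struct node *next; };

/*
 * Copy at most a snapshot's worth of ids out of a list that may keep
 * growing concurrently.  The count is read once, before the allocation,
 * and then used as the loop bound, so a racing insertion can never
 * overrun the array.
 */
static int snapshot_ids(struct node *head, const int *shared_count,
                        int **ids_out)
{
        int count = __atomic_load_n(shared_count, __ATOMIC_RELAXED);
        int *ids;
        int i = 0;

        if (count == 0) {
                *ids_out = NULL;
                return 0;
        }

        ids = malloc(count * sizeof(*ids));
        if (!ids)
                return -1;

        /* the list lock would be taken here in the real code */
        for (struct node *n = head; n; n = n->next) {
                if (i == count)
                        break;          /* bound by the snapshot */
                ids[i++] = n->id;
        }
        /* ...and released here */

        *ids_out = ids;
        return i;       /* may be smaller than count if entries vanished */
}

Entries mapped after the snapshot are simply not seen, which matches the
reasoning in the commit message: a new LPI would not be expected to be
covered by the command anyway.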


Re: [PATCH v2] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Marc Zyngier
On 23/03/18 23:01, Peng Hao wrote:
> Add LPI debug info to vgic-state.
> The printed info looks like this:
> SPI  287  0 0100   0 160  -1
> LPI 8192  2 00010000   0 160  -1
> 
> Signed-off-by: Peng Hao 
> ---
>  virt/kvm/arm/vgic/vgic-debug.c | 56 ++
>  1 file changed, 51 insertions(+), 5 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
> index 10b3817..cb85550 100644
> --- a/virt/kvm/arm/vgic/vgic-debug.c
> +++ b/virt/kvm/arm/vgic/vgic-debug.c
> @@ -36,9 +36,12 @@
>  struct vgic_state_iter {
>   int nr_cpus;
>   int nr_spis;
> + int nr_lpis;
>   int dist_id;
>   int vcpu_id;
>   int intid;
> + int lpi_print_count;
> + struct vgic_irq **lpi_irqs;
>  };
>  
>  static void iter_next(struct vgic_state_iter *iter)
> @@ -52,6 +55,35 @@ static void iter_next(struct vgic_state_iter *iter)
>   if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
>   ++iter->vcpu_id < iter->nr_cpus)
>   iter->intid = 0;
> +
> + if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
> + if (iter->lpi_print_count < iter->nr_lpis)
> + iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
> + iter->lpi_print_count++;
> + }
> +}
> +
> +static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
> +{
> + struct vgic_dist *dist = &kvm->arch.vgic;
> + int i = 0;
> + struct vgic_irq *irq = NULL, **lpi_irqs;
> +
> + iter->nr_lpis = dist->lpi_list_count;
> + lpi_irqs = kmalloc_array(iter->nr_lpis, sizeof(irq), GFP_KERNEL);
> + if (!lpi_irqs) {
> + iter->nr_lpis = 0;
> + return;
> + }
> + spin_lock(&dist->lpi_list_lock);
> + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
> + vgic_get_irq_kref(irq);
> + if (i < iter->nr_lpis)
> + lpi_irqs[i++] = irq;
> + }
> + iter->nr_lpis = i;
> + spin_unlock(&dist->lpi_list_lock);
> + iter->lpi_irqs = lpi_irqs;

I've already explained why I didn't like this construct.

I still don't like it.

M.
-- 
Jazz is not dead. It just smells funny...


Re: [PATCH 2/4] iommu/virtio: Add probe request

2018-03-23 Thread Robin Murphy

On 14/02/18 14:53, Jean-Philippe Brucker wrote:

When the device offers the probe feature, send a probe request for each
device managed by the IOMMU. Extract RESV_MEM information. When we
encounter an MSI doorbell region, set it up as an IOMMU_RESV_MSI region.
This will tell other subsystems that there is no need to map the MSI
doorbell in the virtio-iommu, because MSIs bypass it.

Signed-off-by: Jean-Philippe Brucker 
---
  drivers/iommu/virtio-iommu.c  | 163 --
  include/uapi/linux/virtio_iommu.h |  37 +
  2 files changed, 193 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index a9c9245e8ba2..3ac4b38eaf19 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -45,6 +45,7 @@ struct viommu_dev {
	struct iommu_domain_geometry geometry;
u64 pgsize_bitmap;
u8  domain_bits;
+   u32 probe_size;
  };
  
  struct viommu_mapping {

@@ -72,6 +73,7 @@ struct viommu_domain {
  struct viommu_endpoint {
struct viommu_dev   *viommu;
	struct viommu_domain *vdomain;
+   struct list_head resv_regions;
  };
  
  struct viommu_request {

@@ -140,6 +142,10 @@ static int viommu_get_req_size(struct viommu_dev *viommu,
case VIRTIO_IOMMU_T_UNMAP:
size = sizeof(r->unmap);
break;
+   case VIRTIO_IOMMU_T_PROBE:
+   *bottom += viommu->probe_size;
+   size = sizeof(r->probe) + *bottom;
+   break;
default:
return -EINVAL;
}
@@ -448,6 +454,105 @@ static int viommu_replay_mappings(struct viommu_domain *vdomain)
return ret;
  }
  
+static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
+  struct virtio_iommu_probe_resv_mem *mem,
+  size_t len)
+{
+   struct iommu_resv_region *region = NULL;
+   unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
+
+   u64 addr = le64_to_cpu(mem->addr);
+   u64 size = le64_to_cpu(mem->size);
+
+   if (len < sizeof(*mem))
+   return -EINVAL;
+
+   switch (mem->subtype) {
+   case VIRTIO_IOMMU_RESV_MEM_T_MSI:
+   region = iommu_alloc_resv_region(addr, size, prot,
+IOMMU_RESV_MSI);
+   break;
+   case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
+   default:
+   region = iommu_alloc_resv_region(addr, size, 0,
+IOMMU_RESV_RESERVED);
+   break;
+   }
+
+   list_add(&vdev->resv_regions, &region->list);
+
+   /*
+* Treat unknown subtype as RESERVED, but urge users to update their
+* driver.
+*/
+   if (mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_RESERVED &&
+   mem->subtype != VIRTIO_IOMMU_RESV_MEM_T_MSI)
+   pr_warn("unknown resv mem subtype 0x%x\n", mem->subtype);


Might as well avoid the extra comparisons by incorporating this into the 
switch statement, i.e.:


default:
dev_warn(vdev->viommu_dev->dev, ...);
/* Fallthrough */
case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
...

(dev_warn is generally preferable to pr_warn when feasible)
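
Spelled out, the restructured helper might look like the following.
This is only a sketch of the suggestion above: it assumes the 'viommu'
and 'resv_regions' fields from the quoted patch, and uses the
conventional list_add(new, head) argument order:

static int viommu_add_resv_mem(struct viommu_endpoint *vdev,
			       struct virtio_iommu_probe_resv_mem *mem,
			       size_t len)
{
	struct iommu_resv_region *region;
	unsigned long prot = IOMMU_WRITE | IOMMU_NOEXEC | IOMMU_MMIO;
	u64 addr = le64_to_cpu(mem->addr);
	u64 size = le64_to_cpu(mem->size);

	if (len < sizeof(*mem))
		return -EINVAL;

	switch (mem->subtype) {
	default:
		/* urge users to update their driver, then treat as RESERVED */
		dev_warn(vdev->viommu->dev, "unknown resv mem subtype 0x%x\n",
			 mem->subtype);
		/* Fallthrough */
	case VIRTIO_IOMMU_RESV_MEM_T_RESERVED:
		region = iommu_alloc_resv_region(addr, size, 0,
						 IOMMU_RESV_RESERVED);
		break;
	case VIRTIO_IOMMU_RESV_MEM_T_MSI:
		region = iommu_alloc_resv_region(addr, size, prot,
						 IOMMU_RESV_MSI);
		break;
	}
	if (!region)
		return -ENOMEM;

	list_add(&region->list, &vdev->resv_regions);
	return 0;
}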


+
+   return 0;
+}
+
+static int viommu_probe_endpoint(struct viommu_dev *viommu, struct device *dev)
+{
+   int ret;
+   u16 type, len;
+   size_t cur = 0;
+   struct virtio_iommu_req_probe *probe;
+   struct virtio_iommu_probe_property *prop;
+   struct iommu_fwspec *fwspec = dev->iommu_fwspec;
+   struct viommu_endpoint *vdev = fwspec->iommu_priv;
+
+   if (!fwspec->num_ids)
+   /* Trouble ahead. */
+   return -EINVAL;
+
+   probe = kzalloc(sizeof(*probe) + viommu->probe_size +
+   sizeof(struct virtio_iommu_req_tail), GFP_KERNEL);
+   if (!probe)
+   return -ENOMEM;
+
+   probe->head.type = VIRTIO_IOMMU_T_PROBE;
+   /*
+* For now, assume that properties of an endpoint that outputs multiple
+* IDs are consistent. Only probe the first one.
+*/
+   probe->endpoint = cpu_to_le32(fwspec->ids[0]);
+
+   ret = viommu_send_req_sync(viommu, probe);
+   if (ret)
+   goto out_free;
+
+   prop = (void *)probe->properties;
+   type = le16_to_cpu(prop->type) & VIRTIO_IOMMU_PROBE_T_MASK;
+
+   while (type != VIRTIO_IOMMU_PROBE_T_NONE &&
+  cur < viommu->probe_size) {
+   len = le16_to_cpu(prop->length);
+
+   switch (type) {
+   case VIRTIO_IOMMU_PROBE_T_RESV_MEM:
+   ret = viommu_add_resv_mem(vdev, (void *)prop->value, len);
+   break;
+

Re: [PATCH 1/4] iommu: Add virtio-iommu driver

2018-03-23 Thread Robin Murphy

On 14/02/18 14:53, Jean-Philippe Brucker wrote:

The virtio IOMMU is a para-virtualized device that allows sending IOMMU
requests such as map/unmap over virtio-mmio transport without emulating
page tables. This implementation handles ATTACH, DETACH, MAP and UNMAP
requests.

The bulk of the code transforms calls coming from the IOMMU API into
corresponding virtio requests. Mappings are kept in an interval tree
instead of page tables.

Signed-off-by: Jean-Philippe Brucker 
---
  MAINTAINERS   |   6 +
  drivers/iommu/Kconfig |  11 +
  drivers/iommu/Makefile|   1 +
  drivers/iommu/virtio-iommu.c  | 960 ++
  include/uapi/linux/virtio_ids.h   |   1 +
  include/uapi/linux/virtio_iommu.h | 116 +
  6 files changed, 1095 insertions(+)
  create mode 100644 drivers/iommu/virtio-iommu.c
  create mode 100644 include/uapi/linux/virtio_iommu.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 3bdc260e36b7..2a181924d420 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -14818,6 +14818,12 @@ S: Maintained
  F:	drivers/virtio/virtio_input.c
  F:	include/uapi/linux/virtio_input.h
  
+VIRTIO IOMMU DRIVER
+M: Jean-Philippe Brucker 
+S: Maintained
+F: drivers/iommu/virtio-iommu.c
+F: include/uapi/linux/virtio_iommu.h
+
  VIRTUAL BOX GUEST DEVICE DRIVER
  M:	Hans de Goede 
  M:	Arnd Bergmann 
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index f3a21343e636..1ea0ec74524f 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -381,4 +381,15 @@ config QCOM_IOMMU
help
  Support for IOMMU on certain Qualcomm SoCs.
  
+config VIRTIO_IOMMU
+   bool "Virtio IOMMU driver"
+   depends on VIRTIO_MMIO
+   select IOMMU_API
+   select INTERVAL_TREE
+   select ARM_DMA_USE_IOMMU if ARM
+   help
+ Para-virtualised IOMMU driver with virtio.
+
+ Say Y here if you intend to run this kernel as a guest.
+
  endif # IOMMU_SUPPORT
diff --git a/drivers/iommu/Makefile b/drivers/iommu/Makefile
index 1fb695854809..9c68be1365e1 100644
--- a/drivers/iommu/Makefile
+++ b/drivers/iommu/Makefile
@@ -29,3 +29,4 @@ obj-$(CONFIG_EXYNOS_IOMMU) += exynos-iommu.o
  obj-$(CONFIG_FSL_PAMU) += fsl_pamu.o fsl_pamu_domain.o
  obj-$(CONFIG_S390_IOMMU) += s390-iommu.o
  obj-$(CONFIG_QCOM_IOMMU) += qcom_iommu.o
+obj-$(CONFIG_VIRTIO_IOMMU) += virtio-iommu.o
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
new file mode 100644
index ..a9c9245e8ba2
--- /dev/null
+++ b/drivers/iommu/virtio-iommu.c
@@ -0,0 +1,960 @@
+/*
+ * Virtio driver for the paravirtualized IOMMU
+ *
+ * Copyright (C) 2018 ARM Limited
+ * Author: Jean-Philippe Brucker 
+ *
+ * SPDX-License-Identifier: GPL-2.0


This wants to be a // comment at the very top of the file (thankfully 
the policy is now properly documented in-tree since 
Documentation/process/license-rules.rst got merged)



+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include 
+
+#define MSI_IOVA_BASE  0x8000000
+#define MSI_IOVA_LENGTH 0x100000
+
+struct viommu_dev {
+   struct iommu_device iommu;
+   struct device   *dev;
+   struct virtio_device *vdev;
+
+   struct ida  domain_ids;
+
+   struct virtqueue *vq;
+   /* Serialize anything touching the request queue */
+   spinlock_t  request_lock;
+
+   /* Device configuration */
+   struct iommu_domain_geometry geometry;
+   u64 pgsize_bitmap;
+   u8  domain_bits;
+};
+
+struct viommu_mapping {
+   phys_addr_t paddr;
+   struct interval_tree_node   iova;
+   union {
+   struct virtio_iommu_req_map map;
+   struct virtio_iommu_req_unmap unmap;
+   } req;
+};
+
+struct viommu_domain {
+   struct iommu_domain domain;
+   struct viommu_dev   *viommu;
+   struct mutex mutex;
+   unsigned int id;
+
+   spinlock_t  mappings_lock;
+   struct rb_root_cached   mappings;
+
+   /* Number of endpoints attached to this domain */
+   unsigned long   endpoints;
+};
+
+struct viommu_endpoint {
+   struct viommu_dev   *viommu;
+   struct viommu_domain*vdomain;
+};
+
+struct viommu_request {
+   struct scatterlist  top;
+   struct scatterlist  bottom;
+
+   int 

Re: [PATCH] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Marc Zyngier
[fixing Christoffer's email address]

On 23/03/18 13:33, peng.h...@zte.com.cn wrote:
>> On 23/03/18 10:36, Peng Hao wrote:
>>> Add LPI debug info to vgic-state.
>>> The printed info looks like this:
>>>  SPI  287  0 0100   0 160  -1
>>>  LPI 8192  2 00010000   0 160  -1
>>>
>>> Signed-off-by: Peng Hao 
>>> ---
>>>  virt/kvm/arm/vgic/vgic-debug.c | 61 ++
>>>  1 file changed, 56 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
>>> index 10b3817..444115e 100644
>>> --- a/virt/kvm/arm/vgic/vgic-debug.c
>>> +++ b/virt/kvm/arm/vgic/vgic-debug.c
>>> @@ -36,9 +36,12 @@
>>>  struct vgic_state_iter {
>>>  int nr_cpus;
>>>  int nr_spis;
>>> +int nr_lpis;
>>>  int dist_id;
>>>  int vcpu_id;
>>>  int intid;
>>> +int lpi_print_count;
>>> +struct vgic_irq **lpi_irqs;
>>>  };
>>>  
>>>  static void iter_next(struct vgic_state_iter *iter)
>>> @@ -52,6 +55,40 @@ static void iter_next(struct vgic_state_iter *iter)
>>>  if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
>>>  ++iter->vcpu_id < iter->nr_cpus)
>>>  iter->intid = 0;
>>> +
>>> +if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
>>> +if (iter->lpi_print_count < iter->nr_lpis)
>>> +iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
>>> +iter->lpi_print_count++;
>>> +}
>>> +}
>>> +
>>> +static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
>>> +{
>>> +struct vgic_dist *dist = &kvm->arch.vgic;
>>> +int i = 0;
>>> +struct vgic_irq *irq = NULL, **lpi_irqs;
>>> +
>>> +again:
>>> +iter->nr_lpis = dist->lpi_list_count;
>>> +lpi_irqs = kmalloc_array(iter->nr_lpis, sizeof(irq), GFP_KERNEL);
>>> +if (!lpi_irqs) {
>>> +iter->nr_lpis = 0;
>>> +return;
>>> +}
>>> +spin_lock(&dist->lpi_list_lock);
>>> +if (iter->nr_lpis != dist->lpi_list_count) {
>>> +kfree(lpi_irqs);
>>> +spin_unlock(&dist->lpi_list_lock);
>>> +goto again;
>>> +}
> 
>> Why do we need an exact count? It is fine to have a transient count, and
>> the debug code should be able to cope with that without performing this
>> terrible loop.
> Yeah, it is enough to have a rough count for the debug code.
>> We also already have some code that snapshots the LPIs (see
>> vgic_copy_lpi_list), so please consider reusing that instead.
> I can't reuse vgic_copy_lpi_list. It snapshots based on an LPI's target vcpu.

Guess what? You can also change it!

>>> +
>>> +list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>>> +vgic_get_irq_kref(irq);
>>> +lpi_irqs[i++] = irq;
>>> +}
>>> +spin_unlock(&dist->lpi_list_lock);
>>> +iter->lpi_irqs = lpi_irqs;
> 
>> Messing with the internals of the refcounts is really a bad idea. Please
>> use vgic_get_irq() in conjunction with the above, and allow it to fail
>> gracefully.
>   vgic_get_irq requires an intid as input, and the vgic_get_lpi that
> vgic_get_irq calls will traverse the lpi_list, taking lpi_list_lock
> again, but here I already hold lpi_list_lock. So I think calling
> vgic_get_irq_kref while holding lpi_list_lock is safe.

It is safe but terribly ugly. Traversing the LPI list is completely
fine, and we have this API for a reason (not reinventing the wheel). I
don't think the debug code should sidestep it. If the list proves to be
too slow to traverse, then we should adopt a better data structure.

>>>  }
>>>  
>>>  static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
>>> @@ -64,6 +101,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
>>>  iter->nr_cpus = nr_cpus;
>>>  iter->nr_spis = kvm->arch.vgic.nr_spis;
>>>  
>>> +if (vgic_supports_direct_msis(kvm) && !pos)
>>> +vgic_debug_get_lpis(kvm, iter);

BTW, what's the point of this?

>>>  /* Fast forward to the right position if needed */
>>>  while (pos--)
>>>  iter_next(iter);
>>> @@ -73,7 +112,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
>>>  {
>>>  return iter->dist_id > 0 &&
>>>  iter->vcpu_id == iter->nr_cpus &&
>>> -(iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
>>> +(iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
>>> +((iter->nr_lpis == 0) ||
>>> +(iter->lpi_print_count == iter->nr_lpis + 1));
>>>  }
>>>  
>>>  static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
>>> @@ -130,6 +171,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
>>>  
>>>  mutex_lock(&kvm->lock);
>>>  iter = kvm->arch.vgic.iter;
>>> +kfree(iter->lpi_irqs);
>>>  kfree(iter);
>>>  kvm->arch.vgic.iter = NULL;
>>>  mutex_unlock(&kvm->lock);
>>> @@ -154,7 +196,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>>>   struct kvm_vcpu *vcpu)

[PATCH v2] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Peng Hao
Add LPI debug info to vgic-state.
The printed info looks like this:
SPI  287  0 0100   0 160  -1
LPI 8192  2 00010000   0 160  -1

Signed-off-by: Peng Hao 
---
 virt/kvm/arm/vgic/vgic-debug.c | 56 ++
 1 file changed, 51 insertions(+), 5 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
index 10b3817..cb85550 100644
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -36,9 +36,12 @@
 struct vgic_state_iter {
int nr_cpus;
int nr_spis;
+   int nr_lpis;
int dist_id;
int vcpu_id;
int intid;
+   int lpi_print_count;
+   struct vgic_irq **lpi_irqs;
 };
 
 static void iter_next(struct vgic_state_iter *iter)
@@ -52,6 +55,35 @@ static void iter_next(struct vgic_state_iter *iter)
if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
++iter->vcpu_id < iter->nr_cpus)
iter->intid = 0;
+
+   if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
+   if (iter->lpi_print_count < iter->nr_lpis)
+   iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
+   iter->lpi_print_count++;
+   }
+}
+
+static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
+{
+   struct vgic_dist *dist = &kvm->arch.vgic;
+   int i = 0;
+   struct vgic_irq *irq = NULL, **lpi_irqs;
+
+   iter->nr_lpis = dist->lpi_list_count;
+   lpi_irqs = kmalloc_array(iter->nr_lpis, sizeof(irq), GFP_KERNEL);
+   if (!lpi_irqs) {
+   iter->nr_lpis = 0;
+   return;
+   }
+   spin_lock(&dist->lpi_list_lock);
+   list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+   vgic_get_irq_kref(irq);
+   if (i < iter->nr_lpis)
+   lpi_irqs[i++] = irq;
+   }
+   iter->nr_lpis = i;
+   spin_unlock(&dist->lpi_list_lock);
+   iter->lpi_irqs = lpi_irqs;
 }
 
 static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
@@ -64,6 +96,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
iter->nr_cpus = nr_cpus;
iter->nr_spis = kvm->arch.vgic.nr_spis;
 
+   if (vgic_supports_direct_msis(kvm) && !pos)
+   vgic_debug_get_lpis(kvm, iter);
/* Fast forward to the right position if needed */
while (pos--)
iter_next(iter);
@@ -73,7 +107,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
 {
return iter->dist_id > 0 &&
iter->vcpu_id == iter->nr_cpus &&
-   (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
+   (iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
+   ((iter->nr_lpis == 0) ||
+   (iter->lpi_print_count == iter->nr_lpis + 1));
 }
 
 static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
@@ -130,6 +166,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
 
	mutex_lock(&kvm->lock);
iter = kvm->arch.vgic.iter;
+   kfree(iter->lpi_irqs);
kfree(iter);
kvm->arch.vgic.iter = NULL;
	mutex_unlock(&kvm->lock);
@@ -154,7 +191,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
 struct kvm_vcpu *vcpu)
 {
int id = 0;
-   char *hdr = "SPI ";
+   char *hdr = "Global";
 
if (vcpu) {
hdr = "VCPU";
@@ -162,7 +199,10 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
}
 
seq_printf(s, "\n");
-   seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+   if (vcpu)
+           seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+   else
+           seq_printf(s, "%s TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr);
	seq_printf(s, "---\n");
 }
 
@@ -174,8 +214,10 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
type = "SGI";
else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
type = "PPI";
-   else
+   else if (irq->intid < VGIC_MAX_SPI)
type = "SPI";
+   else if (irq->intid >= VGIC_MIN_LPI)
+   type = "LPI";
 
	if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
print_header(s, irq, vcpu);
@@ -220,7 +262,9 @@ static int vgic_debug_show(struct seq_file *s, void *v)
if (!kvm->arch.vgic.initialized)
return 0;
 
-   if (iter->vcpu_id < iter->nr_cpus) {
+   if (iter->intid >= VGIC_MIN_LPI)
+   irq = iter->lpi_irqs[iter->lpi_print_count - 1];
+   else if (iter->vcpu_id < iter->nr_cpus) {
vcpu = kvm_get_vcpu(kvm, iter->vcpu_id);
irq = 

Re:Re: [PATCH] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread peng.hao2
>On 23/03/18 10:36, Peng Hao wrote:
>> Add LPI debug info to vgic-state.
>> The printed info looks like this:
>>  SPI  287  0 0100   0 160  -1
>>  LPI 8192  2 00010000   0 160  -1
>> 
>> Signed-off-by: Peng Hao 
>> ---
>> virt/kvm/arm/vgic/vgic-debug.c | 61 ++
>>  1 file changed, 56 insertions(+), 5 deletions(-)
>> 
>> diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
>> index 10b3817..444115e 100644
>> --- a/virt/kvm/arm/vgic/vgic-debug.c
>> +++ b/virt/kvm/arm/vgic/vgic-debug.c
>> @@ -36,9 +36,12 @@
>>  struct vgic_state_iter {
>>  int nr_cpus;
>>  int nr_spis;
>> +int nr_lpis;
>>  int dist_id;
>>  int vcpu_id;
>>  int intid;
>> +int lpi_print_count;
>> +struct vgic_irq **lpi_irqs;
>>  };
>>  
>>  static void iter_next(struct vgic_state_iter *iter)
>> @@ -52,6 +55,40 @@ static void iter_next(struct vgic_state_iter *iter)
>>  if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
>>  ++iter->vcpu_id < iter->nr_cpus)
>>  iter->intid = 0;
>> +
>> +if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
>> +if (iter->lpi_print_count < iter->nr_lpis)
>> +iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
>> +iter->lpi_print_count++;
>> +}
>> +}
>> +
>> +static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
>> +{
>> +struct vgic_dist *dist = &kvm->arch.vgic;
>> +int i = 0;
>> +struct vgic_irq *irq = NULL, **lpi_irqs;
>> +
>> +again:
>> +iter->nr_lpis = dist->lpi_list_count;
>> +lpi_irqs = kmalloc_array(iter->nr_lpis, sizeof(irq), GFP_KERNEL);
>> +if (!lpi_irqs) {
>> +iter->nr_lpis = 0;
>> +return;
>> +}
>> +spin_lock(&dist->lpi_list_lock);
>> +if (iter->nr_lpis != dist->lpi_list_count) {
>> +kfree(lpi_irqs);
>> +spin_unlock(&dist->lpi_list_lock);
>> +goto again;
>> +}

>Why do we need an exact count? It is fine to have a transient count, and
>the debug code should be able to cope with that without performing this
>terrible loop.
Yeah, it is enough to have a rough count for the debug code.
>We also already have some code that snapshots the LPIs (see
>vgic_copy_lpi_list), so please consider reusing that instead.
I can't reuse vgic_copy_lpi_list. It snapshots based on an LPI's target vcpu.
>> +
>> +list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
>> +vgic_get_irq_kref(irq);
>> +lpi_irqs[i++] = irq;
>> +}
>> +spin_unlock(&dist->lpi_list_lock);
>> +iter->lpi_irqs = lpi_irqs;

>Messing with the internals of the refcounts is really a bad idea. Please
>use vgic_get_irq() in conjunction with the above, and allow it to fail
>gracefully.
  vgic_get_irq requires an intid as input, and the vgic_get_lpi that
vgic_get_irq calls will traverse the lpi_list, taking lpi_list_lock
again, but here I already hold lpi_list_lock. So I think calling
vgic_get_irq_kref while holding lpi_list_lock is safe.
> >  }
> >  
>>  static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
>> @@ -64,6 +101,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
>>  iter->nr_cpus = nr_cpus;
>>  iter->nr_spis = kvm->arch.vgic.nr_spis;
>>  
>> +if (vgic_supports_direct_msis(kvm) && !pos)
>> +vgic_debug_get_lpis(kvm, iter);
>>  /* Fast forward to the right position if needed */
>>  while (pos--)
>>  iter_next(iter);
>> @@ -73,7 +112,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
>>  {
>>  return iter->dist_id > 0 &&
>>  iter->vcpu_id == iter->nr_cpus &&
>> -(iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
>> +(iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
>> +((iter->nr_lpis == 0) ||
>> +(iter->lpi_print_count == iter->nr_lpis + 1));
>>  }
>>  
>>  static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
>> @@ -130,6 +171,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
>>  
>> mutex_lock(&kvm->lock);
>>  iter = kvm->arch.vgic.iter;
>> +kfree(iter->lpi_irqs);
>>  kfree(iter);
>>  kvm->arch.vgic.iter = NULL;
>> mutex_unlock(&kvm->lock);
>> @@ -154,7 +196,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>>   struct kvm_vcpu *vcpu)
>>  {
>>  int id = 0;
>> -char *hdr = "SPI ";
>> +char *hdr = "S/LPI ";
>>  
>>  if (vcpu) {
>>  hdr = "VCPU";
>> @@ -162,7 +204,10 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>>  }
>>  
>>  seq_printf(s, "\n");
>> -seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
>> +if (vcpu)
>> +seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
>> +else
>> +seq_printf(s, "%s TYP   ID TGT_ID PLAEHC  

Re: [RFC 02/12] KVM: arm/arm64: Document KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION

2018-03-23 Thread Peter Maydell
On 19 March 2018 at 09:20, Eric Auger  wrote:
> We introduce a new KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION attribute in
> KVM_DEV_ARM_VGIC_GRP_ADDR group. It allows userspace to provide the
> base address and size of a redistributor region
>
> Compared to KVM_VGIC_V3_ADDR_TYPE_REDIST, this new attribute allows
> to declare several separate redistributor regions.
>
> So the whole redist space does not need to be contiguous anymore.
>
> Signed-off-by: Eric Auger 
> ---
>  Documentation/virtual/kvm/devices/arm-vgic-v3.txt | 12 
>  1 file changed, 12 insertions(+)
>
> diff --git a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt 
> b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
> index 9293b45..2c0bedf 100644
> --- a/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
> +++ b/Documentation/virtual/kvm/devices/arm-vgic-v3.txt
> @@ -27,6 +27,18 @@ Groups:
>VCPU and all of the redistributor pages are contiguous.
>Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
>This address needs to be 64K aligned.
> +
> +KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION (rw, 64-bit)
> +  The attr field of kvm_device_attr encodes 3 values:
> +  bits:   | 63 .... 52 | 51 .... 12 | 11 - 0 |
> +  values: |    pfns    |    base    | index  |
> +  - index encodes the unique redistibutor region index

"redistributor"

> +  - base field encodes bits [51:12] the guest physical base address

"of the guest"

> +of the first redistributor in the region. There are two 64K pages
> +for each VCPU and all of the redistributor pages are contiguous
> +within the redistributor region.
> +  - pfns encodes the size of the region in 64kB pages.
> +  Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.

You should say something here about what happens if userspace
tries to use both KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION and
KVM_VGIC_V3_ADDR_TYPE_REDIST. I think this should be an error
(reported by whichever of the two you try to set second).

>Errors:
>  -E2BIG:  Address outside of addressable IPA range
>  -EINVAL: Incorrectly aligned address

Marc wrote:
> Why does base have to include bits [15:12] of the IPA? If it is 64kB
> aligned (as it should), these bits are guaranteed to be 0. This also
> avoid having to return -EINVAL in case of alignment problems.

If you're not using the bits for anything else you want to
check they're 0 anyway. Otherwise we can't guarantee to safely
use them for something else in future, because userspace might
be handing us garbage in those bits without noticing.
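
For concreteness, the encoding under discussion is easy to express as a
pair of helpers. This is only a sketch, using the layout from the
quoted document (pfns in bits [63:52], base address bits [51:12] kept
in place, index in bits [11:0]) and invented names:

#include <stdint.h>

#define REDIST_REGION_INDEX_MASK  0xfffULL                /* bits [11:0]  */
#define REDIST_REGION_BASE_MASK   0x000ffffffffff000ULL   /* bits [51:12] */
#define REDIST_REGION_PFNS_SHIFT  52                      /* bits [63:52] */

static uint64_t redist_region_attr(uint64_t base, uint64_t pfns,
                                   uint64_t index)
{
        /*
         * base is a 64K-aligned guest physical address, so bits [15:12]
         * of the attr end up zero - which is what the "check they're 0"
         * argument above relies on.
         */
        return (pfns << REDIST_REGION_PFNS_SHIFT) |
               (base & REDIST_REGION_BASE_MASK) |
               (index & REDIST_REGION_INDEX_MASK);
}

static void redist_region_parse(uint64_t attr, uint64_t *base,
                                uint64_t *pfns, uint64_t *index)
{
        *pfns  = attr >> REDIST_REGION_PFNS_SHIFT;
        *base  = attr & REDIST_REGION_BASE_MASK;
        *index = attr & REDIST_REGION_INDEX_MASK;
}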

thanks
-- PMM


Re: [PATCH] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Marc Zyngier
On 23/03/18 10:36, Peng Hao wrote:
> Add LPI debug info to vgic-state.
> The printed info looks like this:
>  SPI  287  0 0100   0 160  -1
>  LPI 8192  2 00010000   0 160  -1
> 
> Signed-off-by: Peng Hao 
> ---
>  virt/kvm/arm/vgic/vgic-debug.c | 61 ++
>  1 file changed, 56 insertions(+), 5 deletions(-)
> 
> diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
> index 10b3817..444115e 100644
> --- a/virt/kvm/arm/vgic/vgic-debug.c
> +++ b/virt/kvm/arm/vgic/vgic-debug.c
> @@ -36,9 +36,12 @@
>  struct vgic_state_iter {
>   int nr_cpus;
>   int nr_spis;
> + int nr_lpis;
>   int dist_id;
>   int vcpu_id;
>   int intid;
> + int lpi_print_count;
> + struct vgic_irq **lpi_irqs;
>  };
>  
>  static void iter_next(struct vgic_state_iter *iter)
> @@ -52,6 +55,40 @@ static void iter_next(struct vgic_state_iter *iter)
>   if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
>   ++iter->vcpu_id < iter->nr_cpus)
>   iter->intid = 0;
> +
> + if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
> + if (iter->lpi_print_count < iter->nr_lpis)
> + iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
> + iter->lpi_print_count++;
> + }
> +}
> +
> +static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
> +{
> + struct vgic_dist *dist = &kvm->arch.vgic;
> + int i = 0;
> + struct vgic_irq *irq = NULL, **lpi_irqs;
> +
> +again:
> + iter->nr_lpis = dist->lpi_list_count;
> + lpi_irqs = kmalloc_array(iter->nr_lpis, sizeof(irq), GFP_KERNEL);
> + if (!lpi_irqs) {
> + iter->nr_lpis = 0;
> + return;
> + }
> + spin_lock(&dist->lpi_list_lock);
> + if (iter->nr_lpis != dist->lpi_list_count) {
> + kfree(lpi_irqs);
> + spin_unlock(&dist->lpi_list_lock);
> + goto again;
> + }

Why do we need an exact count? It is fine to have a transient count, and
the debug code should be able to cope with that without performing this
terrible loop.

We also already have some code that snapshots the LPIs (see
vgic_copy_lpi_list), so please consider reusing that instead.

> +
> + list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
> + vgic_get_irq_kref(irq);
> + lpi_irqs[i++] = irq;
> + }
> + spin_unlock(&dist->lpi_list_lock);
> + iter->lpi_irqs = lpi_irqs;

Messing with the internals of the refcounts is really a bad idea. Please
use vgic_get_irq() in conjunction with the above, and allow it to fail
gracefully.
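
The direction suggested here, roughly sketched: snapshot the intids
with vgic_copy_lpi_list() and resolve each LPI through vgic_get_irq()
only when it is printed, dropping the reference afterwards. Untested,
and it assumes vgic_copy_lpi_list() has been reworked to take a kvm
pointer and a NULL vcpu:

	u32 *intids;
	int nr_lpis, i;

	nr_lpis = vgic_copy_lpi_list(kvm, NULL, &intids);
	if (nr_lpis < 0)
		return;

	for (i = 0; i < nr_lpis; i++) {
		struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intids[i]);

		if (!irq)	/* unmapped since the snapshot: skip it */
			continue;
		print_irq_state(s, irq, NULL);
		vgic_put_irq(kvm, irq);
	}
	kfree(intids);

This keeps the refcounting behind the existing API and needs no cached
array of struct pointers.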

>  }
>  
>  static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
> @@ -64,6 +101,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
>   iter->nr_cpus = nr_cpus;
>   iter->nr_spis = kvm->arch.vgic.nr_spis;
>  
> + if (vgic_supports_direct_msis(kvm) && !pos)
> + vgic_debug_get_lpis(kvm, iter);
>   /* Fast forward to the right position if needed */
>   while (pos--)
>   iter_next(iter);
> @@ -73,7 +112,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
>  {
>   return iter->dist_id > 0 &&
>   iter->vcpu_id == iter->nr_cpus &&
> - (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
> + (iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
> + ((iter->nr_lpis == 0) ||
> + (iter->lpi_print_count == iter->nr_lpis + 1));
>  }
>  
>  static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
> @@ -130,6 +171,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
>  
>   mutex_lock(&kvm->lock);
>   iter = kvm->arch.vgic.iter;
> + kfree(iter->lpi_irqs);
>   kfree(iter);
>   kvm->arch.vgic.iter = NULL;
>   mutex_unlock(&kvm->lock);
> @@ -154,7 +196,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>struct kvm_vcpu *vcpu)
>  {
>   int id = 0;
> - char *hdr = "SPI ";
> + char *hdr = "S/LPI ";
>  
>   if (vcpu) {
>   hdr = "VCPU";
> @@ -162,7 +204,10 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
>   }
>  
>   seq_printf(s, "\n");
> -	seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
> +	if (vcpu)
> +		seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
> +	else
> +		seq_printf(s, "%s TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr);

This feels like an unnecessary change. But if you really want that kind
of detail, change your "S/LPI" to say something more generic, such as
"Global".

>   seq_printf(s, "---\n");
>  }
>  
> @@ -174,8 +219,10 @@ static void 

RE: [PATCH 1/4] iommu: Add virtio-iommu driver

2018-03-23 Thread Tian, Kevin
> From: Tian, Kevin
> Sent: Thursday, March 22, 2018 6:06 PM
> 
> > From: Robin Murphy [mailto:robin.mur...@arm.com]
> > Sent: Wednesday, March 21, 2018 10:24 PM
> >
> > On 21/03/18 13:14, Jean-Philippe Brucker wrote:
> > > On 21/03/18 06:43, Tian, Kevin wrote:
> > > [...]
> > >>> +
> > >>> +#include 
> > >>> +
> > >>> +#define MSI_IOVA_BASE  0x8000000
> > >>> +#define MSI_IOVA_LENGTH 0x100000
> > >>
> > >> this is ARM specific, and according to virtio-iommu spec isn't it
> > >> better probed on the endpoint instead of hard-coding here?
> > >
> > > These values are arbitrary, not really ARM-specific even if ARM is the
> > > only user yet: we're just reserving a random IOVA region for mapping
> > MSIs.
> > > It is hard-coded because of the way iommu-dma.c works, but I don't
> > quite
> > > remember why that allocation isn't dynamic.
> >
> > The host kernel needs to have *some* MSI region in place before the
> > guest can start configuring interrupts, otherwise it won't know what
> > address to give to the underlying hardware. However, as soon as the host
> > kernel has picked a region, host userspace needs to know that it can no
> > longer use addresses in that region for DMA-able guest memory. It's a
> > lot easier when the address is fixed in hardware and the host userspace
> > will never be stupid enough to try and VFIO_IOMMU_DMA_MAP it, but in
> > the
> > more general case where MSI writes undergo IOMMU address translation
> > so
> > it's an arbitrary IOVA, this has the potential to conflict with stuff
> > like guest memory hotplug.
> >
> > What we currently have is just the simplest option, with the host kernel
> > just picking something up-front and pretending to host userspace that
> > it's a fixed hardware address. There's certainly scope for it to be a
> > bit more dynamic in the sense of adding an interface to let userspace
> > move it around (before attaching any devices, at least), but I don't
> > think it's feasible for the host kernel to second-guess userspace enough
> > to make it entirely transparent like it is in the DMA API domain case.
> >
> > Of course, that's all assuming the host itself is using a virtio-iommu
> > (e.g. in a nested virt or emulation scenario). When it's purely within a
> > guest then an MSI reservation shouldn't matter so much, since the guest
> > won't be anywhere near the real hardware configuration anyway.
> >
> > Robin.
> 
> Curious since anyway we are defining a new iommu architecture
> is it possible to avoid those ARM-specific burden completely?
> 

OK, after some study of those tricks, here is what I have learned:

- The MSI_IOVA window is used only on request (iommu_dma_get_msi_page),
and is not meant to take effect on all architectures once initialized,
e.g. the ARM GIC does it but x86 does not. So it is reasonable for the
virtio-iommu driver to implement such a capability;

- I wondered whether the hardware MSI doorbell could always be
reported on virtio-iommu, since it is newly defined. It looks like
there is a problem if the underlying IOMMU uses the sw-managed MSI
style - a valid mapping is expected at every translation level,
meaning the guest has to manage the stage-1 mapping in a nested
configuration, since stage-1 is owned by the guest.

Then virtio-iommu is naturally expected to report the same MSI
model as the underlying hardware supports. Below are some further
thoughts along this route (using 'IOMMU' for the physical one and
'virtio-iommu' for the virtual one):



In the scope of the current virtio-iommu spec v.6 there is no nested
consideration yet. The guest driver is expected to use the MAP/UNMAP
interface on assigned endpoints. In this case the MAP requests
(IOVA->GPA) are caught and maintained within Qemu, which then
talks to VFIO to map IOVA->HPA in the IOMMU.

Qemu can learn the MSI model of the IOMMU from sysfs.

For hardware MSI doorbell (x86 and some ARM):
* Host kernel reports to Qemu as IOMMU_RESV_MSI
* Qemu reports to the guest as VIRTIO_IOMMU_RESV_MEM_T_MSI
* Guest takes the range as an IOMMU_RESV_MSI reserved region
* Qemu MAP database has no mapping for the doorbell
* Physical IOMMU page table has no mapping for the doorbell
* MSIs from passthrough devices bypass the IOMMU
* MSIs from emulated devices bypass the virtio-iommu

For software MSI doorbell (most ARM):
* Host kernel reports to Qemu as IOMMU_RESV_SW_MSI
* Qemu reports to the guest as VIRTIO_IOMMU_RESV_MEM_T_RESERVED
* Guest takes the range as IOMMU_RESV_RESERVED
* vGIC requests to map 'GPA of the virtual doorbell'
* a map request (IOVA->GPA) sent on endpoint
* Qemu maintains the mapping in MAP database
* but no VFIO_MAP request since it's purely virtual
* GIC requests to map 'HPA of the physical doorbell'
* e.g. triggered by VFIO enable msi
* IOMMU now includes a valid mapping (IOVA->HPA)
* MSIs from emulated devices go through the Qemu MAP
database (IOVA->'GPA of virtual doorbell') and then hit the vGIC
* MSIs from passthrough devices go through the IOMMU
(IOVA->'HPA of physical doorbell') and then hit the GIC

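As a compact restatement of the two flows above, the host-side choice
reduces to a mapping from the physical region type to the probe
subtype. A hypothetical helper (constants as in the virtio-iommu
patches quoted elsewhere in this digest):

static u8 resv_mem_subtype(enum iommu_resv_type host_type)
{
	switch (host_type) {
	case IOMMU_RESV_MSI:
		/* hardware doorbell: MSIs bypass the (v)IOMMU entirely */
		return VIRTIO_IOMMU_RESV_MEM_T_MSI;
	case IOMMU_RESV_SW_MSI:
	default:
		/*
		 * software-managed doorbell: report plain RESERVED so the
		 * guest never maps DMA there, while Qemu and the host keep
		 * the real doorbell mapping alive underneath.
		 */
		return VIRTIO_IOMMU_RESV_MEM_T_RESERVED;
	}
}
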
In this case, host 

[PATCH] KVM: arm/arm64 : add lpi info in vgic-debug

2018-03-23 Thread Peng Hao
Add LPI debug info to vgic-state.
The printed info looks like this:
 SPI  287  0 0100   0 160  -1
 LPI 8192  2 00010000   0 160  -1

Signed-off-by: Peng Hao 
---
 virt/kvm/arm/vgic/vgic-debug.c | 61 ++
 1 file changed, 56 insertions(+), 5 deletions(-)

diff --git a/virt/kvm/arm/vgic/vgic-debug.c b/virt/kvm/arm/vgic/vgic-debug.c
index 10b3817..444115e 100644
--- a/virt/kvm/arm/vgic/vgic-debug.c
+++ b/virt/kvm/arm/vgic/vgic-debug.c
@@ -36,9 +36,12 @@
 struct vgic_state_iter {
int nr_cpus;
int nr_spis;
+   int nr_lpis;
int dist_id;
int vcpu_id;
int intid;
+   int lpi_print_count;
+   struct vgic_irq **lpi_irqs;
 };
 
 static void iter_next(struct vgic_state_iter *iter)
@@ -52,6 +55,40 @@ static void iter_next(struct vgic_state_iter *iter)
if (iter->intid == VGIC_NR_PRIVATE_IRQS &&
++iter->vcpu_id < iter->nr_cpus)
iter->intid = 0;
+
+   if (iter->intid >= VGIC_NR_PRIVATE_IRQS + iter->nr_spis) {
+   if (iter->lpi_print_count < iter->nr_lpis)
+   iter->intid = iter->lpi_irqs[iter->lpi_print_count]->intid;
+   iter->lpi_print_count++;
+   }
+}
+
+static void vgic_debug_get_lpis(struct kvm *kvm, struct vgic_state_iter *iter)
+{
+   struct vgic_dist *dist = &kvm->arch.vgic;
+   int i = 0;
+   struct vgic_irq *irq = NULL, **lpi_irqs;
+
+again:
+   iter->nr_lpis = dist->lpi_list_count;
+   lpi_irqs = kmalloc_array(iter->nr_lpis, sizeof(irq), GFP_KERNEL);
+   if (!lpi_irqs) {
+   iter->nr_lpis = 0;
+   return;
+   }
+   spin_lock(&dist->lpi_list_lock);
+   if (iter->nr_lpis != dist->lpi_list_count) {
+   kfree(lpi_irqs);
+   spin_unlock(&dist->lpi_list_lock);
+   goto again;
+   }
+
+   list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+   vgic_get_irq_kref(irq);
+   lpi_irqs[i++] = irq;
+   }
+   spin_unlock(&dist->lpi_list_lock);
+   iter->lpi_irqs = lpi_irqs;
 }
 
 static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
@@ -64,6 +101,8 @@ static void iter_init(struct kvm *kvm, struct vgic_state_iter *iter,
iter->nr_cpus = nr_cpus;
iter->nr_spis = kvm->arch.vgic.nr_spis;
 
+   if (vgic_supports_direct_msis(kvm) && !pos)
+   vgic_debug_get_lpis(kvm, iter);
/* Fast forward to the right position if needed */
while (pos--)
iter_next(iter);
@@ -73,7 +112,9 @@ static bool end_of_vgic(struct vgic_state_iter *iter)
 {
return iter->dist_id > 0 &&
iter->vcpu_id == iter->nr_cpus &&
-   (iter->intid - VGIC_NR_PRIVATE_IRQS) == iter->nr_spis;
+   (iter->intid - VGIC_NR_PRIVATE_IRQS) >= iter->nr_spis &&
+   ((iter->nr_lpis == 0) ||
+   (iter->lpi_print_count == iter->nr_lpis + 1));
 }
 
 static void *vgic_debug_start(struct seq_file *s, loff_t *pos)
@@ -130,6 +171,7 @@ static void vgic_debug_stop(struct seq_file *s, void *v)
 
	mutex_lock(&kvm->lock);
iter = kvm->arch.vgic.iter;
+   kfree(iter->lpi_irqs);
kfree(iter);
kvm->arch.vgic.iter = NULL;
	mutex_unlock(&kvm->lock);
@@ -154,7 +196,7 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
 struct kvm_vcpu *vcpu)
 {
int id = 0;
-   char *hdr = "SPI ";
+   char *hdr = "S/LPI ";
 
if (vcpu) {
hdr = "VCPU";
@@ -162,7 +204,10 @@ static void print_header(struct seq_file *s, struct vgic_irq *irq,
}
 
seq_printf(s, "\n");
-   seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+   if (vcpu)
+           seq_printf(s, "%s%2d TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr, id);
+   else
+           seq_printf(s, "%s TYP   ID TGT_ID PLAEHC HWID   TARGET SRC PRI VCPU_ID\n", hdr);
	seq_printf(s, "---\n");
 }
 
@@ -174,8 +219,10 @@ static void print_irq_state(struct seq_file *s, struct vgic_irq *irq,
type = "SGI";
else if (irq->intid < VGIC_NR_PRIVATE_IRQS)
type = "PPI";
-   else
+   else if (irq->intid < VGIC_MAX_SPI)
type = "SPI";
+   else if (irq->intid >= VGIC_MIN_LPI)
+   type = "LPI";
 
	if (irq->intid == 0 || irq->intid == VGIC_NR_PRIVATE_IRQS)
print_header(s, irq, vcpu);
@@ -220,7 +267,9 @@ static int vgic_debug_show(struct seq_file *s, void *v)
if (!kvm->arch.vgic.initialized)
return 0;
 
-   if (iter->vcpu_id < iter->nr_cpus) {
+   if (iter->intid >= VGIC_MIN_LPI)
+   irq = iter->lpi_irqs[iter->lpi_print_count - 1];
+   else if