Re: [PATCH v2 1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM

2016-07-19 Thread Michael Ellerman
Sam Bobroff  writes:

> Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to
> determine if a PowerPC KVM guest should use HTM (Hardware Transactional

Minor nit, "should" should be "can" IMHO.

> Memory).
>
> This will be used by QEMU to populate the pa-features bits in the
> guest's device tree.
>
> Signed-off-by: Sam Bobroff 
> ---
>
> v2:
>
> * Use CPU_FTR_TM_COMP instead of CPU_FTR_TM.

Thanks.

Acked-by: Michael Ellerman 

Or do you want me to merge this before Paul gets back?

> * I didn't unbreak the line, as with the extra characters checkpatch will
>   complain if I do. I did move the break to a more usual place.

I would just ignore checkpatch. But I don't mind that much.

cheers

[PATCH kernel v2 1/2] powerpc/iommu: Stop using @current in mm_iommu_xxx

2016-07-19 Thread Alexey Kardashevskiy
In some situations the userspace memory context may live longer than
the userspace process itself, so if we need to do proper memory context
cleanup, we had better cache @mm and use it later, when the process is
gone (@current or @current->mm is NULL).

This changes the mm_iommu_xxx API to receive an mm_struct instead of
using the one from @current.

This is needed by the following patch to do proper cleanup in time.
This depends on the "powerpc/powernv/ioda: Fix endianness when reading
TCEs" patch to do proper cleanup via tce_iommu_clear().

This should cause no behavioral change.
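
As a sketch of the new calling convention (hypothetical caller that has
cached @mm at container creation time; the real conversions are in the
diff below):

    /* no more "if (!current || !current->mm) return -ESRCH;" checks */
    if (!mm_iommu_preregistered(mm))
        return -EPERM;

    ret = mm_iommu_get(mm, ua, entries, &mem);
    if (!ret)
        ret = mm_iommu_put(mm, mem);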

Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/mmu_context.h | 15 ++--
 arch/powerpc/mm/mmu_context_iommu.c    | 45 +-
 drivers/vfio/vfio_iommu_spapr_tce.c    | 41 +++
 3 files changed, 51 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 9d2cd0c..745b4bd 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -18,16 +18,17 @@ extern void destroy_context(struct mm_struct *mm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 struct mm_iommu_table_group_mem_t;
 
-extern bool mm_iommu_preregistered(void);
-extern long mm_iommu_get(unsigned long ua, unsigned long entries,
+extern bool mm_iommu_preregistered(struct mm_struct *mm);
+extern long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem);
-extern long mm_iommu_put(struct mm_iommu_table_group_mem_t *mem);
+extern long mm_iommu_put(struct mm_struct *mm,
+   struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_init(mm_context_t *ctx);
 extern void mm_iommu_cleanup(mm_context_t *ctx);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(unsigned long ua,
-   unsigned long size);
-extern struct mm_iommu_table_group_mem_t *mm_iommu_find(unsigned long ua,
-   unsigned long entries);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
+   unsigned long ua, unsigned long size);
+extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
+   unsigned long ua, unsigned long entries);
 extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem,
unsigned long ua, unsigned long *hpa);
 extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index da6a216..65086bf 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -53,7 +53,7 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
}
 
pr_debug("[%d] RLIMIT_MEMLOCK HASH64 %c%ld %ld/%ld\n",
-   current->pid,
+   current ? current->pid : 0,
incr ? '+' : '-',
npages << PAGE_SHIFT,
mm->locked_vm << PAGE_SHIFT,
@@ -63,28 +63,22 @@ static long mm_iommu_adjust_locked_vm(struct mm_struct *mm,
return ret;
 }
 
-bool mm_iommu_preregistered(void)
+bool mm_iommu_preregistered(struct mm_struct *mm)
 {
-   if (!current || !current->mm)
-   return false;
-
-   return !list_empty(&current->mm->context.iommu_group_mem_list);
+   return !list_empty(&mm->context.iommu_group_mem_list);
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
-long mm_iommu_get(unsigned long ua, unsigned long entries,
+long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries,
struct mm_iommu_table_group_mem_t **pmem)
 {
struct mm_iommu_table_group_mem_t *mem;
long i, j, ret = 0, locked_entries = 0;
struct page *page = NULL;
 
-   if (!current || !current->mm)
-   return -ESRCH; /* process exited */
-
	mutex_lock(&mem_list_mutex);
 
-   list_for_each_entry_rcu(mem, &current->mm->context.iommu_group_mem_list,
+   list_for_each_entry_rcu(mem, &mm->context.iommu_group_mem_list,
next) {
if ((mem->ua == ua) && (mem->entries == entries)) {
++mem->used;
@@ -102,7 +96,7 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
 
}
 
-   ret = mm_iommu_adjust_locked_vm(current->mm, entries, true);
+   ret = mm_iommu_adjust_locked_vm(mm, entries, true);
if (ret)
goto unlock_exit;
 
@@ -142,11 +136,11 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
mem->entries = entries;
*pmem = mem;
 
-   list_add_rcu(&mem->next, &current->mm->context.iommu_group_mem_list);
+   list_add_rcu(&mem->next, &mm->context.iommu_group_mem_list);
 
 unlock_exit:
if (locked_entries && ret)
-   mm_iommu_adjust_locked_vm(current->mm, locked_entries, false);
+   mm_iommu_adjust_locked_vm(mm, locked_entries, false);

[PATCH kernel v2 0/2] powerpc/mm/iommu: Put pages on process exit

2016-07-19 Thread Alexey Kardashevskiy
This fixes a bug where guest memory stays Active after the QEMU process
has exited. This happened because the QEMU memory context was not
released promptly after the QEMU process exited.
More details are in the commit logs.

Please comment. Thanks.

Alexey Kardashevskiy (2):
  powerpc/iommu: Stop using @current in mm_iommu_xxx
  powerpc/mm/iommu: Put pages on process exit

 arch/powerpc/include/asm/mmu_context.h | 16 +++---
 arch/powerpc/mm/mmu_context_book3s64.c |  4 --
 arch/powerpc/mm/mmu_context_iommu.c    | 55 +++--
 drivers/vfio/vfio_iommu_spapr_tce.c    | 89 --
 4 files changed, 100 insertions(+), 64 deletions(-)

-- 
2.5.0.rc3


[PATCH kernel v2 2/2] powerpc/mm/iommu: Put pages on process exit

2016-07-19 Thread Alexey Kardashevskiy
At the moment VFIO IOMMU SPAPR v2 driver pins all guest RAM pages when
the userspace starts using VFIO. When the userspace process finishes,
all the pinned pages need to be put; this is done as a part of
the userspace memory context (MM) destruction which happens on
the very last mmdrop().

This approach has a problem: the MM of the userspace process may live
longer than the userspace process itself, as kernel threads take a
reference to the MM of the userspace process which was running on the
CPU where the kernel thread was scheduled. When this happens, the MM
remains referenced until that exact kernel thread wakes up again and
releases the very last reference to the MM; on an idle system this can
take hours.

This change references and caches the MM once per container and adds
tracking of how many times each preregistered area was registered in
a specific container. This way we do not depend on @current pointing to
a valid task descriptor.

This changes the userspace interface to return EBUSY if memory is
already registered (mm_iommu_get() used to increment the counter);
however it should not have any practical effect as the only
userspace tool available now registers a memory area only once per
container anyway.

As tce_iommu_register_pages/tce_iommu_unregister_pages are called
under container->lock, this does not need additional locking.
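
To illustrate the tracking this adds, a minimal sketch built only on the
tce_iommu_prereg list introduced below (not part of the patch itself):

    static bool tce_iommu_prereg_exists(struct tce_container *container,
            struct mm_iommu_table_group_mem_t *mem)
    {
        struct tce_iommu_prereg *tcemem;

        list_for_each_entry(tcemem, &container->prereg_list, next)
            if (tcemem->mem == mem)
                return true;

        return false;
    }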

Cc: David Gibson 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Balbir Singh 
Cc: Nicholas Piggin 
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/include/asm/mmu_context.h |  1 -
 arch/powerpc/mm/mmu_context_book3s64.c |  4 ---
 arch/powerpc/mm/mmu_context_iommu.c    | 10 ---
 drivers/vfio/vfio_iommu_spapr_tce.c    | 52 +-
 4 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h
index 745b4bd..90338fd 100644
--- a/arch/powerpc/include/asm/mmu_context.h
+++ b/arch/powerpc/include/asm/mmu_context.h
@@ -24,7 +24,6 @@ extern long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long e
 extern long mm_iommu_put(struct mm_struct *mm,
struct mm_iommu_table_group_mem_t *mem);
 extern void mm_iommu_init(mm_context_t *ctx);
-extern void mm_iommu_cleanup(mm_context_t *ctx);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm,
unsigned long ua, unsigned long size);
 extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm,
diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c
index 227b2a6..5c67d1c 100644
--- a/arch/powerpc/mm/mmu_context_book3s64.c
+++ b/arch/powerpc/mm/mmu_context_book3s64.c
@@ -159,10 +159,6 @@ static inline void destroy_pagetable_page(struct mm_struct *mm)
 
 void destroy_context(struct mm_struct *mm)
 {
-#ifdef CONFIG_SPAPR_TCE_IOMMU
-   mm_iommu_cleanup(&mm->context);
-#endif
-
 #ifdef CONFIG_PPC_ICSWX
drop_cop(mm->context.acop, mm);
kfree(mm->context.cop_lockp);
diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c
index 65086bf..901773d 100644
--- a/arch/powerpc/mm/mmu_context_iommu.c
+++ b/arch/powerpc/mm/mmu_context_iommu.c
@@ -293,13 +293,3 @@ void mm_iommu_init(mm_context_t *ctx)
 {
	INIT_LIST_HEAD_RCU(&ctx->iommu_group_mem_list);
 }
-
-void mm_iommu_cleanup(mm_context_t *ctx)
-{
-   struct mm_iommu_table_group_mem_t *mem, *tmp;
-
-   list_for_each_entry_safe(mem, tmp, &ctx->iommu_group_mem_list, next) {
-   list_del_rcu(&mem->next);
-   mm_iommu_do_free(mem);
-   }
-}
diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c
index 9752e77..40e71a0 100644
--- a/drivers/vfio/vfio_iommu_spapr_tce.c
+++ b/drivers/vfio/vfio_iommu_spapr_tce.c
@@ -89,6 +89,15 @@ struct tce_iommu_group {
 };
 
 /*
+ * A container needs to remember which preregistered areas it has
+ * referenced, and how many times, to do proper cleanup at userspace
+ * process exit.
+ */
+struct tce_iommu_prereg {
+   struct list_head next;
+   struct mm_iommu_table_group_mem_t *mem;
+};
+
+/*
  * The container descriptor supports only a single group per container.
  * Required by the API as the container is not supplied with the IOMMU group
  * at the moment of initialization.
@@ -101,12 +110,26 @@ struct tce_container {
struct mm_struct *mm;
struct iommu_table *tables[IOMMU_TABLE_GROUP_MAX_TABLES];
struct list_head group_list;
+   struct list_head prereg_list;
 };
 
+static long tce_iommu_prereg_free(struct tce_container *container,
+   struct tce_iommu_prereg *tcemem)
+{
+   long ret;
+
+   list_del(&tcemem->next);
+   ret = mm_iommu_put(container->mm, tcemem->mem);
+   kfree(tcemem);
+
+   return ret;
+}
+
 static long tce_iommu_unregister_pages(struct 

[PATCH kernel] powerpc/powernv/ioda: Fix endianness when reading TCEs

2016-07-19 Thread Alexey Kardashevskiy
The iommu_table_ops::exchange() callback writes a new TCE to the table
and returns the old value and permission mask. The old TCE value is
correctly converted from BE to CPU endian; however the permission mask
was calculated from the BE value and therefore always returned DMA_NONE,
which could cause a memory leak on LE systems using the VFIO SPAPR TCE
IOMMU v1 driver.

This fixes pnv_tce_xchg() to keep @oldtce in CPU endian.
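
To illustrate the bug (not kernel code; this only assumes the TCE
permission bits live in the low bits, as TCE_PCI_READ/TCE_PCI_WRITE do):

    __be64 raw = cpu_to_be64(hpa | TCE_PCI_READ | TCE_PCI_WRITE);

    /* Old code, wrong on LE: the byte swap moves the permission bits
     * into the top byte, so the mask sees zero -> DMA_NONE. */
    iommu_tce_direction((unsigned long)raw);

    /* Fixed code: convert to CPU endian first. */
    iommu_tce_direction(be64_to_cpu(raw));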

Fixes: 05c6cfb9dce0d13d37e9d007ee6a4af36f1c0a58
Cc: stable@vger.kernel.org # 4.2+
Signed-off-by: Alexey Kardashevskiy 
---
 arch/powerpc/platforms/powernv/pci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 1d92bd9..7b17f88 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -620,8 +620,8 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index,
if (newtce & TCE_PCI_WRITE)
newtce |= TCE_PCI_READ;
 
-   oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
-   *hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+   oldtce = be64_to_cpu(xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce)));
+   *hpa = oldtce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
*direction = iommu_tce_direction(oldtce);
 
return 0;
-- 
2.5.0.rc3


Re: [v2,1/2] refactor code parsing size based on memory range

2016-07-19 Thread Hari Bathini


Ping..


On Friday 24 June 2016 10:45 PM, Hari Bathini wrote:



On 06/24/2016 10:56 AM, Michael Ellerman wrote:

On Wed, 2016-22-06 at 19:25:26 UTC, Hari Bathini wrote:
Currently, crashkernel parameter supports the below syntax to parse size
based on memory range:

crashkernel=<range1>:<size1>[,<range2>:<size2>,...]

While such parsing is implemented for crashkernel parameter, it applies to
other parameters with similar syntax. So, move this code to a more generic
place for code reuse.

Cc: Eric Biederman 
Cc: Vivek Goyal 
Cc: Rusty Russell 
Cc: ke...@lists.infradead.org
Signed-off-by: Hari Bathini 
Hari, it's not immediately clear that this makes no change to the logic
in the kexec code. Can you reply with a longer change log explaining why
the old & new logic is the same for kexec.



Hi Michael,

Please consider this changelog for this patch:

--
crashkernel parameter supports different syntaxes to specify the amount
of memory to be reserved for the kdump kernel. Below is one of the
supported syntaxes that needs parsing to find the memory size to
reserve, based on memory range:

crashkernel=<range1>:<size1>[,<range2>:<size2>,...]

While such parsing is implemented for the crashkernel parameter, it
applies to other parameters, like fadump_reserve_mem, which could use
similar syntax. So, to reuse code, move the code that checks if the
parameter syntax is as above, along with the code that parses the
memory size to reserve for this syntax. While the code is moved to the
kernel/params.c file, there is no change in logic for crashkernel
parameter parsing, as the moved code is invoked with function calls at
appropriate places.
--
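
For example, with this syntax (values taken from the existing kdump
documentation):

    crashkernel=512M-2G:64M,2G-:128M

reserves 64M for the crash kernel when system RAM falls in the 512M-2G
range, and 128M when it is 2G or more.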

Thanks
Hari





diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 94aa10f..72f55e5 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -436,6 +436,11 @@ extern char *get_options(const char *str, int nints, int *ints);
 
 extern unsigned long long memparse(const char *ptr, char **retptr);
 extern bool parse_option_str(const char *str, const char *option);
+extern bool __init is_param_range_based(const char *cmdline);
+extern unsigned long long __init parse_mem_range_size(const char *param,
+						       char **str,
+						       unsigned long long system_ram);
+
 extern int core_kernel_text(unsigned long addr);
 extern int core_kernel_data(unsigned long addr);
 extern int __kernel_text_address(unsigned long addr);
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
index 56b3ed0..d43f5cc 100644
--- a/kernel/kexec_core.c
+++ b/kernel/kexec_core.c
@@ -1083,59 +1083,9 @@ static int __init parse_crashkernel_mem(char *cmdline,
 	char *cur = cmdline, *tmp;
 
 	/* for each entry of the comma-separated list */
-	do {
-		unsigned long long start, end = ULLONG_MAX, size;
-
-		/* get the start of the range */
-		start = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("crashkernel: Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (*cur != '-') {
-			pr_warn("crashkernel: '-' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		/* if no ':' is here, than we read the end */
-		if (*cur != ':') {
-			end = memparse(cur, &tmp);
-			if (cur == tmp) {
-				pr_warn("crashkernel: Memory value expected\n");
-				return -EINVAL;
-			}
-			cur = tmp;
-			if (end <= start) {
-				pr_warn("crashkernel: end <= start\n");
-				return -EINVAL;
-			}
-		}
-
-		if (*cur != ':') {
-			pr_warn("crashkernel: ':' expected\n");
-			return -EINVAL;
-		}
-		cur++;
-
-		size = memparse(cur, &tmp);
-		if (cur == tmp) {
-			pr_warn("Memory value expected\n");
-			return -EINVAL;
-		}
-		cur = tmp;
-		if (size >= system_ram) {
-			pr_warn("crashkernel: invalid size\n");
-			return -EINVAL;
-		}
-
-		/* match ? */
-		if (system_ram >= start && system_ram < end) {
-			*crash_size = size;
-			break;
-		}
-	} while (*cur++ == ',');
+	*crash_size = parse_mem_range_size("crashkernel", &cur, system_ram);
+	if (cur == cmdline)
+		return -EINVAL;
 
 	if (*crash_size > 0) {
 		while (*cur && *cur != ' ' && *cur != '@')
@@ -1272,7 +1222,6 @@ static int __init __parse_crashkernel(char *cmdline,
 					 const char *name,
 					 const char *suffix)
 {
-	char	*first_colon, *first_space;
 	char	*ck_cmdline;
 
 	BUG_ON(!crash_size || !crash_base);
@@ -1290,12 +1239,10 @@ static int __init __parse_crashkernel(char *cmdline,
 	return parse_crashkernel_suffix(ck_cmdline, crash_size,
 					suffix);
 	/*
-	 * if the commandline contains a ':', then that's 

Re: [PATCH v2 1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM

2016-07-19 Thread Balbir Singh


On 20/07/16 13:41, Sam Bobroff wrote:
> Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to
> determine if a PowerPC KVM guest should use HTM (Hardware Transactional
> Memory).
> 
> This will be used by QEMU to populate the pa-features bits in the
> guest's device tree.
> 
> Signed-off-by: Sam Bobroff 
> ---
> 
Makes sense.

Acked-by: Balbir Singh 

Re: [RFC 0/3] extend kexec_file_load system call

2016-07-19 Thread Balbir Singh
>  
> Command line options are not signed. I thought the idea behind secureboot
> was to execute only trusted code, and command line options don't force
> you to execute unsigned code.
>  
>>
>> You can set module.sig_enforce=0 and open up the system a bit assuming
>> that you can get a module to load with another attack
> 
> IIUC, sig_enforce is bool_enable_only so it can only be enabled. Its
> default value is 0 if CONFIG_MODULE_SIG_FORCE=n.
> 
> IOW, if your kernel forced signature verification, you should not be
> able to do sig_enforce=0. If your kernel did not have
> CONFIG_MODULE_SIG_FORCE=y, then sig_enforce should be 0 by default anyway
> and you are not making it worse using the command line.
> 

OK.. I checked and you are right, but that is an example and there are
other things like security=, thermal.*, nosmep, nosmap that need auditing
for safety and might hurt the system security if used. I still think
that assuming you can pass any command line without breaking security
is a broken argument.

>>
> So it sounds like different class of security problems which you are
> referring to and not necessarily covered by secureboot or signed
> kernel.
 Let me give you an example.
  
 You have a secure boot setup, where the firmware/ROM validates the boot
 loader.  Good, the boot loader hasn't been tampered with.
  
 You interrupt the boot loader and are able to modify the command line
 for the booted kernel.
  
 The boot loader loads the kernel and verifies the kernel's signature.
 Good, the kernel hasn't been tampered with.  The kernel starts running.
  
 You've plugged in a USB drive to the device, and specified a partition
 containing a root filesystem that you control to the kernel.  The
 validated kernel finds the USB drive, and mounts it, and executes
 your own binaries on the USB drive.
>>> You will require physical access to the machine to be able to
>>> insert your usb drive. And IIRC, argument was that if attacker has
>>> physical access to machine, all bets are off anyway.
>>>
>>
>> You don't need physical access -- your machine controller BMC can
>> do the magic for you. So it's not always physical access, is it?
> 
> Well, the idea was that if you have physical access to the machine, then
> all bets are off. If the BMC can do something which allows running unsigned
> code at ring level 0, it's a problem I think from the secureboot model of
> security.
> 
>>  
  
  
 You run a shell on the console.  You now have control of the system,
 and can mount the real rootfs, inspect it, and work out what it does,
 etc.
  
 At this point, what use was all the validation that the secure boot
 has done?  Absolutely useless.
  
 If you can change the command line arguments given to the kernel, you
 have no security, no matter how much you verify signatures.  It's
 the illusion of security, nothing more, nothing less.
  
>>
>> I agree, if you can change command line arguments, all bets are of lesser 
>> value
> 
> If changing command line allows execution of unsigned code at ring level
> 0, then it is a problem. Otherwise we are talking of security issues which
> are not covered by secure


I agree that from what I can see/grep there is nothing that allows unsigned
code to run at boot in ring0, but there are implications like the ones
I've mentioned above.

Attacks are typically built as a chain and every bit might matter. One could
turn off features, which might lead to the system being attacked at run-time.


Balbir Singh.

[PATCH v2 1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM

2016-07-19 Thread Sam Bobroff
Introduce a new KVM capability, KVM_CAP_PPC_HTM, that can be queried to
determine if a PowerPC KVM guest should use HTM (Hardware Transactional
Memory).

This will be used by QEMU to populate the pa-features bits in the
guest's device tree.
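
A sketch of how userspace could probe this, using the standard
KVM_CHECK_EXTENSION ioctl on the VM fd (enable_htm_in_pa_features() is
a hypothetical QEMU helper):

    int htm = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_HTM);

    if (htm > 0)
        enable_htm_in_pa_features();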

Signed-off-by: Sam Bobroff 
---

v2:

* Use CPU_FTR_TM_COMP instead of CPU_FTR_TM.
* I didn't unbreak the line, as with the extra characters checkpatch will
  complain if I do. I did move the break to a more usual place.

 arch/powerpc/kvm/powerpc.c | 4 ++++
 include/uapi/linux/kvm.h   | 1 +
 2 files changed, 5 insertions(+)

diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 02416fe..5ebc8ff 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 1;
break;
 #endif
+   case KVM_CAP_PPC_HTM:
+   r = cpu_has_feature(CPU_FTR_TM_COMP) &&
+   is_kvmppc_hv_enabled(kvm);
+   break;
default:
r = 0;
break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05ebf47..f421d0e 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -866,6 +866,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ARM_PMU_V3 126
 #define KVM_CAP_VCPU_ATTRIBUTES 127
 #define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_PPC_HTM 129
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.1.0


Re: [PATCH v2 1/2] powerpc/pseries: Implemented indexed-count hotplug memory add

2016-07-19 Thread Thomas Falcon
On 07/18/2016 10:07 AM, Sahil Mehta wrote:
> Indexed-count add for memory hotplug guarantees that a contiguous block
> of <count> lmbs beginning at a specified <drc index> will be assigned (NOT
> that <count> lmbs will be added). Because of Qemu's per-DIMM memory
> management, the addition of a contiguous block of memory currently
> requires a series of individual calls. Indexed-count add reduces
> this series into a single call.
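
If this lands, the request would go through the DLPAR sysfs interface
parsed below, with something like (hypothetical values; a count of LMBs
followed by a starting drc index):

    echo "memory add indexed-count 16 0x80000010" > /sys/kernel/dlpar
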
>
> Signed-off-by: Sahil Mehta 
> ---
> v2:   -remove potential memory leak when parsing command
>   -use u32s drc_index and count instead of u32 ic[]
>in dlpar_memory
>
>  arch/powerpc/include/asm/rtas.h |2
>  arch/powerpc/platforms/pseries/dlpar.c  |   34 +++-
>  arch/powerpc/platforms/pseries/hotplug-memory.c |  100 +--
>  3 files changed, 124 insertions(+), 12 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 51400ba..f46b271 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -307,6 +307,7 @@ struct pseries_hp_errorlog {
>   union {
>   __be32  drc_index;
>   __be32  drc_count;
> + __be32  indexed_count[2];
>   chardrc_name[1];
>   } _drc_u;
>  };
> @@ -322,6 +323,7 @@ struct pseries_hp_errorlog {
>  #define PSERIES_HP_ELOG_ID_DRC_NAME  1
>  #define PSERIES_HP_ELOG_ID_DRC_INDEX 2
>  #define PSERIES_HP_ELOG_ID_DRC_COUNT 3
> +#define PSERIES_HP_ELOG_ID_IC   4
>
>  struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
> uint16_t section_id);
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c
> index 2b93ae8..2a6dc9e 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -345,11 +345,17 @@ static int handle_dlpar_errorlog(struct 
> pseries_hp_errorlog *hp_elog)
>   switch (hp_elog->id_type) {
>   case PSERIES_HP_ELOG_ID_DRC_COUNT:
>   hp_elog->_drc_u.drc_count =
> - be32_to_cpu(hp_elog->_drc_u.drc_count);
> + be32_to_cpu(hp_elog->_drc_u.drc_count);
>   break;
>   case PSERIES_HP_ELOG_ID_DRC_INDEX:
>   hp_elog->_drc_u.drc_index =
> - be32_to_cpu(hp_elog->_drc_u.drc_index);
> + be32_to_cpu(hp_elog->_drc_u.drc_index);
> + break;
> + case PSERIES_HP_ELOG_ID_IC:
> + hp_elog->_drc_u.indexed_count[0] =
> + be32_to_cpu(hp_elog->_drc_u.indexed_count[0]);
> + hp_elog->_drc_u.indexed_count[1] =
> + be32_to_cpu(hp_elog->_drc_u.indexed_count[1]);
>   }
>
>   switch (hp_elog->resource) {
> @@ -409,7 +415,29 @@ static ssize_t dlpar_store(struct class *class, struct 
> class_attribute *attr,
>   goto dlpar_store_out;
>   }
>
> - if (!strncmp(arg, "index", 5)) {
> + if (!strncmp(arg, "indexed-count", 13)) {
> + u32 index, count;
> + char *cstr, *istr;
> +
> + hp_elog->id_type = PSERIES_HP_ELOG_ID_IC;
> + arg += strlen("indexed-count ");
> +
> + cstr = kstrdup(arg, GFP_KERNEL);
> + istr = strchr(cstr, ' ');
> + *istr++ = '\0';
> +
> + if (kstrtou32(cstr, 0, ) || kstrtou32(istr, 0, )) {
> + rc = -EINVAL;
> + pr_err("Invalid index or count : \"%s\"\n", buf);
> + kfree(cstr);
> + goto dlpar_store_out;
> + }
> +
> + kfree(cstr);
> +
> + hp_elog->_drc_u.indexed_count[0] = cpu_to_be32(count);
> + hp_elog->_drc_u.indexed_count[1] = cpu_to_be32(index);
> + } else if (!strncmp(arg, "index", 5)) {
>   u32 index;
>
>   hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 2ce1385..d7942ca 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -701,6 +701,83 @@ static int dlpar_memory_add_by_index(u32 drc_index, 
> struct property *prop)
>   return rc;
>  }
>
> +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index,
> +   struct property *prop)
> +{
> + struct of_drconf_cell *lmbs;
> + u32 num_lmbs, *p;
> + int i, rc;
> + int lmbs_available = 0, start_index = 0, end_index;
> +
> + pr_info("Attempting to hot-add %u LMB(s) at index %x\n",
> + lmbs_to_add, drc_index);
> +
> + if (lmbs_to_add == 0)
> + return -EINVAL;
> +
> + p = prop->value;
> + num_lmbs = *p++;
> + lmbs = (struct 

Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Kees Cook
On Tue, Jul 19, 2016 at 12:12 PM, Kees Cook  wrote:
> On Mon, Jul 18, 2016 at 6:52 PM, Laura Abbott  wrote:
>> On 07/15/2016 02:44 PM, Kees Cook wrote:
>>> +static inline const char *check_heap_object(const void *ptr, unsigned long n,
>>> +   bool to_user)
>>> +{
>>> +   struct page *page, *endpage;
>>> +   const void *end = ptr + n - 1;
>>> +
>>> +   if (!virt_addr_valid(ptr))
>>> +   return NULL;
>>> +
>>
>>
>> virt_addr_valid returns true on vmalloc addresses on arm64 which causes some
>> intermittent false positives (tab completion in a qemu buildroot environment
>> was showing it fairly reliably). I think this is an arm64 bug because
>> virt_addr_valid should return true if and only if virt_to_page returns the
>> corresponding page. We can work around this for now by explicitly
>> checking against is_vmalloc_addr.
>
> Hrm, that's weird. Sounds like a bug too, but I'll add a check for
> is_vmalloc_addr() to catch it for now.
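
A minimal sketch of that interim workaround, assuming it sits next to
the existing virt_addr_valid() test in check_heap_object():

    if (!virt_addr_valid(ptr))
        return NULL;

    /* arm64: virt_addr_valid() can wrongly return true for vmalloc
     * addresses, which have no valid virt_to_page() mapping. */
    if (is_vmalloc_addr(ptr))
        return NULL;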

BTW, if you were testing against -next, KASAN moved things around in
copy_*_user() in a way I wasn't expecting (__copy* and copy* now both
call __arch_copy* instead of copy* calling __copy*). I'll have this
fixed in the next version.

-Kees

-- 
Kees Cook
Chrome OS & Brillo Security

Re: [PATCH] linuxppc/devtree: Parse new DRC mem/cpu/dev device tree elements

2016-07-19 Thread Michael Bringmann
Responses to your remarks about the patch.  Note that I will repost it in
smaller segments later this week.

On 07/13/2016 03:41 PM, Nathan Fontenot wrote:
> On 06/30/2016 04:44 PM, Michael Bringmann wrote:
>> Several properties in the DRC device tree format are replaced by
>> more compact representations to allow, for example, for the encoding
>> of vast amounts of memory, and or reduced duplication of information
>> in related data structures.
>>
>> "ibm,drc-info": This property, when present, replaces the following
>> four properties: "ibm,drc-indexes", "ibm,drc-names", "ibm,drc-types"
>> and "ibm,drc-power-domains".  This property is defined for all
>> dynamically reconfigurable platform nodes.  The "ibm,drc-info" elements
>> are intended to provide a more compact representation, and reduce some
>> search overhead.
>>
>> "ibm,dynamic-memory-v2": This property replaces the "ibm,dynamic-memory"
>> node representation within the "ibm,dynamic-reconfiguration-memory"
>> property provided by the BMC.  This element format is intended to provide
> 
> BMC?

Just a term for the underlying platform.  I think that it came from a
conversation with another developer.  We can just use 'underlying platform'.

>> +#define DRCONF_V2_CELL_OFFSET(i)        ((i) * DRCONF_V2_CELLS_LEN)
>> +#define DRCONF_V2_CELL_POSITION(p, i)   \
>> +        (void *)(((char *)(p)) + ((i) * DRCONF_V2_CELLS_LEN))
>> +#define DYN_MEM_V2_LEN(entries) (((entries) * DRCONF_V2_CELLS_LEN) + \
>> +        (1 * sizeof(unsigned int)))
>> +
> 
> These should probably be functions instead of #defines, makes debugging
> the code easier.

6-of-1 or half-a-dozen to me.  The main reason that I made them macros was
to document the size calculation in one place, instead of having it embedded
in multiple locations in the code as was done for the 'ibm,dynamic-memory'
struct parsing.
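
For comparison, the static-inline form suggested here would be
something like (same calculation, just type-checked):

    static inline unsigned int drconf_v2_cell_offset(unsigned int i)
    {
        return i * DRCONF_V2_CELLS_LEN;
    }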

> 
>> +#define DRCONF_MEM_PRESERVED            0x0001
>> +#define DRCONF_MEM_PRESERVABLE          0x0002
>> +#define DRCONF_MEM_PRESERVED_STATE      0x0004
>> +#define DRCONF_MEM_ASSIGNED             0x0008
>> +#define DRCONF_MEM_NO_H_MIGRATE_DATA    0x0010
>> +#define DRCONF_MEM_DRC_INVALID          0x0020
>> +#define DRCONF_MEM_AI_INVALID           0x0040
>> +#define DRCONF_MEM_RESERVED             0x0080
>> +#define DRCONF_MEM_RESERVED_SW          0x8000
> 
> I'll let others chime in, but we don't use all of these flags, or plan
> to at this point so I'm not sure we need to include definitions for them.

I can cut down the list.  3 were previously defined in this file.  
> 

>>  /*
>> - * Retrieve and validate the ibm,dynamic-memory property of the device tree.
>> + * Read the next memblock set entry from the ibm,dynamic-memory-v2 property
> 
> Just saw this here, and see that it is used elsewhere. You may want to avoid
> using the term memblock, this already has a meaning in the kernel and may
> cause some confusion.
> 
> Still reviewing this patch, more comments as I review more.
> 
> -Nathan

'memblock' was used by the original comments for the 'ibm,dynamic-memory'
structures. I will change them.

> 

-- 
Michael W. Bringmann
Linux Technology Center
IBM Corporation
Tie-Line  363-5196
External: (512) 286-5196
Cell:   (512) 466-0650
m...@linux.vnet.ibm.com


Re: [PATCH] mm: Add is_migrate_cma_page

2016-07-19 Thread Kees Cook
On Tue, Jul 19, 2016 at 3:00 PM, Laura Abbott  wrote:
> Code such as hardened user copy[1] needs a way to tell if a
> page is CMA or not. Add is_migrate_cma_page in a similar way
> to is_migrate_isolate_page.
>
> [1]http://article.gmane.org/gmane.linux.kernel.mm/155238
>
> Signed-off-by: Laura Abbott 

Great, thanks!

> ---
> Here's an explicit patch, slightly different than what I posted before. It can
> be kept separate or folded in as needed.

Assuming there's no objection, I'll add it to my tree and use the new macro.

-Kees

> ---
>  include/linux/mmzone.h | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 02069c2..c8478b2 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -68,8 +68,10 @@ extern char * const migratetype_names[MIGRATE_TYPES];
>
>  #ifdef CONFIG_CMA
>  #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
> +#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
>  #else
>  #  define is_migrate_cma(migratetype) false
> +#  define is_migrate_cma_page(_page) false
>  #endif
>
>  #define for_each_migratetype_order(order, type) \
> --
> 2.7.4
>



-- 
Kees Cook
Chrome OS & Brillo Security

[PATCH] mm: Add is_migrate_cma_page

2016-07-19 Thread Laura Abbott
Code such as hardened user copy[1] needs a way to tell if a
page is CMA or not. Add is_migrate_cma_page in a similar way
to is_migrate_isolate_page.

[1]http://article.gmane.org/gmane.linux.kernel.mm/155238

Signed-off-by: Laura Abbott 
---
Here's an explicit patch, slightly different than what I posted before. It can
be kept separate or folded in as needed.
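
A sketch of the intended kind of call site (hypothetical helper; the
actual hardened usercopy hook may differ):

    /* true if the object at ptr sits in a CMA pageblock */
    static bool ptr_is_cma(const void *ptr)
    {
        struct page *page = virt_to_head_page(ptr);

        return is_migrate_cma_page(page);
    }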
---
 include/linux/mmzone.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 02069c2..c8478b2 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -68,8 +68,10 @@ extern char * const migratetype_names[MIGRATE_TYPES];
 
 #ifdef CONFIG_CMA
 #  define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
+#  define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
 #else
 #  define is_migrate_cma(migratetype) false
+#  define is_migrate_cma_page(_page) false
 #endif
 
 #define for_each_migratetype_order(order, type) \
-- 
2.7.4


Re: [PATCH 00/14] Present useful limits to user (v2)

2016-07-19 Thread Topi Miettinen
On 07/18/16 22:05, Doug Ledford wrote:
> On 7/15/2016 12:35 PM, Topi Miettinen wrote:
>> On 07/15/16 13:04, Balbir Singh wrote:
>>> On Fri, Jul 15, 2016 at 01:35:47PM +0300, Topi Miettinen wrote:
 Hello,

 There are many basic ways to control processes, including capabilities,
 cgroups and resource limits. However, there are far fewer ways to find out
 useful values for the limits, except blind trial and error.

 This patch series attempts to fix that by giving at least a nice starting
 point from the highwater mark values of the resources in question.
 I looked where each limit is checked and added a call to update the mark
 nearby.

 Example run of program from Documentation/accounting/getdelays.c:

 ./getdelays -R -p `pidof smartd`
 printing resource accounting
 RLIMIT_CPU=0
 RLIMIT_FSIZE=0
 RLIMIT_DATA=18198528
 RLIMIT_STACK=135168
 RLIMIT_CORE=0
 RLIMIT_RSS=0
 RLIMIT_NPROC=1
 RLIMIT_NOFILE=55
 RLIMIT_MEMLOCK=0
 RLIMIT_AS=130879488
 RLIMIT_LOCKS=0
 RLIMIT_SIGPENDING=0
 RLIMIT_MSGQUEUE=0
 RLIMIT_NICE=0
 RLIMIT_RTPRIO=0
 RLIMIT_RTTIME=0

 ./getdelays -R -C /sys/fs/cgroup/systemd/system.slice/smartd.service/
 printing resource accounting
 sleeping 1, blocked 0, running 0, stopped 0, uninterruptible 0
 RLIMIT_CPU=0
 RLIMIT_FSIZE=0
 RLIMIT_DATA=18198528
 RLIMIT_STACK=135168
 RLIMIT_CORE=0
 RLIMIT_RSS=0
 RLIMIT_NPROC=1
 RLIMIT_NOFILE=55
 RLIMIT_MEMLOCK=0
 RLIMIT_AS=130879488
 RLIMIT_LOCKS=0
 RLIMIT_SIGPENDING=0
 RLIMIT_MSGQUEUE=0
 RLIMIT_NICE=0
 RLIMIT_RTPRIO=0
 RLIMIT_RTTIME=0
>>>
>>> Does this mean that rlimit_data and rlimit_stack should be set to the
>>> values as specified by the data above?
>>
>> My plan is that either system administrator, distro maintainer or even
>> upstream developer can get reasonable values for the limits. They may
>> still be wrong, but things would be better than without any help to
>> configure the system.
> 
> This is not necessarily true.  It seems like there is a disconnect
> between what these various values are for and what you are positioning
> them as.  Most of these limits are meant to protect the system from
> resource starvation crashes.  They aren't meant to be any sort of double
> check on a specific application.  The vast majority of applications can
> have bugs, leak resources, and do all sorts of other bad things and
> still not hit these limits.  A program that leaks a file handle an hour
> but only normally has 50 handles in use would take 950 hours of constant
> leaking before these limits would kick in to bring the program under
> control.  That's over a month.  What's more though, the kernel couldn't
> really care less that a single application leaked files until it got to
> 1000 open.  The real point of the limit on file handles (since they are
> cheap) is just not to let the system get brought down.  Someone could
> maliciously fire up 1000 processes, and they could all attempt to open
> up as many files as possible in order to drown the system in open
> inodes.  The combination of the limit on maximum user processes and
> maximum files per process are intended to prevent this.  They are not
> intended to prevent a single, properly running application from
> operating.  In fact, there are very few applications that are likely to
> break the 1000 file per process limit.  It is outrageously high for most
> applications.  They will leak files and do all sorts of bad things
> without this ever stopping them.  But it does stop malicious programs.
> And the process limit stops malicious users too.  The max locked memory
> is used by almost no processes, and for the very few that use it, the
> default is more than enough.  The major exception is the RDMA stack,
> which uses it so much that we just disable it on large systems because
> it's impossible to predict how much we'll need and we don't want a job
> to get killed because it couldn't get the memory it needs for buffers.
> The limit on POSIX message queues is another one where it's more than
> enough for most applications which don't use this feature at all, and
> the few systems that use this feature adjust the limit to something sane
> on their system (we can't make the default sane for these special
> systems or else it becomes an avenue for Denial of Service attack, so
> the default must stay low and servers that make extensive use of this
> feature must up their limit on a case by case basis).
> 
>>>
>>> Do we expect a smart user space daemon to then tweak the RLIMIT values?
>>
>> Someone could write an autotuning daemon that checks if the system has
>> changed (for example due to upgrade) and then run some tests to
>> reconfigure the system. But the limits are a bit too fragile, or rather,
>> applications can't handle failure, so I don't know if that would really
>> work.
> 
> This misses the point 
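
For reference, a userspace sketch of the autotuning idea discussed
above (hypothetical policy: seed a soft limit from an observed
highwater mark with 2x headroom):

    #include <sys/resource.h>

    static int seed_limit(int resource, rlim_t highwater)
    {
        struct rlimit rl;

        if (getrlimit(resource, &rl))
            return -1;
        rl.rlim_cur = highwater * 2;    /* the headroom is a guess */
        if (rl.rlim_max != RLIM_INFINITY && rl.rlim_cur > rl.rlim_max)
            rl.rlim_cur = rl.rlim_max;
        return setrlimit(resource, &rl);
    }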

Re: Suspected regression?

2016-07-19 Thread Scott Wood
On Tue, 2016-07-19 at 12:00 +0200, Alessio Igor Bogani wrote:
> Hi all,
> 
> I have got two boards MVME5100 (MPC7410 cpu) and MVME7100 (MPC8641D
> cpu) for which I use the same cross-compiler (ppc7400).
> 
> I tested these against kernel HEAD and found that they don't boot
> anymore (PID 1 crash).
> 
> Bisecting results in first offending commit:
> 7aef4136566b0539a1a98391181e188905e33401
> 
> Removing it from HEAD makes the boards boot properly again.
> 
> A third system based on P2010 isn't affected at all.
> 
> Is it a regression or I have made something wrong?

I booted both my next branch, and Linus's master on MPC8641HPCN and didn't see
this -- though possibly your RFS is doing something different.  Maybe that's
the difference with P2010 as well.

Is there any way you can debug the cause of the crash?  Or send me a minimal
RFS that demonstrates the problem (ideally with debug symbols on the userspace
binaries)?

-Scott


Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Christian Borntraeger
On 07/19/2016 10:34 PM, Kees Cook wrote:
[...]
>>
>> So what about for the CONFIG text:
>>
>>An architecture should select this if the kernel mapping has a 
>> secondary
>>linear mapping of the kernel text - in other words more than one 
>> virtual
>>kernel address that points to the kernel image. This is used to verify
>>that kernel text exposures are not visible under 
>> CONFIG_HARDENED_USERCOPY.
> 
> Sounds good, I've adjusted it for now.
> 
>>> I wonder if I can avoid the CONFIG entirely if I just did a
>>> __va(__pa(_stext)) != _stext test... would that break anyone?
>>
>> Can this be resolved on all platforms at compile time?
> 
> Well, I think it still needs a runtime check (compile-time may not be
> able to tell about kaslr, or who knows what else). I would really like
> to avoid the CONFIG if possible, though. Would this do the right thing
> on s390? This appears to work where I'm able to test it (32/64 x86,
> 32/64 arm):
> 
> unsigned long textlow = (unsigned long)_stext;
> unsigned long texthigh = (unsigned long)_etext;
> unsigned long textlow_linear = (unsigned long)__va(__pa(textlow));
> unsigned long texthigh_linear = (unsigned long)__va(__pa(texthigh));
> 
as we have

#define PAGE_OFFSET 0x0UL
#define __pa(x) (unsigned long)(x)
#define __va(x) (void *)(unsigned long)(x)

both should be identical on s390 as of today, so it should work fine and only
do the check once

> if (overlaps(ptr, n, textlow, texthigh))
> return "";
> 
> /* Check against possible secondary linear mapping as well. */
> if (textlow != textlow_linear &&
> overlaps(ptr, n, textlow_linear, texthigh_linear))
> return "";
> 
> return NULL;
> 
> 
> -Kees
> 


PS: Not sure how useful and flexible this offer is, but you can get some
temporary free access to an s390 on https://developer.ibm.com/linuxone/





Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Kees Cook
On Tue, Jul 19, 2016 at 1:14 PM, Christian Borntraeger
 wrote:
> On 07/19/2016 09:31 PM, Kees Cook wrote:
>> On Tue, Jul 19, 2016 at 2:21 AM, Christian Borntraeger
>>  wrote:
>>> On 07/15/2016 11:44 PM, Kees Cook wrote:
 +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
 + bool
 + help
 +   An architecture should select this if it has a secondary linear
 +   mapping of the kernel text. This is used to verify that kernel
 +   text exposures are not visible under CONFIG_HARDENED_USERCOPY.
>>>
>>> I have trouble parsing this. (What does secondary linear mapping mean?)
>>
>> I likely need help clarifying this language...
>>
>>> So let me give an example below
>>>
 +
>>> [...]
 +/* Is this address range in the kernel text area? */
 +static inline const char *check_kernel_text_object(const void *ptr,
 +unsigned long n)
 +{
 + unsigned long textlow = (unsigned long)_stext;
 + unsigned long texthigh = (unsigned long)_etext;
 +
 + if (overlaps(ptr, n, textlow, texthigh))
 + return "";
 +
 +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING
 + /* Check against linear mapping as well. */
 + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)),
 +  (unsigned long)__va(__pa(texthigh
 + return "";
 +#endif
 +
 + return NULL;
 +}
>>>
>>> s390 has an address space for user (primary address space from 0..4TB/8PB) 
>>> and a separate
>>> address space (home space from 0..4TB/8PB) for the kernel. In this home 
>>> space the kernel
>>> mapping is virtual containing the physical memory as well as vmalloc memory 
>>> (creating aliases
>>> into the physical one). The kernel text is mapped from _stext to _etext in 
>>> this mapping.
>>> So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ?
>>
>> If I understand your example, yes. In the home space you have two
>> addresses that reference the kernel image?
>
> No, there is only one address that points to the kernel.
> As we have no kernel ASLR yet, and the kernel mapping is
> a 1:1 mapping from 0 to memory end and the kernel is only
> from _stext to _etext. The vmalloc area contains modules
> and vmalloc but not a 2nd kernel mapping.
>
> But thanks for your example, now I understood. If we have only
> one address
 + if (overlaps(ptr, n, textlow, texthigh))
 + return "";
>
> This is just enough.
>
> So what about for the CONFIG text:
>
>An architecture should select this if the kernel mapping has a 
> secondary
>linear mapping of the kernel text - in other words more than one 
> virtual
>kernel address that points to the kernel image. This is used to verify
>that kernel text exposures are not visible under 
> CONFIG_HARDENED_USERCOPY.

Sounds good, I've adjusted it for now.

>> I wonder if I can avoid the CONFIG entirely if I just did a
>> __va(__pa(_stext)) != _stext test... would that break anyone?
>
> Can this be resolved on all platforms at compile time?

Well, I think it still needs a runtime check (compile-time may not be
able to tell about kaslr, or who knows what else). I would really like
to avoid the CONFIG if possible, though. Would this do the right thing
on s390? This appears to work where I'm able to test it (32/64 x86,
32/64 arm):

unsigned long textlow = (unsigned long)_stext;
unsigned long texthigh = (unsigned long)_etext;
unsigned long textlow_linear = (unsigned long)__va(__pa(textlow));
unsigned long texthigh_linear = (unsigned long)__va(__pa(texthigh));

if (overlaps(ptr, n, textlow, texthigh))
return "";

/* Check against possible secondary linear mapping as well. */
if (textlow != textlow_linear &&
overlaps(ptr, n, textlow_linear, texthigh_linear))
return "";

return NULL;


-Kees

-- 
Kees Cook
Chrome OS & Brillo Security

Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Christian Borntraeger
On 07/19/2016 09:31 PM, Kees Cook wrote:
> On Tue, Jul 19, 2016 at 2:21 AM, Christian Borntraeger
>  wrote:
>> On 07/15/2016 11:44 PM, Kees Cook wrote:
>>> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
>>> + bool
>>> + help
>>> +   An architecture should select this if it has a secondary linear
>>> +   mapping of the kernel text. This is used to verify that kernel
>>> +   text exposures are not visible under CONFIG_HARDENED_USERCOPY.
>>
>> I have trouble parsing this. (What does secondary linear mapping mean?)
> 
> I likely need help clarifying this language...
> 
>> So let me give an example below
>>
>>> +
>> [...]
>>> +/* Is this address range in the kernel text area? */
>>> +static inline const char *check_kernel_text_object(const void *ptr,
>>> +unsigned long n)
>>> +{
>>> + unsigned long textlow = (unsigned long)_stext;
>>> + unsigned long texthigh = (unsigned long)_etext;
>>> +
>>> + if (overlaps(ptr, n, textlow, texthigh))
>>> + return "";
>>> +
>>> +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING
>>> + /* Check against linear mapping as well. */
>>> + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)),
>>> +  (unsigned long)__va(__pa(texthigh
>>> + return "";
>>> +#endif
>>> +
>>> + return NULL;
>>> +}
>>
>> s390 has an address space for user (primary address space from 0..4TB/8PB) 
>> and a separate
>> address space (home space from 0..4TB/8PB) for the kernel. In this home 
>> space the kernel
>> mapping is virtual containing the physical memory as well as vmalloc memory 
>> (creating aliases
>> into the physical one). The kernel text is mapped from _stext to _etext in 
>> this mapping.
>> So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ?
> 
> If I understand your example, yes. In the home space you have two
> addresses that reference the kernel image?

No, there is only one address that points to the kernel.
As we have no kernel ASLR yet, and the kernel mapping is 
a 1:1 mapping from 0 to memory end and the kernel is only
from _stext to _etext. The vmalloc area contains modules
and vmalloc but not a 2nd kernel mapping.

But thanks for your example, now I understood. If we have only
one address 
>>> + if (overlaps(ptr, n, textlow, texthigh))
>>> + return "";

This is just enough.

So what about for the CONFIG text:

   An architecture should select this if the kernel mapping has a secondary
   linear mapping of the kernel text - in other words more than one virtual
   kernel address that points to the kernel image. This is used to verify 
   that kernel text exposures are not visible under 
CONFIG_HARDENED_USERCOPY.


> I wonder if I can avoid the CONFIG entirely if I just did a
> __va(__pa(_stext)) != _stext test... would that break anyone?

Can this be resolved on all platforms at compile time?


Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Kees Cook
On Tue, Jul 19, 2016 at 2:21 AM, Christian Borntraeger
 wrote:
> On 07/15/2016 11:44 PM, Kees Cook wrote:
>> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
>> + bool
>> + help
>> +   An architecture should select this if it has a secondary linear
>> +   mapping of the kernel text. This is used to verify that kernel
>> +   text exposures are not visible under CONFIG_HARDENED_USERCOPY.
>
> I have trouble parsing this. (What does secondary linear mapping mean?)

I likely need help clarifying this language...

> So let me give an example below
>
>> +
> [...]
>> +/* Is this address range in the kernel text area? */
>> +static inline const char *check_kernel_text_object(const void *ptr,
>> +unsigned long n)
>> +{
>> + unsigned long textlow = (unsigned long)_stext;
>> + unsigned long texthigh = (unsigned long)_etext;
>> +
>> + if (overlaps(ptr, n, textlow, texthigh))
>> + return "";
>> +
>> +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING
>> + /* Check against linear mapping as well. */
>> + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)),
>> +  (unsigned long)__va(__pa(texthigh
>> + return "";
>> +#endif
>> +
>> + return NULL;
>> +}
>
> s390 has an address space for user (primary address space from 0..4TB/8PB) 
> and a separate
> address space (home space from 0..4TB/8PB) for the kernel. In this home space 
> the kernel
> mapping is virtual containing the physical memory as well as vmalloc memory 
> (creating aliases
> into the physical one). The kernel text is mapped from _stext to _etext in 
> this mapping.
> So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING ?

If I understand your example, yes. In the home space you have two
addresses that reference the kernel image? The intent is that if
__va(__pa(_stext)) != _stext, there's a linear mapping of physical
memory in the virtual memory range. On x86_64, the kernel is visible
in two locations in virtual memory. The kernel start in physical
memory address 0x01000000 maps to virtual address 0xffff880001000000,
and the "regular" virtual memory kernel address is at
0xffffffff81000000:

# grep Kernel /proc/iomem
  01000000-01a59767 : Kernel code
  01a59768-0213d77f : Kernel data
  02280000-02fdefff : Kernel bss

# grep startup_64 /proc/kallsyms
ffffffff81000000 T startup_64

# less /sys/kernel/debug/kernel_page_tables
...
---[ Low Kernel Mapping ]---
...
0xffff880001000000-0xffff880001a00000          10M ro PSE GLB NX pmd
0xffff880001a00000-0xffff880001a5c000         368K ro     GLB NX pte
0xffff880001a5c000-0xffff880001c00000        1680K RW     GLB NX pte
...
---[ High Kernel Mapping ]---
...
0xffffffff81000000-0xffffffff81a00000          10M ro PSE GLB x  pmd
0xffffffff81a00000-0xffffffff81a5c000         368K ro     GLB x  pte
0xffffffff81a5c000-0xffffffff81c00000        1680K RW     GLB NX pte
...

I wonder if I can avoid the CONFIG entirely if I just did a
__va(__pa(_stext)) != _stext test... would that break anyone?
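
For reference, the overlaps() helper used above is not shown in the
quoted hunks; a minimal sketch consistent with how it is called:

    /* true if [ptr, ptr + n) intersects [low, high) */
    static inline bool overlaps(const void *ptr, unsigned long n,
                                unsigned long low, unsigned long high)
    {
        unsigned long check_low = (unsigned long)ptr;
        unsigned long check_high = check_low + n;

        return check_low < high && check_high > low;
    }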

-Kees

-- 
Kees Cook
Chrome OS & Brillo Security

Re: [PATCH V4 5/5] powerpc/kvm/stats: Implement existing and add new halt polling vcpu stats

2016-07-19 Thread David Matlack
On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singh
 wrote:
> vcpu stats are used to collect information about a vcpu which can be viewed
> in the debugfs. For example halt_attempted_poll and halt_successful_poll
> are used to keep track of the number of times the vcpu attempts to and
> successfully polls. These stats are currently not used on powerpc.
>
> Implement incrementation of the halt_attempted_poll and
> halt_successful_poll vcpu stats for powerpc. Since these stats are summed
> over all the vcpus for all running guests it doesn't matter which vcpu
> they are attributed to, thus we choose the current runner vcpu of the
> vcore.
>
> Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and
> halt_wait_ns to be used to accumulate the total time spent polling
> successfully, polling unsuccessfully and waiting respectively, and
> halt_successful_wait to accumulate the number of times the vcpu waits.
> Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are
> expressed in nanoseconds it is necessary to represent these as 64-bit
> quantities, otherwise they would overflow after only about 4 seconds.
>
> Given that the total time spent either polling or waiting will be known and
> the number of times that each was done, it will be possible to determine
> the average poll and wait times. This will give the ability to tune the kvm
> module parameters based on the calculated average wait and poll times.
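
For example, the averages then fall out of the debugfs values as

    avg_poll_ns = (halt_poll_success_ns + halt_poll_fail_ns)
                    / halt_attempted_poll
    avg_wait_ns = halt_wait_ns / halt_successful_wait
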
>
> Signed-off-by: Suraj Jitindar Singh 

Reviewed-by: David Matlack 

>
> ---
> Change Log:
>
> V3 -> V4:
> - Instead of accounting just wait and poll time, separate these
>   into successful_poll_time, failed_poll_time and wait_time.
> ---
>  arch/powerpc/include/asm/kvm_host.h |  4 
>  arch/powerpc/kvm/book3s.c   |  4 
>  arch/powerpc/kvm/book3s_hv.c| 36 +++-
>  3 files changed, 39 insertions(+), 5 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index f6304c5..f15ffc0 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -114,8 +114,12 @@ struct kvm_vcpu_stat {
> u64 emulated_inst_exits;
> u64 dec_exits;
> u64 ext_intr_exits;
> +   u64 halt_poll_success_ns;
> +   u64 halt_poll_fail_ns;
> +   u64 halt_wait_ns;
> u64 halt_successful_poll;
> u64 halt_attempted_poll;
> +   u64 halt_successful_wait;
> u64 halt_poll_invalid;
> u64 halt_wakeup;
> u64 dbell_exits;
> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
> index 47018fc..71eb8f3 100644
> --- a/arch/powerpc/kvm/book3s.c
> +++ b/arch/powerpc/kvm/book3s.c
> @@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
> { "dec", VCPU_STAT(dec_exits) },
> { "ext_intr",VCPU_STAT(ext_intr_exits) },
> { "queue_intr",  VCPU_STAT(queue_intr) },
> +   { "halt_poll_success_ns",   VCPU_STAT(halt_poll_success_ns) },
> +   { "halt_poll_fail_ns",  VCPU_STAT(halt_poll_fail_ns) },
> +   { "halt_wait_ns",   VCPU_STAT(halt_wait_ns) },
> { "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
> { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
> +   { "halt_successful_wait",   VCPU_STAT(halt_successful_wait) },
> { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
> { "halt_wakeup", VCPU_STAT(halt_wakeup) },
> { "pf_storage",  VCPU_STAT(pf_storage) },
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index a9de1d4..81072f2 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -2679,15 +2679,16 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
>   */
>  static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
>  {
> +   ktime_t cur, start_poll, start_wait;
> int do_sleep = 1;
> -   ktime_t cur, start;
> u64 block_ns;
> DECLARE_SWAITQUEUE(wait);
>
> /* Poll for pending exceptions and ceded state */
> -   cur = start = ktime_get();
> +   cur = start_poll = ktime_get();
> if (vc->halt_poll_ns) {
> -   ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
> +   ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
> +   ++vc->runner->stat.halt_attempted_poll;
>
> vc->vcore_state = VCORE_POLLING;
> spin_unlock(&vc->lock);
> @@ -2703,8 +2704,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
> spin_lock(&vc->lock);
> vc->vcore_state = VCORE_INACTIVE;
>
> -   if (!do_sleep)
> +   if (!do_sleep) {
> +   ++vc->runner->stat.halt_successful_poll;
> goto out;

Re: [PATCH V4 4/5] kvm/stats: Add provisioning for ulong vm stats and u64 vcpu stats

2016-07-19 Thread David Matlack via Linuxppc-dev
On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singh
 wrote:
> vms and vcpus have statistics associated with them which can be viewed
> within the debugfs. Currently it is assumed within the vcpu_stat_get() and
> vm_stat_get() functions that all of these statistics are represented as
> u32s, however the next patch adds some u64 vcpu statistics.
>
> Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly.
> Since vcpu statistics are per vcpu, they will only be updated by a single
> vcpu at a time so this shouldn't present a problem on 32-bit machines
> which can't atomically increment 64-bit numbers. However vm statistics
> could potentially be updated by multiple vcpus from that vm at a time.
> To avoid the overhead of atomics make all vm statistics ulong such that
> they are 64-bit on 64-bit systems where they can be atomically incremented
> and are 32-bit on 32-bit systems which may not be able to atomically
> increment 64-bit numbers. Modify vm_stat_get() to expect ulongs.
>
> Signed-off-by: Suraj Jitindar Singh 

Looks great, thanks.

Reviewed-by: David Matlack 

>
> ---
> Change Log:
>
> V2 -> V3:
> - Instead of implementing separate u32 and u64 functions keep the
>   generic functions and modify them to expect u64s. Thus update all
>   vm and vcpu statistics to u64s accordingly.
> V3 -> V4:
> - Change vm_stats from u64 to ulong
> ---
>  arch/arm/include/asm/kvm_host.h |  12 ++--
>  arch/arm64/include/asm/kvm_host.h   |  12 ++--
>  arch/mips/include/asm/kvm_host.h|  46 ++---
>  arch/powerpc/include/asm/kvm_host.h |  60 -
>  arch/s390/include/asm/kvm_host.h| 128 
> ++--
>  arch/x86/include/asm/kvm_host.h |  72 ++--
>  virt/kvm/kvm_main.c |   4 +-
>  7 files changed, 167 insertions(+), 167 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
> index 96387d4..c8e55b3b 100644
> --- a/arch/arm/include/asm/kvm_host.h
> +++ b/arch/arm/include/asm/kvm_host.h
> @@ -183,15 +183,15 @@ struct kvm_vcpu_arch {
>  };
>
>  struct kvm_vm_stat {
> -   u32 remote_tlb_flush;
> +   ulong remote_tlb_flush;
>  };
>
>  struct kvm_vcpu_stat {
> -   u32 halt_successful_poll;
> -   u32 halt_attempted_poll;
> -   u32 halt_poll_invalid;
> -   u32 halt_wakeup;
> -   u32 hvc_exit_stat;
> +   u64 halt_successful_poll;
> +   u64 halt_attempted_poll;
> +   u64 halt_poll_invalid;
> +   u64 halt_wakeup;
> +   u64 hvc_exit_stat;
> u64 wfe_exit_stat;
> u64 wfi_exit_stat;
> u64 mmio_exit_user;
> diff --git a/arch/arm64/include/asm/kvm_host.h 
> b/arch/arm64/include/asm/kvm_host.h
> index 49095fc..b14c8bc 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -291,15 +291,15 @@ struct kvm_vcpu_arch {
>  #endif
>
>  struct kvm_vm_stat {
> -   u32 remote_tlb_flush;
> +   ulong remote_tlb_flush;
>  };
>
>  struct kvm_vcpu_stat {
> -   u32 halt_successful_poll;
> -   u32 halt_attempted_poll;
> -   u32 halt_poll_invalid;
> -   u32 halt_wakeup;
> -   u32 hvc_exit_stat;
> +   u64 halt_successful_poll;
> +   u64 halt_attempted_poll;
> +   u64 halt_poll_invalid;
> +   u64 halt_wakeup;
> +   u64 hvc_exit_stat;
> u64 wfe_exit_stat;
> u64 wfi_exit_stat;
> u64 mmio_exit_user;
> diff --git a/arch/mips/include/asm/kvm_host.h 
> b/arch/mips/include/asm/kvm_host.h
> index 36a391d..9704888 100644
> --- a/arch/mips/include/asm/kvm_host.h
> +++ b/arch/mips/include/asm/kvm_host.h
> @@ -98,32 +98,32 @@ extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
>  extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
>
>  struct kvm_vm_stat {
> -   u32 remote_tlb_flush;
> +   ulong remote_tlb_flush;
>  };
>
>  struct kvm_vcpu_stat {
> -   u32 wait_exits;
> -   u32 cache_exits;
> -   u32 signal_exits;
> -   u32 int_exits;
> -   u32 cop_unusable_exits;
> -   u32 tlbmod_exits;
> -   u32 tlbmiss_ld_exits;
> -   u32 tlbmiss_st_exits;
> -   u32 addrerr_st_exits;
> -   u32 addrerr_ld_exits;
> -   u32 syscall_exits;
> -   u32 resvd_inst_exits;
> -   u32 break_inst_exits;
> -   u32 trap_inst_exits;
> -   u32 msa_fpe_exits;
> -   u32 fpe_exits;
> -   u32 msa_disabled_exits;
> -   u32 flush_dcache_exits;
> -   u32 halt_successful_poll;
> -   u32 halt_attempted_poll;
> -   u32 halt_poll_invalid;
> -   u32 halt_wakeup;
> +   u64 wait_exits;
> +   u64 cache_exits;
> +   u64 signal_exits;
> +   u64 int_exits;
> +   u64 cop_unusable_exits;
> +   u64 tlbmod_exits;
> +   u64 tlbmiss_ld_exits;
> +   u64 tlbmiss_st_exits;
> +   u64 addrerr_st_exits;
> +   u64 addrerr_ld_exits;
> +   u64 syscall_exits;
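
For reference, the generic debugfs readers end up shaped roughly like this
after the change (a simplified sketch of the virt/kvm/kvm_main.c side;
per-VM stats are read as ulong, per-vcpu stats as u64):

static int vm_stat_get(void *_offset, u64 *val)
{
	unsigned offset = (long)_offset;
	struct kvm *kvm;

	*val = 0;
	spin_lock(&kvm_lock);
	list_for_each_entry(kvm, &vm_list, vm_list)
		/* ulong: a single native-word load on 32- and 64-bit */
		*val += *(ulong *)((void *)kvm + offset);
	spin_unlock(&kvm_lock);
	return 0;
}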

Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Kees Cook
On Mon, Jul 18, 2016 at 6:52 PM, Laura Abbott  wrote:
> On 07/15/2016 02:44 PM, Kees Cook wrote:
>>
>> This is the start of porting PAX_USERCOPY into the mainline kernel. This
>> is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The
>> work is based on code by PaX Team and Brad Spengler, and an earlier port
>> from Casey Schaufler. Additional non-slab page tests are from Rik van
>> Riel.
>>
>> This patch contains the logic for validating several conditions when
>> performing copy_to_user() and copy_from_user() on the kernel object
>> being copied to/from:
>> - address range doesn't wrap around
>> - address range isn't NULL or zero-allocated (with a non-zero copy size)
>> - if on the slab allocator:
>>   - object size must be less than or equal to copy size (when check is
>> implemented in the allocator, which appear in subsequent patches)
>> - otherwise, object must not span page allocations
>> - if on the stack
>>   - object must not extend before/after the current process task
>>   - object must be contained by the current stack frame (when there is
>> arch/build support for identifying stack frames)
>> - object must not overlap with kernel text
>>
>> Signed-off-by: Kees Cook 
>> Tested-By: Valdis Kletnieks 
>> Tested-by: Michael Ellerman 
>> ---
>>  arch/Kconfig|   7 ++
>>  include/linux/slab.h|  12 +++
>>  include/linux/thread_info.h |  15 +++
>>  mm/Makefile |   4 +
>>  mm/usercopy.c   | 234
>> 
>>  security/Kconfig|  28 ++
>>  6 files changed, 300 insertions(+)
>>  create mode 100644 mm/usercopy.c
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 5e2776562035..195ee4cc939a 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -433,6 +433,13 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES
>>   and similar) by implementing an inline
>> arch_within_stack_frames(),
>>   which is used by CONFIG_HARDENED_USERCOPY.
>>
>> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
>> +   bool
>> +   help
>> + An architecture should select this if it has a secondary linear
>> + mapping of the kernel text. This is used to verify that kernel
>> + text exposures are not visible under CONFIG_HARDENED_USERCOPY.
>> +
>>  config HAVE_CONTEXT_TRACKING
>> bool
>> help
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index aeb3e6d00a66..96a16a3fb7cb 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -155,6 +155,18 @@ void kfree(const void *);
>>  void kzfree(const void *);
>>  size_t ksize(const void *);
>>
>> +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
>> +const char *__check_heap_object(const void *ptr, unsigned long n,
>> +   struct page *page);
>> +#else
>> +static inline const char *__check_heap_object(const void *ptr,
>> + unsigned long n,
>> + struct page *page)
>> +{
>> +   return NULL;
>> +}
>> +#endif
>> +
>>  /*
>>   * Some archs want to perform DMA into kmalloc caches and need a
>> guaranteed
>>   * alignment larger than the alignment of a 64-bit integer.
>> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
>> index 3d5c80b4391d..f24b99eac969 100644
>> --- a/include/linux/thread_info.h
>> +++ b/include/linux/thread_info.h
>> @@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void
>> * const stack,
>>  }
>>  #endif
>>
>> +#ifdef CONFIG_HARDENED_USERCOPY
>> +extern void __check_object_size(const void *ptr, unsigned long n,
>> +   bool to_user);
>> +
>> +static inline void check_object_size(const void *ptr, unsigned long n,
>> +bool to_user)
>> +{
>> +   __check_object_size(ptr, n, to_user);
>> +}
>> +#else
>> +static inline void check_object_size(const void *ptr, unsigned long n,
>> +bool to_user)
>> +{ }
>> +#endif /* CONFIG_HARDENED_USERCOPY */
>> +
>>  #endif /* __KERNEL__ */
>>
>>  #endif /* _LINUX_THREAD_INFO_H */
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 78c6f7dedb83..32d37247c7e5 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
>>  KCOV_INSTRUMENT_mmzone.o := n
>>  KCOV_INSTRUMENT_vmstat.o := n
>>
>> +# Since __builtin_frame_address does work as used, disable the warning.
>> +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
>> +
>>  mmu-y  := nommu.o
>>  mmu-$(CONFIG_MMU)  := gup.o highmem.o memory.o mincore.o \
>>mlock.o mmap.o mprotect.o mremap.o msync.o
>> rmap.o \
>> @@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
>>  obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
>>  
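
A minimal sketch of the wrap-around and overlap tests the list above
describes (helper names are illustrative, not necessarily the patch's
internals):

/* "address range doesn't wrap around" */
static inline bool wraps(const void *ptr, unsigned long n)
{
	return (unsigned long)ptr + n < (unsigned long)ptr;
}

static inline bool overlaps(const void *ptr, unsigned long n,
			    unsigned long low, unsigned long high)
{
	unsigned long check_low = (unsigned long)ptr;
	unsigned long check_high = check_low + n;

	/* No overlap if the range is entirely above or entirely below. */
	if (check_low >= high || check_high < low)
		return false;

	return true;
}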

Re: [PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module

2016-07-19 Thread David Matlack via Linuxppc-dev
On Tue, Jul 19, 2016 at 1:12 AM, Suraj Jitindar Singh
 wrote:
> This patch introduces new halt polling functionality into the kvm_hv kernel
> module. When a vcore is idle it will poll for some period of time before
> scheduling itself out.
>
> When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
> idle) we schedule ourselves out to allow something else to run. In the
> event that we need to wake up very quickly (for example an interrupt
> arrives), we are required to wait until we get scheduled again.
>
> Implement halt polling so that when a vcore is idle, and before scheduling
> ourselves, we poll for vcpus in the runnable_threads list which have
> pending exceptions or which leave the ceded state. If we poll successfully
> then we can get back into the guest very quickly without ever scheduling
> ourselves, otherwise we schedule ourselves out as before.
>
> Testing of this patch with a TCP round robin test between two guests with
> virtio network interfaces has found a decrease in round trip time of ~15us
> on average. A performance gain is only seen when going out of and
> back into the guest often and quickly, otherwise there is no net benefit
> from the polling. The polling interval is adjusted such that when we are
> often scheduled out for long periods of time it is reduced, and when we
> often poll successfully it is increased. The rate at which the polling
> interval increases or decreases, and the maximum polling interval, can
> be set through module parameters.
>
> Based on the implementation in the generic kvm module by Wanpeng Li and
> Paolo Bonzini, and on direction from Paul Mackerras.
>
> Signed-off-by: Suraj Jitindar Singh 
> ---
>  arch/powerpc/include/asm/kvm_book3s.h |   1 +
>  arch/powerpc/include/asm/kvm_host.h   |   1 +
>  arch/powerpc/kvm/book3s_hv.c  | 116 
> ++
>  arch/powerpc/kvm/trace_hv.h   |  22 +++
>  4 files changed, 126 insertions(+), 14 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
> b/arch/powerpc/include/asm/kvm_book3s.h
> index 151f817..c261f52 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -102,6 +102,7 @@ struct kvmppc_vcore {
> ulong pcr;
> ulong dpdes;/* doorbell state (POWER8) */
> ulong conferring_threads;
> +   unsigned int halt_poll_ns;
>  };
>
>  struct kvmppc_vcpu_book3s {
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index 02d06e9..610f393 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -294,6 +294,7 @@ struct kvm_arch {
>  #define VCORE_SLEEPING 3
>  #define VCORE_RUNNING  4
>  #define VCORE_EXITING  5
> +#define VCORE_POLLING  6
>
>  /*
>   * Struct used to manage memory for a virtual processor area
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3bcf9e6..a9de1d4 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, 
> &h_ipi_redirect,
>  MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host 
> core");
>  #endif
>
> +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
> +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
> +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
> +
> +/* Factor by which the vcore halt poll interval is grown, default is to 
> double
> + */
> +static unsigned int halt_poll_ns_grow = 2;
> +module_param(halt_poll_ns_grow, int, S_IRUGO);
> +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
> +
> +/* Factor by which the vcore halt poll interval is shrunk, default is to 
> reset
> + */
> +static unsigned int halt_poll_ns_shrink;
> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
> +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
> +
>  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
>  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
>
> @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore 
> *vc,
> finish_wait(&vcpu->arch.cpu_run, &wait);
>  }
>
> +static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
> +{
> +   /* 10us base */
> +   if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
> +   vc->halt_poll_ns = 10000;
> +   else
> +   vc->halt_poll_ns *= halt_poll_ns_grow;
> +
> +   if (vc->halt_poll_ns > halt_poll_max_ns)
> +   vc->halt_poll_ns = halt_poll_max_ns;
> +}
> +
> +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
> +{
> +   if (halt_poll_ns_shrink == 0)
> +   vc->halt_poll_ns = 0;
> +   else
> +   vc->halt_poll_ns /= halt_poll_ns_shrink;
> +}
> +
> +/* Check 
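
Of the three module parameters above, only halt_poll_max_ns is declared
runtime-writable (S_IWUSR); a hypothetical userspace sketch for tuning it,
assuming the usual sysfs path for the kvm_hv module:

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *p = "/sys/module/kvm_hv/parameters/halt_poll_max_ns";
	int fd = open(p, O_WRONLY);

	if (fd < 0)
		return 1;
	write(fd, "200000", 6);		/* cap vcore polling at 200us */
	close(fd);
	return 0;
}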

Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Kees Cook
On Mon, Jul 18, 2016 at 6:06 PM, Laura Abbott  wrote:
> On 07/15/2016 02:44 PM, Kees Cook wrote:
>>
>> This is the start of porting PAX_USERCOPY into the mainline kernel. This
>> is the first set of features, controlled by CONFIG_HARDENED_USERCOPY. The
>> work is based on code by PaX Team and Brad Spengler, and an earlier port
>> from Casey Schaufler. Additional non-slab page tests are from Rik van
>> Riel.
>>
>> This patch contains the logic for validating several conditions when
>> performing copy_to_user() and copy_from_user() on the kernel object
>> being copied to/from:
>> - address range doesn't wrap around
>> - address range isn't NULL or zero-allocated (with a non-zero copy size)
>> - if on the slab allocator:
>>   - object size must be less than or equal to copy size (when check is
>> implemented in the allocator, which appear in subsequent patches)
>> - otherwise, object must not span page allocations
>> - if on the stack
>>   - object must not extend before/after the current process task
>>   - object must be contained by the current stack frame (when there is
>> arch/build support for identifying stack frames)
>> - object must not overlap with kernel text
>>
>> Signed-off-by: Kees Cook 
>> Tested-By: Valdis Kletnieks 
>> Tested-by: Michael Ellerman 
>> ---
>>  arch/Kconfig|   7 ++
>>  include/linux/slab.h|  12 +++
>>  include/linux/thread_info.h |  15 +++
>>  mm/Makefile |   4 +
>>  mm/usercopy.c   | 234
>> 
>>  security/Kconfig|  28 ++
>>  6 files changed, 300 insertions(+)
>>  create mode 100644 mm/usercopy.c
>>
>> diff --git a/arch/Kconfig b/arch/Kconfig
>> index 5e2776562035..195ee4cc939a 100644
>> --- a/arch/Kconfig
>> +++ b/arch/Kconfig
>> @@ -433,6 +433,13 @@ config HAVE_ARCH_WITHIN_STACK_FRAMES
>>   and similar) by implementing an inline
>> arch_within_stack_frames(),
>>   which is used by CONFIG_HARDENED_USERCOPY.
>>
>> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
>> +   bool
>> +   help
>> + An architecture should select this if it has a secondary linear
>> + mapping of the kernel text. This is used to verify that kernel
>> + text exposures are not visible under CONFIG_HARDENED_USERCOPY.
>> +
>>  config HAVE_CONTEXT_TRACKING
>> bool
>> help
>> diff --git a/include/linux/slab.h b/include/linux/slab.h
>> index aeb3e6d00a66..96a16a3fb7cb 100644
>> --- a/include/linux/slab.h
>> +++ b/include/linux/slab.h
>> @@ -155,6 +155,18 @@ void kfree(const void *);
>>  void kzfree(const void *);
>>  size_t ksize(const void *);
>>
>> +#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
>> +const char *__check_heap_object(const void *ptr, unsigned long n,
>> +   struct page *page);
>> +#else
>> +static inline const char *__check_heap_object(const void *ptr,
>> + unsigned long n,
>> + struct page *page)
>> +{
>> +   return NULL;
>> +}
>> +#endif
>> +
>>  /*
>>   * Some archs want to perform DMA into kmalloc caches and need a
>> guaranteed
>>   * alignment larger than the alignment of a 64-bit integer.
>> diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
>> index 3d5c80b4391d..f24b99eac969 100644
>> --- a/include/linux/thread_info.h
>> +++ b/include/linux/thread_info.h
>> @@ -155,6 +155,21 @@ static inline int arch_within_stack_frames(const void
>> * const stack,
>>  }
>>  #endif
>>
>> +#ifdef CONFIG_HARDENED_USERCOPY
>> +extern void __check_object_size(const void *ptr, unsigned long n,
>> +   bool to_user);
>> +
>> +static inline void check_object_size(const void *ptr, unsigned long n,
>> +bool to_user)
>> +{
>> +   __check_object_size(ptr, n, to_user);
>> +}
>> +#else
>> +static inline void check_object_size(const void *ptr, unsigned long n,
>> +bool to_user)
>> +{ }
>> +#endif /* CONFIG_HARDENED_USERCOPY */
>> +
>>  #endif /* __KERNEL__ */
>>
>>  #endif /* _LINUX_THREAD_INFO_H */
>> diff --git a/mm/Makefile b/mm/Makefile
>> index 78c6f7dedb83..32d37247c7e5 100644
>> --- a/mm/Makefile
>> +++ b/mm/Makefile
>> @@ -21,6 +21,9 @@ KCOV_INSTRUMENT_memcontrol.o := n
>>  KCOV_INSTRUMENT_mmzone.o := n
>>  KCOV_INSTRUMENT_vmstat.o := n
>>
>> +# Since __builtin_frame_address does work as used, disable the warning.
>> +CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
>> +
>>  mmu-y  := nommu.o
>>  mmu-$(CONFIG_MMU)  := gup.o highmem.o memory.o mincore.o \
>>mlock.o mmap.o mprotect.o mremap.o msync.o
>> rmap.o \
>> @@ -99,3 +102,4 @@ obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
>>  obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
>>  

Re: [PATCH v2 1/2] powerpc/pseries: Implemented indexed-count hotplug memory add

2016-07-19 Thread Tyrel Datwyler
On 07/18/2016 08:07 AM, Sahil Mehta wrote:
> Indexed-count add for memory hotplug guarantees that a contiguous block
> of <count> lmbs beginning at a specified <drc index> will be assigned (NOT
> that <count> lmbs will be added). Because of Qemu's per-DIMM memory
> management, the addition of a contiguous block of memory currently
> requires a series of individual calls. Indexed-count add reduces
> this series into a single call.
> 
> Signed-off-by: Sahil Mehta 
> ---
> v2:   -remove potential memory leak when parsing command
>   -use u32s drc_index and count instead of u32 ic[]
>in dlpar_memory
> 
>  arch/powerpc/include/asm/rtas.h |2
>  arch/powerpc/platforms/pseries/dlpar.c  |   34 +++-
>  arch/powerpc/platforms/pseries/hotplug-memory.c |  100 
> +--
>  3 files changed, 124 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 51400ba..f46b271 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -307,6 +307,7 @@ struct pseries_hp_errorlog {
>   union {
>   __be32  drc_index;
>   __be32  drc_count;
> + __be32  indexed_count[2];
>   chardrc_name[1];
>   } _drc_u;
>  };
> @@ -322,6 +323,7 @@ struct pseries_hp_errorlog {
>  #define PSERIES_HP_ELOG_ID_DRC_NAME  1
>  #define PSERIES_HP_ELOG_ID_DRC_INDEX 2
>  #define PSERIES_HP_ELOG_ID_DRC_COUNT 3
> +#define PSERIES_HP_ELOG_ID_IC	4

For consistency it would be nice if this had the same prefix, namely
PSERIES_HP_ELOG_ID_DRC_XXX, as the previous types. Otherwise, we need to
remember that indexed count is named slightly differently.

-Tyrel

> 
>  struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
> uint16_t section_id);
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 2b93ae8..2a6dc9e 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -345,11 +345,17 @@ static int handle_dlpar_errorlog(struct 
> pseries_hp_errorlog *hp_elog)
>   switch (hp_elog->id_type) {
>   case PSERIES_HP_ELOG_ID_DRC_COUNT:
>   hp_elog->_drc_u.drc_count =
> - be32_to_cpu(hp_elog->_drc_u.drc_count);
> + be32_to_cpu(hp_elog->_drc_u.drc_count);
>   break;
>   case PSERIES_HP_ELOG_ID_DRC_INDEX:
>   hp_elog->_drc_u.drc_index =
> - be32_to_cpu(hp_elog->_drc_u.drc_index);
> + be32_to_cpu(hp_elog->_drc_u.drc_index);
> + break;
> + case PSERIES_HP_ELOG_ID_IC:
> + hp_elog->_drc_u.indexed_count[0] =
> + be32_to_cpu(hp_elog->_drc_u.indexed_count[0]);
> + hp_elog->_drc_u.indexed_count[1] =
> + be32_to_cpu(hp_elog->_drc_u.indexed_count[1]);
>   }
> 
>   switch (hp_elog->resource) {
> @@ -409,7 +415,29 @@ static ssize_t dlpar_store(struct class *class, struct 
> class_attribute *attr,
>   goto dlpar_store_out;
>   }
> 
> - if (!strncmp(arg, "index", 5)) {
> + if (!strncmp(arg, "indexed-count", 13)) {
> + u32 index, count;
> + char *cstr, *istr;
> +
> + hp_elog->id_type = PSERIES_HP_ELOG_ID_IC;
> + arg += strlen("indexed-count ");
> +
> + cstr = kstrdup(arg, GFP_KERNEL);
> + istr = strchr(cstr, ' ');
> + *istr++ = '\0';
> +
> + if (kstrtou32(cstr, 0, &count) || kstrtou32(istr, 0, &index)) {
> + rc = -EINVAL;
> + pr_err("Invalid index or count : \"%s\"\n", buf);
> + kfree(cstr);
> + goto dlpar_store_out;
> + }
> +
> + kfree(cstr);
> +
> + hp_elog->_drc_u.indexed_count[0] = cpu_to_be32(count);
> + hp_elog->_drc_u.indexed_count[1] = cpu_to_be32(index);
> + } else if (!strncmp(arg, "index", 5)) {
>   u32 index;
> 
>   hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 2ce1385..d7942ca 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -701,6 +701,83 @@ static int dlpar_memory_add_by_index(u32 drc_index, 
> struct property *prop)
>   return rc;
>  }
> 
> +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index,
> +   struct property *prop)
> +{
> + struct of_drconf_cell *lmbs;
> + u32 num_lmbs, *p;
> + int i, rc;
> + int lmbs_available = 0, start_index = 0, end_index;
> +
> + pr_info("Attempting to hot-add %u 
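
A hypothetical usage sketch for the interface parsed above, requesting a
hot-add of 4 contiguous LMBs; the "<count> <index>" token order is inferred
from the parser:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/kernel/dlpar", "w");

	if (!f)
		return 1;
	/* 0x80000000 is an arbitrary example DRC index */
	fprintf(f, "memory add indexed-count 4 0x80000000\n");
	fclose(f);
	return 0;
}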

Re: [PATCH v2 2/2] powerpc/pseries: Implemented indexed-count hotplug memory remove

2016-07-19 Thread Nathan Fontenot
On 07/18/2016 10:08 AM, Sahil Mehta wrote:
> Indexed-count remove for memory hotplug guarantees that a contiguous block
> of <count> lmbs beginning at a specified <drc index> will be unassigned (NOT
> that <count> lmbs will be removed). Because of Qemu's per-DIMM memory
> management, the removal of a contiguous block of memory currently
> requires a series of individual calls. Indexed-count remove reduces
> this series into a single call.
> 
> Signed-off-by: Sahil Mehta 

Reviewed-by: Nathan Fontenot 

> ---
> v2:   -use u32s drc_index and count instead of u32 ic[]
>in dlpar_memory
> 
>  arch/powerpc/platforms/pseries/hotplug-memory.c |   84 
> +++
>  1 file changed, 84 insertions(+)
> 
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index d7942ca..244e1a8 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -503,6 +503,86 @@ static int dlpar_memory_remove_by_index(u32 drc_index, 
> struct property *prop)
>   return rc;
>  }
> 
> +static int dlpar_memory_remove_by_ic(u32 lmbs_to_remove, u32 drc_index,
> +  struct property *prop)
> +{
> + struct of_drconf_cell *lmbs;
> + u32 num_lmbs, *p;
> + int i, rc;
> + int lmbs_available = 0, start_index = 0, end_index;
> +
> + pr_info("Attempting to hot-remove %u LMB(s) at %x\n",
> + lmbs_to_remove, drc_index);
> +
> + if (lmbs_to_remove == 0)
> + return -EINVAL;
> +
> + p = prop->value;
> + num_lmbs = *p++;
> + lmbs = (struct of_drconf_cell *)p;
> +
> + /* Navigate to drc_index */
> + while (start_index < num_lmbs) {
> + if (lmbs[start_index].drc_index == drc_index)
> + break;
> +
> + start_index++;
> + }
> +
> + end_index = start_index + lmbs_to_remove;
> +
> + /* Validate that there are enough LMBs to satisfy the request */
> + for (i = start_index; i < end_index; i++) {
> + if (lmbs[i].flags & DRCONF_MEM_RESERVED)
> + break;
> +
> + lmbs_available++;
> + }
> +
> + if (lmbs_available < lmbs_to_remove)
> + return -EINVAL;
> +
> + for (i = 0; i < end_index; i++) {
> + if (!(lmbs[i].flags & DRCONF_MEM_ASSIGNED))
> + continue;
> +
> + rc = dlpar_remove_lmb(&lmbs[i]);
> + if (rc)
> + break;
> +
> + lmbs[i].reserved = 1;
> + }
> +
> + if (rc) {
> + pr_err("Memory indexed-count-remove failed, adding any removed 
> LMBs\n");
> +
> + for (i = start_index; i < end_index; i++) {
> + if (!lmbs[i].reserved)
> + continue;
> +
> + rc = dlpar_add_lmb(&lmbs[i]);
> + if (rc)
> + pr_err("Failed to add LMB, drc index %x\n",
> +be32_to_cpu(lmbs[i].drc_index));
> +
> + lmbs[i].reserved = 0;
> + }
> + rc = -EINVAL;
> + } else {
> + for (i = start_index; i < end_index; i++) {
> + if (!lmbs[i].reserved)
> + continue;
> +
> + pr_info("Memory at %llx (drc index %x) was 
> hot-removed\n",
> + lmbs[i].base_addr, lmbs[i].drc_index);
> +
> + lmbs[i].reserved = 0;
> + }
> + }
> +
> + return rc;
> +}
> +
>  #else
>  static inline int pseries_remove_memblock(unsigned long base,
> unsigned int memblock_size)
> @@ -821,6 +901,10 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog)
>   } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_DRC_INDEX) {
>   drc_index = hp_elog->_drc_u.drc_index;
>   rc = dlpar_memory_remove_by_index(drc_index, prop);
> + } else if (hp_elog->id_type == PSERIES_HP_ELOG_ID_IC) {
> + count = hp_elog->_drc_u.indexed_count[0];
> + drc_index = hp_elog->_drc_u.indexed_count[1];
> + rc = dlpar_memory_remove_by_ic(count, drc_index, prop);
>   } else
>   rc = -EINVAL;
>   break;
> 
> On 07/18/2016 10:04 AM, Sahil Mehta wrote:
>> Indexed-count memory management allows addition and removal of contiguous
>> lmb blocks with a single command. When compared to the series of calls
>> previously required to manage contiguous blocks, indexed-count decreases
>> command frequency and reduces risk of buffer overflow.
>>
>> Changes in v2:
>> --
>> -[PATCH 1/2]:-remove potential memory leak when parsing command
>>  -use u32s drc_index and count instead of u32 ic[]
>>   in dlpar_memory

Re: [PATCH v2 1/2] powerpc/pseries: Implemented indexed-count hotplug memory add

2016-07-19 Thread Nathan Fontenot
On 07/18/2016 10:07 AM, Sahil Mehta wrote:
> Indexed-count add for memory hotplug guarantees that a contiguous block
> of <count> lmbs beginning at a specified <drc index> will be assigned (NOT
> that <count> lmbs will be added). Because of Qemu's per-DIMM memory
> management, the addition of a contiguous block of memory currently
> requires a series of individual calls. Indexed-count add reduces
> this series into a single call.
> 
> Signed-off-by: Sahil Mehta 

Reviewed-by: Nathan Fontenot 

> ---
> v2:   -remove potential memory leak when parsing command
>   -use u32s drc_index and count instead of u32 ic[]
>in dlpar_memory
> 
>  arch/powerpc/include/asm/rtas.h |2
>  arch/powerpc/platforms/pseries/dlpar.c  |   34 +++-
>  arch/powerpc/platforms/pseries/hotplug-memory.c |  100 
> +--
>  3 files changed, 124 insertions(+), 12 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h
> index 51400ba..f46b271 100644
> --- a/arch/powerpc/include/asm/rtas.h
> +++ b/arch/powerpc/include/asm/rtas.h
> @@ -307,6 +307,7 @@ struct pseries_hp_errorlog {
>   union {
>   __be32  drc_index;
>   __be32  drc_count;
> + __be32  indexed_count[2];
>   chardrc_name[1];
>   } _drc_u;
>  };
> @@ -322,6 +323,7 @@ struct pseries_hp_errorlog {
>  #define PSERIES_HP_ELOG_ID_DRC_NAME  1
>  #define PSERIES_HP_ELOG_ID_DRC_INDEX 2
>  #define PSERIES_HP_ELOG_ID_DRC_COUNT 3
> +#define PSERIES_HP_ELOG_ID_IC	4
> 
>  struct pseries_errorlog *get_pseries_errorlog(struct rtas_error_log *log,
> uint16_t section_id);
> diff --git a/arch/powerpc/platforms/pseries/dlpar.c 
> b/arch/powerpc/platforms/pseries/dlpar.c
> index 2b93ae8..2a6dc9e 100644
> --- a/arch/powerpc/platforms/pseries/dlpar.c
> +++ b/arch/powerpc/platforms/pseries/dlpar.c
> @@ -345,11 +345,17 @@ static int handle_dlpar_errorlog(struct 
> pseries_hp_errorlog *hp_elog)
>   switch (hp_elog->id_type) {
>   case PSERIES_HP_ELOG_ID_DRC_COUNT:
>   hp_elog->_drc_u.drc_count =
> - be32_to_cpu(hp_elog->_drc_u.drc_count);
> + be32_to_cpu(hp_elog->_drc_u.drc_count);
>   break;
>   case PSERIES_HP_ELOG_ID_DRC_INDEX:
>   hp_elog->_drc_u.drc_index =
> - be32_to_cpu(hp_elog->_drc_u.drc_index);
> + be32_to_cpu(hp_elog->_drc_u.drc_index);
> + break;
> + case PSERIES_HP_ELOG_ID_IC:
> + hp_elog->_drc_u.indexed_count[0] =
> + be32_to_cpu(hp_elog->_drc_u.indexed_count[0]);
> + hp_elog->_drc_u.indexed_count[1] =
> + be32_to_cpu(hp_elog->_drc_u.indexed_count[1]);
>   }
> 
>   switch (hp_elog->resource) {
> @@ -409,7 +415,29 @@ static ssize_t dlpar_store(struct class *class, struct 
> class_attribute *attr,
>   goto dlpar_store_out;
>   }
> 
> - if (!strncmp(arg, "index", 5)) {
> + if (!strncmp(arg, "indexed-count", 13)) {
> + u32 index, count;
> + char *cstr, *istr;
> +
> + hp_elog->id_type = PSERIES_HP_ELOG_ID_IC;
> + arg += strlen("indexed-count ");
> +
> + cstr = kstrdup(arg, GFP_KERNEL);
> + istr = strchr(cstr, ' ');
> + *istr++ = '\0';
> +
> + if (kstrtou32(cstr, 0, &count) || kstrtou32(istr, 0, &index)) {
> + rc = -EINVAL;
> + pr_err("Invalid index or count : \"%s\"\n", buf);
> + kfree(cstr);
> + goto dlpar_store_out;
> + }
> +
> + kfree(cstr);
> +
> + hp_elog->_drc_u.indexed_count[0] = cpu_to_be32(count);
> + hp_elog->_drc_u.indexed_count[1] = cpu_to_be32(index);
> + } else if (!strncmp(arg, "index", 5)) {
>   u32 index;
> 
>   hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX;
> diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
> b/arch/powerpc/platforms/pseries/hotplug-memory.c
> index 2ce1385..d7942ca 100644
> --- a/arch/powerpc/platforms/pseries/hotplug-memory.c
> +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
> @@ -701,6 +701,83 @@ static int dlpar_memory_add_by_index(u32 drc_index, 
> struct property *prop)
>   return rc;
>  }
> 
> +static int dlpar_memory_add_by_ic(u32 lmbs_to_add, u32 drc_index,
> +   struct property *prop)
> +{
> + struct of_drconf_cell *lmbs;
> + u32 num_lmbs, *p;
> + int i, rc;
> + int lmbs_available = 0, start_index = 0, end_index;
> +
> + pr_info("Attempting to hot-add %u LMB(s) at index %x\n",
> + lmbs_to_add, drc_index);
> +
> + if (lmbs_to_add == 0)
> + return -EINVAL;
> +
> + p = prop->value;

RE: [PATCH V2 7/7] thermal: qoriq: Add thermal management support

2016-07-19 Thread Hongtao Jia
Hi Eduardo,

Any comments on this patch?

Thanks.
-Hongtao.

> -Original Message-
> From: Jia Hongtao [mailto:hongtao@nxp.com]
> Sent: Thursday, June 30, 2016 11:09 AM
> To: edubez...@gmail.com; rui.zh...@intel.com; robh...@kernel.org;
> ga...@codeaurora.org; Scott Wood ;
> shawn...@kernel.org
> Cc: linux...@vger.kernel.org; devicet...@vger.kernel.org; linux-
> ker...@vger.kernel.org; linuxppc-dev@lists.ozlabs.org; linux-arm-
> ker...@lists.infradead.org; Hongtao Jia 
> Subject: [PATCH V2 7/7] thermal: qoriq: Add thermal management support
> 
> This driver add thermal management support by enabling TMU (Thermal
> Monitoring Unit) on QorIQ platform.
> 
> It's based on thermal of framework:
> - Trip points defined in device tree.
> - Cpufreq as cooling device registered in qoriq cpufreq driver.
> 
> Signed-off-by: Jia Hongtao 
> ---
> Changes of V2:
> * Add HAS_IOMEM dependency to fix build error on UM
> 
>  drivers/thermal/Kconfig |  10 ++
>  drivers/thermal/Makefile|   1 +
>  drivers/thermal/qoriq_thermal.c | 328
> 
>  3 files changed, 339 insertions(+)
>  create mode 100644 drivers/thermal/qoriq_thermal.c
> 
> diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index
> 2d702ca..56ef30d 100644
> --- a/drivers/thermal/Kconfig
> +++ b/drivers/thermal/Kconfig
> @@ -195,6 +195,16 @@ config IMX_THERMAL
> cpufreq is used as the cooling device to throttle CPUs when the
> passive trip is crossed.
> 
> +config QORIQ_THERMAL
> + tristate "QorIQ Thermal Monitoring Unit"
> + depends on THERMAL_OF
> + depends on HAS_IOMEM
> + help
> +   Support for Thermal Monitoring Unit (TMU) found on QorIQ platforms.
> +   It supports one critical trip point and one passive trip point. The
> +   cpufreq is used as the cooling device to throttle CPUs when the
> +   passive trip is crossed.
> +
>  config SPEAR_THERMAL
>   tristate "SPEAr thermal sensor driver"
>   depends on PLAT_SPEAR || COMPILE_TEST
> diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index
> 10b07c1..6662232 100644
> --- a/drivers/thermal/Makefile
> +++ b/drivers/thermal/Makefile
> @@ -37,6 +37,7 @@ obj-$(CONFIG_DB8500_THERMAL)+= db8500_thermal.o
>  obj-$(CONFIG_ARMADA_THERMAL) += armada_thermal.o
>  obj-$(CONFIG_TANGO_THERMAL)  += tango_thermal.o
>  obj-$(CONFIG_IMX_THERMAL)+= imx_thermal.o
> +obj-$(CONFIG_QORIQ_THERMAL)  += qoriq_thermal.o
>  obj-$(CONFIG_DB8500_CPUFREQ_COOLING) += db8500_cpufreq_cooling.o
>  obj-$(CONFIG_INTEL_POWERCLAMP)   += intel_powerclamp.o
>  obj-$(CONFIG_X86_PKG_TEMP_THERMAL)   += x86_pkg_temp_thermal.o
> diff --git a/drivers/thermal/qoriq_thermal.c 
> b/drivers/thermal/qoriq_thermal.c new
> file mode 100644 index 000..644ba52
> --- /dev/null
> +++ b/drivers/thermal/qoriq_thermal.c
> @@ -0,0 +1,328 @@
> +/*
> + * Copyright 2016 Freescale Semiconductor, Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify
> +it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but
> +WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
> +or
> + * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
> +License for
> + * more details.
> + *
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#include "thermal_core.h"
> +
> +#define SITES_MAX	16
> +
> +/*
> + * QorIQ TMU Registers
> + */
> +struct qoriq_tmu_site_regs {
> + u32 tritsr; /* Immediate Temperature Site Register */
> + u32 tratsr; /* Average Temperature Site Register */
> + u8 res0[0x8];
> +};
> +
> +struct qoriq_tmu_regs {
> + u32 tmr;/* Mode Register */
> +#define TMR_DISABLE  0x0
> +#define TMR_ME   0x80000000
> +#define TMR_ALPF 0x0c000000
> + u32 tsr;/* Status Register */
> + u32 tmtmir; /* Temperature measurement interval Register */
> +#define TMTMIR_DEFAULT   0x0000000f
> + u8 res0[0x14];
> + u32 tier;   /* Interrupt Enable Register */
> +#define TIER_DISABLE 0x0
> + u32 tidr;   /* Interrupt Detect Register */
> + u32 tiscr;  /* Interrupt Site Capture Register */
> + u32 ticscr; /* Interrupt Critical Site Capture Register */
> + u8 res1[0x10];
> + u32 tmhtcrh;/* High Temperature Capture Register */
> + u32 tmhtcrl;/* Low Temperature Capture Register */
> + u8 res2[0x8];
> + u32 tmhtitr;/* High Temperature Immediate Threshold */
> + u32 tmhtatr;/* High Temperature Average Threshold */
> + u32 tmhtactr;   /* High Temperature Average Crit Threshold */
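
A rough sketch, not the submitted driver, of how one site's immediate
temperature could be read through the register block above; the per-site
indexing and the 8-bit Celsius field in TRITSR are assumptions from the
layout:

struct qoriq_tmu_data {				/* hypothetical driver state */
	int sensor_id;
	struct qoriq_tmu_site_regs __iomem *site;
};

static int qoriq_tmu_get_temp(void *p, int *temp)
{
	struct qoriq_tmu_data *data = p;
	u32 val;

	val = ioread32be(&data->site[data->sensor_id].tritsr);
	*temp = (val & 0xff) * 1000;		/* degrees C to millidegrees */
	return 0;
}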

Re: [Patch v2] rpaphp: fix slot registration for multiple slots under a PHB

2016-07-19 Thread Nathan Fontenot
On 07/11/2016 05:16 PM, Tyrel Datwyler wrote:
> PowerVM seems to only ever provide a single hotplug slot per PHB.
> The underlying slot hotplug registration code assumed multiple slots,
> but the actual implementation is broken for multiple slots. This went
> unnoticed for years due to the nature of PowerVM as mentioned
> previously. Under qemu/kvm the hotplug slot model aligns more with
> x86, where multiple slots are presented under a single PHB. As seen
> in the following, each additional slot after the first fails to
> register due to each slot always being compared against the first
> child node of the PHB in the device tree.
> 
> [6.492291] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [6.492569] rpaphp: Slot [Slot 0] registered
> [6.492577] rpaphp: pci_hp_register failed with error -16
> [6.493082] rpaphp: pci_hp_register failed with error -16
> [6.493138] rpaphp: pci_hp_register failed with error -16
> [6.493161] rpaphp: pci_hp_register failed with error -16
> 
> The registration logic is fixed so that each slot is compared
> against the existing child devices of the PHB in the device tree to
> determine present slots vs empty slots.
> 
> [   38.481750] rpaphp: RPA HOT Plug PCI Controller Driver version: 0.1
> [   38.482004] rpaphp: Slot [C0] registered
> [   38.482127] rpaphp: Slot [C1] registered
> [   38.482241] rpaphp: Slot [C2] registered
> [   38.482356] rpaphp: Slot [C3] registered
> [   38.482495] rpaphp: Slot [C4] registered
> 
> Signed-off-by: Tyrel Datwyler 

Reviewed-by: Nathan Fontenot 

> ---
> 
> Changes in v2: corrected ibm,my-drc-index property name
> 
> ---
>  drivers/pci/hotplug/rpaphp_slot.c | 17 -
>  1 file changed, 12 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/pci/hotplug/rpaphp_slot.c 
> b/drivers/pci/hotplug/rpaphp_slot.c
> index 6937c72..388c4d8 100644
> --- a/drivers/pci/hotplug/rpaphp_slot.c
> +++ b/drivers/pci/hotplug/rpaphp_slot.c
> @@ -117,8 +117,10 @@ EXPORT_SYMBOL_GPL(rpaphp_deregister_slot);
>  int rpaphp_register_slot(struct slot *slot)
>  {
>   struct hotplug_slot *php_slot = slot->hotplug_slot;
> + struct device_node *child;
> + u32 my_index;
>   int retval;
> - int slotno;
> + int slotno = -1;
> 
>   dbg("%s registering slot:path[%s] index[%x], name[%s] pdomain[%x] 
> type[%d]\n",
>   __func__, slot->dn->full_name, slot->index, slot->name,
> @@ -130,10 +132,15 @@ int rpaphp_register_slot(struct slot *slot)
>   return -EAGAIN;
>   }
> 
> - if (slot->dn->child)
> - slotno = PCI_SLOT(PCI_DN(slot->dn->child)->devfn);
> - else
> - slotno = -1;
> + for_each_child_of_node(slot->dn, child) {
> + retval = of_property_read_u32(child, "ibm,my-drc-index", 
> &my_index);
> + if (my_index == slot->index) {
> + slotno = PCI_SLOT(PCI_DN(child)->devfn);
> + of_node_put(child);
> + break;
> + }
> + }
> +
>   retval = pci_hp_register(php_slot, slot->bus, slotno, slot->name);
>   if (retval) {
>   err("pci_hp_register failed with error %d\n", retval);
> 
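
A slightly more defensive form of the new loop, as a sketch, would skip
children whose "ibm,my-drc-index" read fails rather than compare a possibly
uninitialized value:

	for_each_child_of_node(slot->dn, child) {
		if (of_property_read_u32(child, "ibm,my-drc-index", &my_index))
			continue;
		if (my_index == slot->index) {
			slotno = PCI_SLOT(PCI_DN(child)->devfn);
			of_node_put(child);
			break;
		}
	}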

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH] crypto: vmx - Ignore generated files

2016-07-19 Thread Paulo Flabiano Smorigo
Ignore assembly files generated by the perl script.

Signed-off-by: Paulo Flabiano Smorigo 
---
 drivers/crypto/vmx/.gitignore | 2 ++
 1 file changed, 2 insertions(+)
 create mode 100644 drivers/crypto/vmx/.gitignore

diff --git a/drivers/crypto/vmx/.gitignore b/drivers/crypto/vmx/.gitignore
new file mode 100644
index 000..af4a7ce
--- /dev/null
+++ b/drivers/crypto/vmx/.gitignore
@@ -0,0 +1,2 @@
+aesp8-ppc.S
+ghashp8-ppc.S
-- 
2.5.5

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC 3/3] kexec: extend kexec_file_load system call

2016-07-19 Thread Vivek Goyal
On Tue, Jul 19, 2016 at 01:47:28PM +0100, Mark Rutland wrote:
> On Tue, Jul 19, 2016 at 08:24:06AM -0400, Vivek Goyal wrote:
> > On Tue, Jul 19, 2016 at 11:52:00AM +0100, Mark Rutland wrote:
> > > Regardless, this extended syscall changes some underlying assumptions
> > > made with the development of kexec_file_load, and I think treating this
> > > as an extension is not a great idea. From a user's perspective there is
> > > little difference between passing an additional flag or using a
> > > different syscall number, so I don't think that we gain much by altering
> > > the existing prototype relative to allocating a new syscall number.
> > 
> > If we are providing/opening up additional flags, I can't think what it
> > will break. The same flag was invalid in an old kernel but a new kernel
> > supports it and will accept it. So it sounds reasonable to me to add new
> > flags.
> > 
> > If existing users are not broken, then I think it might be a good idea
> > to extend the existing syscall. Otherwise userspace will have to be
> > modified to understand a 3rd syscall also, and an additional option will
> > show up which asks users to specify which syscall to use. So extending
> > the existing syscall might keep it a little simpler for users.
> 
> I don't follow.
> 
> To use the new feature, you have to modify userspace anyway, as you
> require userspace to pass information which it did not previously pass
> (in the new arguments added to the syscall).
> 
> The presence of a new syscall does not imply the absence of the old
> syscall, so you can always use that by default unless the user asks for
> something only the new syscall provides. Regardless of the
> syscall/flags difference, you still have to detect whether the new
> functionality is present somehow.
> 

Hmm, so the current idea is that we have two syscalls which are *ideally*
supposed to work for all arches. The difference between the two is that the
first does not support kernel signature verification while the second does.

By default the old syscall is used, and the user can force use of the new
syscall with the --kexec-file-load option.

If a user DTB is present, I was hoping that it will continue to work the
same way. Both syscalls can be used and can handle a DTB. If we introduce
a 3rd syscall, that means only the first and 3rd syscalls can handle a DTB,
and we need to introduce one more option which tells whether to use
kexec_load() or the 3rd new syscall. That's what I am trying to avoid.

Vivek

> > BTW, does kexec_load() needs to be modified too to handle DT?
> 
> No, at least for arm64. In the kexec_load case userspace provides the
> DTB as a raw segment, and the user-provided purgatory sets up registers
> to pass that to the new kernel.
> 
> Thanks,
> Mark.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC 3/3] kexec: extend kexec_file_load system call

2016-07-19 Thread Mark Rutland
On Tue, Jul 19, 2016 at 08:24:06AM -0400, Vivek Goyal wrote:
> On Tue, Jul 19, 2016 at 11:52:00AM +0100, Mark Rutland wrote:
> > Regardless, this extended syscall changes some underlying assumptions
> > made with the development of kexec_file_load, and I think treating this
> > as an extension is not a great idea. From a user's perspective there is
> > little difference between passing an additional flag or using a
> > different syscall number, so I don't think that we gain much by altering
> > the existing prototype relative to allocating a new syscall number.
> 
> If we are providing/opening up additional flags, I can't think what it
> will break. The same flag was invalid in an old kernel but a new kernel
> supports it and will accept it. So it sounds reasonable to me to add new
> flags.
> 
> If existing users are not broken, then I think it might be a good idea
> to extend the existing syscall. Otherwise userspace will have to be
> modified to understand a 3rd syscall also, and an additional option will
> show up which asks users to specify which syscall to use. So extending
> the existing syscall might keep it a little simpler for users.

I don't follow.

To use the new feature, you have to modify userspace anyway, as you
require userspace to pass information which it did not previously pass
(in the new arguments added to the syscall).

The presence of a new syscall does not imply the absence of the old
syscall, so you can always use that by default unless the user asks for
something only the new syscall provides. Regardless of the
syscall/flags difference, you still have to detect whether the new
functionality is present somehow.

> BTW, does kexec_load() needs to be modified too to handle DT?

No, at least for arm64. In the kexec_load case userspace provides the
DTB as a raw segment, and the user-provided purgatory sets up registers
to pass that to the new kernel.

Thanks,
Mark.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC 3/3] kexec: extend kexec_file_load system call

2016-07-19 Thread Vivek Goyal
On Tue, Jul 19, 2016 at 11:52:00AM +0100, Mark Rutland wrote:
> On Tue, Jul 19, 2016 at 08:55:56AM +0800, Dave Young wrote:
> > On 07/18/16 at 11:07am, Mark Rutland wrote:
> > > On Mon, Jul 18, 2016 at 10:30:24AM +0800, Dave Young wrote:
> > > > I do not think it is worth to add another syscall for extra fds.
> > > > We have open(2) as an example for different numbers of arguments
> > > > already.
> > > 
> > > Did we change the syscall interface for that?
> > > 
> > > I was under the impression that there was always one underlying syscall,
> > > and the C library did the right thing to pass the expected information
> > > to the underlying syscall.
> > 
> > I'm not sure kexec_load and kexec_file_load were included in glibc; we use
> > syscall directly in kexec-tools.
> > 
> > kexec_load man pages says there are no wrappers for both kexec_load and
> > kexec_file_load in glibc.
> 
> For the above, I was talking about how open() was handled.
> 
> If there are no userspace wrappers, then the two cases aren't comparable
> in the first place...
> 
> > > That's rather different to changing the underlying syscall.
> > > 
> > > Regardless of how this is wrapped in userspace, I do not think modifying
> > > the existing prototype is a good idea, and I think this kind of
> > > extension needs to be a new syscall.
> > 
> > Hmm, as I replied to Vivek, there is one case about the flags: previously
> > the new flag would be regarded as invalid, but now that we extend it, it
> > will be valid; this may be the only potential bad case.
> 
> It's true that adding support for new flags will change the behaviour of
> what used to be error cases. We generally expect real users to not be
> making pointless calls for which they rely on an error being returned in
> all cases.
> 
> Regardless, this extended syscall changes some underlying assumptions
> made with the development of kexec_file_load, and I think treating this
> as an extension is not a great idea. From a user's perspective there is
> little difference between passing an additional flag or using a
> different syscall number, so I don't think that we gain much by altering
> the existing prototype relative to allocating a new syscall number.

If we are providing/opening up additional flags, I can't think what it
will break. The same flag was invalid in an old kernel but a new kernel
supports it and will accept it. So it sounds reasonable to me to add new
flags.

If existing users are not broken, then I think it might be a good idea
to extend the existing syscall. Otherwise userspace will have to be
modified to understand a 3rd syscall also, and an additional option will
show up which asks users to specify which syscall to use. So extending
the existing syscall might keep it a little simpler for users.

This is only if the conclusion in the end is that the DT needs to be passed
in from user space.

BTW, does kexec_load() needs to be modified too to handle DT?

Vivek
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH -next] wan/fsl_ucc_hdlc: remove .owner field for driver

2016-07-19 Thread Wei Yongjun
From: Wei Yongjun 

Remove .owner field if calls are used which set it automatically.

Generated by: scripts/coccinelle/api/platform_no_drv_owner.cocci

Signed-off-by: Wei Yongjun 
---
 drivers/net/wan/fsl_ucc_hdlc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index b3861bf..10ca497 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -1168,7 +1168,6 @@ static struct platform_driver ucc_hdlc_driver = {
.probe  = ucc_hdlc_probe,
.remove = ucc_hdlc_remove,
.driver = {
-   .owner  = THIS_MODULE,
.name   = DRV_NAME,
.pm = HDLC_PM_OPS,
.of_match_table = fsl_ucc_hdlc_of_match,




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH -next] wan/fsl_ucc_hdlc: use module_platform_driver to simplify the code

2016-07-19 Thread Wei Yongjun
From: Wei Yongjun 

module_platform_driver() makes the code simpler by eliminating
boilerplate code.

Signed-off-by: Wei Yongjun 
---
 drivers/net/wan/fsl_ucc_hdlc.c | 13 +
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index b3861bf..3f6b218 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -1175,15 +1175,4 @@ static struct platform_driver ucc_hdlc_driver = {
},
 };
 
-static int __init ucc_hdlc_init(void)
-{
> -   return platform_driver_register(&ucc_hdlc_driver);
-}
-
-static void __exit ucc_hdlc_exit(void)
-{
> -   platform_driver_unregister(&ucc_hdlc_driver);
-}
-
-module_init(ucc_hdlc_init);
-module_exit(ucc_hdlc_exit);
+module_platform_driver(ucc_hdlc_driver);
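
For reference, module_platform_driver() expands to roughly the boilerplate
being deleted (per include/linux/platform_device.h):

static int __init ucc_hdlc_driver_init(void)
{
	return platform_driver_register(&ucc_hdlc_driver);
}
module_init(ucc_hdlc_driver_init);

static void __exit ucc_hdlc_driver_exit(void)
{
	platform_driver_unregister(&ucc_hdlc_driver);
}
module_exit(ucc_hdlc_driver_exit);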




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC 3/3] kexec: extend kexec_file_load system call

2016-07-19 Thread Mark Rutland
On Tue, Jul 19, 2016 at 08:55:56AM +0800, Dave Young wrote:
> On 07/18/16 at 11:07am, Mark Rutland wrote:
> > On Mon, Jul 18, 2016 at 10:30:24AM +0800, Dave Young wrote:
> > > I do not think it is worth to add another syscall for extra fds.
> > > We have open(2) as an example for different numbers of arguments
> > > already.
> > 
> > Did we change the syscall interface for that?
> > 
> > I was under the impression that there was always one underlying syscall,
> > and the C library did the right thing to pass the expected information
> > to the underlying syscall.
> 
> I'm not sure kexec_load and kexec_file_load were included in glibc; we use
> syscall directly in kexec-tools.
> 
> kexec_load man pages says there are no wrappers for both kexec_load and
> kexec_file_load in glibc.

For the above, I was talking about how open() was handled.

If there are no userspace wrappers, then the two cases aren't comparable
in the first place...

> > That's rather different to changing the underlying syscall.
> > 
> > Regardless of how this is wrapped in userspace, I do not think modifying
> > the existing prototype is a good idea, and I think this kind of
> > extension needs to be a new syscall.
> 
> Hmm, as I replied to Vivek, there is one case about the flags: previously
> the new flag would be regarded as invalid, but now that we extend it, it
> will be valid; this may be the only potential bad case.

It's true that adding support for new flags will change the behaviour of
what used to be error cases. We generally expect real users to not be
making pointless calls for which they rely on an error being returned in
all cases.

Regardless, this extended syscall changes some underlying assumptions
made with the development of kexec_file_load, and I think treating this
as an extension is not a great idea. From a user's perspective there is
little difference between passing an additional flag or using a
different syscall number, so I don't think that we gain much by altering
the existing prototype relative to allocating a new syscall number.

Thus, I think that if this is necessary it should be treated as a new
syscall.

Thanks,
Mark.
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Suspected regression?

2016-07-19 Thread Alessio Igor Bogani
Hi all,

I have two boards, an MVME5100 (MPC7410 cpu) and an MVME7100 (MPC8641D
cpu), for which I use the same cross-compiler (ppc7400).

I tested these against kernel HEAD and found that they don't boot
anymore (PID 1 crash).

Bisecting points to this first offending commit:
7aef4136566b0539a1a98391181e188905e33401

Removing it from HEAD makes the boards boot properly again.

A third system, based on the P2010, isn't affected at all.

Is this a regression, or have I done something wrong?

Thanks!

Ciao,
Alessio
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/2] Automatically load the vmx_crypto module if supported

2016-07-19 Thread Herbert Xu
On Tue, Jul 19, 2016 at 07:13:24PM +1000, Michael Ellerman wrote:
>
> I'll assume patch 2 has your ack :)

Sure,

Acked-by: Herbert Xu 

Thanks,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [1/1] KVM: PPC: Introduce KVM_CAP_PPC_HTM

2016-07-19 Thread Michael Ellerman
Sam Bobroff  writes:

> On Fri, Jul 08, 2016 at 08:49:49PM +1000, Michael Ellerman wrote:
>> On Wed, 2016-06-07 at 06:05:54 UTC, Sam bobroff wrote:
>> > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
>> > index 02416fe..06d79bc 100644
>> > --- a/arch/powerpc/kvm/powerpc.c
>> > +++ b/arch/powerpc/kvm/powerpc.c
>> > @@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, 
>> > long ext)
>> >r = 1;
>> >break;
>> >  #endif
>> > +  case KVM_CAP_PPC_HTM:
>> > +  r = cpu_has_feature(CPU_FTR_TM)
>> > +  && is_kvmppc_hv_enabled(kvm);
>> 
>> I think it should be using CPU_FTR_TM_COMP.
>
> Oh, why is that? I'm happy to respin the patch I'm just curious.
>
> (I did it that way because that seems to be the way the other flags are used,
> e.g. CPU_FTR_ALTIVEC).
>
> If I read the code correctly, using CPU_FTR_TM_COMP will work fine: it should
> cause the cpu_has_feature() test to always return false if CPU_FTR_TM_COMP is
> 0.

CPU_FTR_TM says the CPU supports TM.

CPU_FTR_TM_COMP says the CPU supports TM *and* the kernel is built with
TM support.

The distinction exists because currently the assembly patching macros
don't deal correctly with a feature bit that is defined to 0. (And
possibly other reasons I don't remember)

We should fix that, but until we have, anything that is advertising
support to userspace should be using the COMP bits, when they exist.
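
In diff form, the suggested change to the hunk quoted above (sketch):

	case KVM_CAP_PPC_HTM:
-		r = cpu_has_feature(CPU_FTR_TM)
+		r = cpu_has_feature(CPU_FTR_TM_COMP)
 			&& is_kvmppc_hv_enabled(kvm);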

cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v3 02/11] mm: Hardened usercopy

2016-07-19 Thread Christian Borntraeger
On 07/15/2016 11:44 PM, Kees Cook wrote:
> +config HAVE_ARCH_LINEAR_KERNEL_MAPPING
> + bool
> + help
> +   An architecture should select this if it has a secondary linear
> +   mapping of the kernel text. This is used to verify that kernel
> +   text exposures are not visible under CONFIG_HARDENED_USERCOPY.

I have trouble parsing this. (What does secondary linear mapping mean?)
So let me give an example below.

> +
[...]
> +/* Is this address range in the kernel text area? */
> +static inline const char *check_kernel_text_object(const void *ptr,
> +unsigned long n)
> +{
> + unsigned long textlow = (unsigned long)_stext;
> + unsigned long texthigh = (unsigned long)_etext;
> +
> + if (overlaps(ptr, n, textlow, texthigh))
> + return "<kernel text>";
> +
> +#ifdef HAVE_ARCH_LINEAR_KERNEL_MAPPING
> + /* Check against linear mapping as well. */
> + if (overlaps(ptr, n, (unsigned long)__va(__pa(textlow)),
> +  (unsigned long)__va(__pa(texthigh
> + return "<kernel text>";
> +#endif
> +
> + return NULL;
> +}

s390 has an address space for user (primary address space from 0..4TB/8PB)
and a separate address space (home space from 0..4TB/8PB) for the kernel.
In this home space the kernel mapping is virtual, containing the physical
memory as well as vmalloc memory (creating aliases into the physical one).
The kernel text is mapped from _stext to _etext in this mapping.
So I assume this would qualify for HAVE_ARCH_LINEAR_KERNEL_MAPPING?

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/2] Automatically load the vmx_crypto module if supported

2016-07-19 Thread Michael Ellerman
Herbert Xu  writes:

> On Tue, Jul 19, 2016 at 11:01:55AM +1000, Michael Ellerman wrote:
>>
>> Can you please ask for an ack before merging arch patches?
>> 
>> That's a 70 line powerpc patch and a 6 line crypto patch. It has no
>> reviews and no acks. I would have preferred it if we could take it via
>> the powerpc tree.
>
> Sorry, I'll delete them from the crypto tree.

Thanks.

I'll assume patch 2 has your ack :)

cheers
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH V4 4/5] kvm/stats: Add provisioning for ulong vm stats and u64 vcpu stats

2016-07-19 Thread Christian Borntraeger
On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote:
> vms and vcpus have statistics associated with them which can be viewed
> within the debugfs. Currently it is assumed within the vcpu_stat_get() and
> vm_stat_get() functions that all of these statistics are represented as
> u32s, however the next patch adds some u64 vcpu statistics.
> 
> Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly.
> Since vcpu statistics are per vcpu, they will only be updated by a single
> vcpu at a time so this shouldn't present a problem on 32-bit machines
> which can't atomically increment 64-bit numbers. However vm statistics
> could potentially be updated by multiple vcpus from that vm at a time.
> To avoid the overhead of atomics, make all vm statistics ulong such that
> they are 64-bit on 64-bit systems where they can be atomically incremented
> and are 32-bit on 32-bit systems which may not be able to atomically
> increment 64-bit numbers. Modify vm_stat_get() to expect ulongs.
> 
> Signed-off-by: Suraj Jitindar Singh 

Acked-by: Christian Borntraeger 


Re: [PATCH V4 5/5] powerpc/kvm/stats: Implement existing and add new halt polling vcpu stats

2016-07-19 Thread Christian Borntraeger
On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote:

> Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and
> halt_wait_ns to be used to accumulate the total time spent polling
> successfully, polling unsuccessfully and waiting respectively, and
> halt_successful_wait to accumulate the number of times the vcpu waits.
> Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are
> expressed in nanoseconds it is necessary to represent these as 64-bit
> quantities, otherwise they would overflow after only about 4 seconds.
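
(For context: a u32 counting nanoseconds wraps at 2^32 ns = 4294967296 ns,
i.e. after about 4.29 seconds, hence the 64-bit requirement.)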


Paolo, would these new kvm_stats also be useful for the base implementation?


Re: [PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module

2016-07-19 Thread Christian Borntraeger
On 07/19/2016 10:12 AM, Suraj Jitindar Singh wrote:
> This patch introduces new halt polling functionality into the kvm_hv kernel
> module. When a vcore is idle it will poll for some period of time before
> scheduling itself out.

Some wording on why you cannot use the common code might be useful. 
> 
> When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
> idle) we schedule ourselves out to allow something else to run. In the
> event that we need to wake up very quickly (for example an interrupt
> arrives), we are required to wait until we get scheduled again.
> 
> Implement halt polling so that when a vcore is idle, and before scheduling
> ourselves, we poll for vcpus in the runnable_threads list which have
> pending exceptions or which leave the ceded state. If we poll successfully
> then we can get back into the guest very quickly without ever scheduling
> ourselves, otherwise we schedule ourselves out as before.
> 
> Testing of this patch with a TCP round robin test between two guests with
> virtio network interfaces has found a decrease in round trip time of ~15us
> on average. A performance gain is only seen when going out of and
> back into the guest often and quickly; otherwise there is no net benefit
> from the polling. The polling interval is adjusted such that when we are
> often scheduled out for long periods of time it is reduced, and when we
> often poll successfully it is increased. The rate at which the polling
> interval increases or decreases, and the maximum polling interval, can
> be set through module parameters.
> 
> Based on the implementation in the generic kvm module by Wanpeng Li and
> Paolo Bonzini, and on direction from Paul Mackerras.
> 
> Signed-off-by: Suraj Jitindar Singh 
> ---
>  arch/powerpc/include/asm/kvm_book3s.h |   1 +
>  arch/powerpc/include/asm/kvm_host.h   |   1 +
>  arch/powerpc/kvm/book3s_hv.c  | 116 ++
>  arch/powerpc/kvm/trace_hv.h   |  22 +++
>  4 files changed, 126 insertions(+), 14 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
> index 151f817..c261f52 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -102,6 +102,7 @@ struct kvmppc_vcore {
>   ulong pcr;
>   ulong dpdes;/* doorbell state (POWER8) */
>   ulong conferring_threads;
> + unsigned int halt_poll_ns;
>  };
> 
>  struct kvmppc_vcpu_book3s {
> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
> index 02d06e9..610f393 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -294,6 +294,7 @@ struct kvm_arch {
>  #define VCORE_SLEEPING   3
>  #define VCORE_RUNNING4
>  #define VCORE_EXITING5
> +#define VCORE_POLLING6
> 
>  /*
>   * Struct used to manage memory for a virtual processor area
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 3bcf9e6..a9de1d4 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
>  MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
>  #endif
> 
> +/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
> +static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
> +module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
> +MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
> +
> +/* Factor by which the vcore halt poll interval is grown, default is to double
> + */
> +static unsigned int halt_poll_ns_grow = 2;
> +module_param(halt_poll_ns_grow, int, S_IRUGO);
> +MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
> +
> +/* Factor by which the vcore halt poll interval is shrunk, default is to reset
> + */
> +static unsigned int halt_poll_ns_shrink;
> +module_param(halt_poll_ns_shrink, int, S_IRUGO);
> +MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
> +
>  static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
>  static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
> 
> @@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
>   finish_wait(&vcpu->arch.cpu_run, &wait);
>  }
> 
> +static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
> +{
> + /* 10us base */
> + if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
> + vc->halt_poll_ns = 1;
> + else
> + vc->halt_poll_ns *= halt_poll_ns_grow;
> +
> + if (vc->halt_poll_ns > halt_poll_max_ns)
> + vc->halt_poll_ns = halt_poll_max_ns;
> +}
> +
> +static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
> +{
> + if (halt_poll_ns_shrink == 0)
> + vc->halt_poll_ns = 0;
> + else
> + vc->halt_poll_ns /= halt_poll_ns_shrink;

[PATCH V4 5/5] powerpc/kvm/stats: Implement existing and add new halt polling vcpu stats

2016-07-19 Thread Suraj Jitindar Singh
vcpu stats are used to collect information about a vcpu which can be viewed
in the debugfs. For example halt_attempted_poll and halt_successful_poll
are used to keep track of the number of times the vcpu attempts to and
successfully polls. These stats are currently not used on powerpc.

Implement incrementation of the halt_attempted_poll and
halt_successful_poll vcpu stats for powerpc. Since these stats are summed
over all the vcpus for all running guests, it doesn't matter which vcpu
they are attributed to; thus we choose the current runner vcpu of the
vcore.

Also add new vcpu stats: halt_poll_success_ns, halt_poll_fail_ns and
halt_wait_ns to be used to accumulate the total time spent polling
successfully, polling unsuccessfully and waiting respectively, and
halt_successful_wait to accumulate the number of times the vcpu waits.
Given that halt_poll_success_ns, halt_poll_fail_ns and halt_wait_ns are
expressed in nanoseconds it is necessary to represent these as 64-bit
quantities, otherwise they would overflow after only about 4 seconds.

Given that the total time spent either polling or waiting will be known and
the number of times that each was done, it will be possible to determine
the average poll and wait times. This will give the ability to tune the kvm
module parameters based on the calculated average wait and poll times.
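
(Illustratively, with the counters above the averages fall out as
avg_successful_poll_ns = halt_poll_success_ns / halt_successful_poll and
avg_wait_ns = halt_wait_ns / halt_successful_wait.)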

Signed-off-by: Suraj Jitindar Singh 

---
Change Log:

V3 -> V4:
- Instead of accounting just wait and poll time, separate these
  into successful_poll_time, failed_poll_time and wait_time.
---
 arch/powerpc/include/asm/kvm_host.h |  4 
 arch/powerpc/kvm/book3s.c   |  4 
 arch/powerpc/kvm/book3s_hv.c| 36 +++-
 3 files changed, 39 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index f6304c5..f15ffc0 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -114,8 +114,12 @@ struct kvm_vcpu_stat {
u64 emulated_inst_exits;
u64 dec_exits;
u64 ext_intr_exits;
+   u64 halt_poll_success_ns;
+   u64 halt_poll_fail_ns;
+   u64 halt_wait_ns;
u64 halt_successful_poll;
u64 halt_attempted_poll;
+   u64 halt_successful_wait;
u64 halt_poll_invalid;
u64 halt_wakeup;
u64 dbell_exits;
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 47018fc..71eb8f3 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -52,8 +52,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "dec", VCPU_STAT(dec_exits) },
{ "ext_intr",VCPU_STAT(ext_intr_exits) },
{ "queue_intr",  VCPU_STAT(queue_intr) },
+   { "halt_poll_success_ns",   VCPU_STAT(halt_poll_success_ns) },
+   { "halt_poll_fail_ns",  VCPU_STAT(halt_poll_fail_ns) },
+   { "halt_wait_ns",   VCPU_STAT(halt_wait_ns) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll), },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), },
+   { "halt_successful_wait",   VCPU_STAT(halt_successful_wait) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
{ "pf_storage",  VCPU_STAT(pf_storage) },
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a9de1d4..81072f2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2679,15 +2679,16 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
  */
 static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
 {
+   ktime_t cur, start_poll, start_wait;
int do_sleep = 1;
-   ktime_t cur, start;
u64 block_ns;
DECLARE_SWAITQUEUE(wait);
 
/* Poll for pending exceptions and ceded state */
-   cur = start = ktime_get();
+   cur = start_poll = ktime_get();
if (vc->halt_poll_ns) {
-   ktime_t stop = ktime_add_ns(start, vc->halt_poll_ns);
+   ktime_t stop = ktime_add_ns(start_poll, vc->halt_poll_ns);
+   ++vc->runner->stat.halt_attempted_poll;
 
vc->vcore_state = VCORE_POLLING;
spin_unlock(&vc->lock);
@@ -2703,8 +2704,10 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
spin_lock(&vc->lock);
vc->vcore_state = VCORE_INACTIVE;
 
-   if (!do_sleep)
+   if (!do_sleep) {
+   ++vc->runner->stat.halt_successful_poll;
goto out;
+   }
}
 
prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE);
@@ -2712,9 +2715,14 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
if (kvmppc_vcore_check_block(vc)) {
finish_swait(&vc->wq, &wait);
do_sleep = 0;
+   /* If we polled, count this as a successful poll */

[PATCH V4 4/5] kvm/stats: Add provisioning for ulong vm stats and u64 vcpu stats

2016-07-19 Thread Suraj Jitindar Singh
VMs and vcpus have statistics associated with them which can be viewed
within the debugfs. Currently it is assumed within the vcpu_stat_get() and
vm_stat_get() functions that all of these statistics are represented as
u32s, however the next patch adds some u64 vcpu statistics.

Change all vcpu statistics to u64 and modify vcpu_stat_get() accordingly.
Since vcpu statistics are per vcpu, they will only be updated by a single
vcpu at a time so this shouldn't present a problem on 32-bit machines
which can't atomically increment 64-bit numbers. However vm statistics
could potentially be updated by multiple vcpus from that vm at a time.
To avoid the overhead of atomics, make all vm statistics ulong such that
they are 64-bit on 64-bit systems where they can be atomically incremented
and are 32-bit on 32-bit systems which may not be able to atomically
increment 64-bit numbers. Modify vm_stat_get() to expect ulongs.
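
As a rough sketch of the accumulator side after this change (modelled on the
generic helpers in virt/kvm/kvm_main.c; locking and surrounding details
omitted):

  static int vm_stat_get(void *_offset, u64 *val)
  {
          unsigned offset = (long)_offset;
          struct kvm *kvm;

          *val = 0;
          list_for_each_entry(kvm, &vm_list, vm_list)
                  /* vm stats are now ulong: word-sized, so producers can
                   * use plain increments without atomics */
                  *val += *(ulong *)((void *)kvm + offset);
          return 0;
  }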

Signed-off-by: Suraj Jitindar Singh 

---
Change Log:

V2 -> V3:
- Instead of implementing separate u32 and u64 functions keep the
  generic functions and modify them to expect u64s. Thus update all
  vm and vcpu statistics to u64s accordingly.
V3 -> V4:
- Change vm_stats from u64 to ulong
---
 arch/arm/include/asm/kvm_host.h |  12 ++--
 arch/arm64/include/asm/kvm_host.h   |  12 ++--
 arch/mips/include/asm/kvm_host.h|  46 ++---
 arch/powerpc/include/asm/kvm_host.h |  60 -
 arch/s390/include/asm/kvm_host.h| 128 ++--
 arch/x86/include/asm/kvm_host.h |  72 ++--
 virt/kvm/kvm_main.c |   4 +-
 7 files changed, 167 insertions(+), 167 deletions(-)

diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 96387d4..c8e55b3b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -183,15 +183,15 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_vm_stat {
-   u32 remote_tlb_flush;
+   ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-   u32 halt_successful_poll;
-   u32 halt_attempted_poll;
-   u32 halt_poll_invalid;
-   u32 halt_wakeup;
-   u32 hvc_exit_stat;
+   u64 halt_successful_poll;
+   u64 halt_attempted_poll;
+   u64 halt_poll_invalid;
+   u64 halt_wakeup;
+   u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
u64 mmio_exit_user;
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 49095fc..b14c8bc 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -291,15 +291,15 @@ struct kvm_vcpu_arch {
 #endif
 
 struct kvm_vm_stat {
-   u32 remote_tlb_flush;
+   ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-   u32 halt_successful_poll;
-   u32 halt_attempted_poll;
-   u32 halt_poll_invalid;
-   u32 halt_wakeup;
-   u32 hvc_exit_stat;
+   u64 halt_successful_poll;
+   u64 halt_attempted_poll;
+   u64 halt_poll_invalid;
+   u64 halt_wakeup;
+   u64 hvc_exit_stat;
u64 wfe_exit_stat;
u64 wfi_exit_stat;
u64 mmio_exit_user;
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 36a391d..9704888 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -98,32 +98,32 @@ extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
 extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
 
 struct kvm_vm_stat {
-   u32 remote_tlb_flush;
+   ulong remote_tlb_flush;
 };
 
 struct kvm_vcpu_stat {
-   u32 wait_exits;
-   u32 cache_exits;
-   u32 signal_exits;
-   u32 int_exits;
-   u32 cop_unusable_exits;
-   u32 tlbmod_exits;
-   u32 tlbmiss_ld_exits;
-   u32 tlbmiss_st_exits;
-   u32 addrerr_st_exits;
-   u32 addrerr_ld_exits;
-   u32 syscall_exits;
-   u32 resvd_inst_exits;
-   u32 break_inst_exits;
-   u32 trap_inst_exits;
-   u32 msa_fpe_exits;
-   u32 fpe_exits;
-   u32 msa_disabled_exits;
-   u32 flush_dcache_exits;
-   u32 halt_successful_poll;
-   u32 halt_attempted_poll;
-   u32 halt_poll_invalid;
-   u32 halt_wakeup;
+   u64 wait_exits;
+   u64 cache_exits;
+   u64 signal_exits;
+   u64 int_exits;
+   u64 cop_unusable_exits;
+   u64 tlbmod_exits;
+   u64 tlbmiss_ld_exits;
+   u64 tlbmiss_st_exits;
+   u64 addrerr_st_exits;
+   u64 addrerr_ld_exits;
+   u64 syscall_exits;
+   u64 resvd_inst_exits;
+   u64 break_inst_exits;
+   u64 trap_inst_exits;
+   u64 msa_fpe_exits;
+   u64 fpe_exits;
+   u64 msa_disabled_exits;
+   u64 flush_dcache_exits;
+   u64 halt_successful_poll;
+   u64 halt_attempted_poll;
+   u64 halt_poll_invalid;
+   u64 halt_wakeup;
 };
 
 enum kvm_mips_exit_types {
diff --git a/arch/powerpc/include/asm/kvm_host.h 

[PATCH V4 3/5] kvm/ppc/book3s_hv: Implement halt polling in the kvm_hv kernel module

2016-07-19 Thread Suraj Jitindar Singh
This patch introduces new halt polling functionality into the kvm_hv kernel
module. When a vcore is idle it will poll for some period of time before
scheduling itself out.

When all of the runnable vcpus on a vcore have ceded (and thus the vcore is
idle) we schedule ourselves out to allow something else to run. In the
event that we need to wake up very quickly (for example an interrupt
arrives), we are required to wait until we get scheduled again.

Implement halt polling so that when a vcore is idle, and before scheduling
ourselves, we poll for vcpus in the runnable_threads list which have
pending exceptions or which leave the ceded state. If we poll successfully
then we can get back into the guest very quickly without ever scheduling
ourselves, otherwise we schedule ourselves out as before.

Testing of this patch with a TCP round robin test between two guests with
virtio network interfaces has found a decrease in round trip time of ~15us
on average. A performance gain is only seen when going out of and
back into the guest often and quickly; otherwise there is no net benefit
from the polling. The polling interval is adjusted such that when we are
often scheduled out for long periods of time it is reduced, and when we
often poll successfully it is increased. The rate at which the polling
interval increases or decreases, and the maximum polling interval, can
be set through module parameters.

Based on the implementation in the generic kvm module by Wanpeng Li and
Paolo Bonzini, and on direction from Paul Mackerras.
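
(Worked example with the defaults above: halt_poll_ns_grow = 2 doubles the
interval each time polling proves useful, capped at halt_poll_max_ns, while
halt_poll_ns_shrink = 0 resets the interval straight back to 0, rather than
dividing it down, once polling stops paying off.)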

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s.h |   1 +
 arch/powerpc/include/asm/kvm_host.h   |   1 +
 arch/powerpc/kvm/book3s_hv.c  | 116 ++
 arch/powerpc/kvm/trace_hv.h   |  22 +++
 4 files changed, 126 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 151f817..c261f52 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -102,6 +102,7 @@ struct kvmppc_vcore {
ulong pcr;
ulong dpdes;/* doorbell state (POWER8) */
ulong conferring_threads;
+   unsigned int halt_poll_ns;
 };
 
 struct kvmppc_vcpu_book3s {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 02d06e9..610f393 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -294,6 +294,7 @@ struct kvm_arch {
 #define VCORE_SLEEPING 3
 #define VCORE_RUNNING  4
 #define VCORE_EXITING  5
+#define VCORE_POLLING  6
 
 /*
  * Struct used to manage memory for a virtual processor area
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3bcf9e6..a9de1d4 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -94,6 +94,23 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
+/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
+static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
+module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
+
+/* Factor by which the vcore halt poll interval is grown, default is to double
+ */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
+
+/* Factor by which the vcore halt poll interval is shrunk, default is to reset
+ */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -2620,32 +2637,82 @@ static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
	finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 
+static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+   /* 10us base */
+   if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
+   vc->halt_poll_ns = 1;
+   else
+   vc->halt_poll_ns *= halt_poll_ns_grow;
+
+   if (vc->halt_poll_ns > halt_poll_max_ns)
+   vc->halt_poll_ns = halt_poll_max_ns;
+}
+
+static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
+{
+   if (halt_poll_ns_shrink == 0)
+   vc->halt_poll_ns = 0;
+   else
+   vc->halt_poll_ns /= halt_poll_ns_shrink;
+}
+
+/* Check to see if any of the runnable vcpus on the vcore have pending
+ * exceptions or are no longer ceded
+ */
+static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
+{
+   struct kvm_vcpu *vcpu;
+   int i;
+
+   for_each_runnable_thread(i, vcpu, vc) {
+   if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded)

[PATCH V4 2/5] kvm/ppc/book3s_hv: Change vcore element runnable_threads from linked-list to array

2016-07-19 Thread Suraj Jitindar Singh
The struct kvmppc_vcore is a structure used to store various information
about a virtual core for a kvm guest. The runnable_threads element of the
struct provides a list of all of the currently runnable vcpus on the core
(those in the KVMPPC_VCPU_RUNNABLE state). The previous implementation of
this list was a linked list. The next patch requires that the list be able
to be iterated over without holding the vcore lock.

Reimplement the runnable_threads list in the kvmppc_vcore struct as an
array. Implement a function to iterate over the valid entries in the array
and update the access sites accordingly.
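
(A sketch of the lock-free traversal this enables, using the accessors added
below: writers publish and retire slots with WRITE_ONCE, so an unlocked
reader sees either a valid vcpu pointer or NULL, never a torn value or a
freed list node.)

  /* writer, under the vcore lock */
  WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);

  /* reader, no vcore lock required */
  vcpu = READ_ONCE(vc->runnable_threads[i]);
  if (vcpu) {
          /* the slot was populated at the time of the read */
  }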

Signed-off-by: Suraj Jitindar Singh 
---
 arch/powerpc/include/asm/kvm_book3s.h |  2 +-
 arch/powerpc/include/asm/kvm_host.h   |  1 -
 arch/powerpc/kvm/book3s_hv.c  | 68 +--
 3 files changed, 43 insertions(+), 28 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index a50c5fe..151f817 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -87,7 +87,7 @@ struct kvmppc_vcore {
u8 vcore_state;
u8 in_guest;
struct kvmppc_vcore *master_vcore;
-   struct list_head runnable_threads;
+   struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
struct list_head preempt_list;
spinlock_t lock;
struct swait_queue_head wq;
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 19c6731..02d06e9 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -633,7 +633,6 @@ struct kvm_vcpu_arch {
long pgfault_index;
unsigned long pgfault_hpte[2];
 
-   struct list_head run_list;
struct task_struct *run_task;
struct kvm_run *kvm_run;
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e20beae..3bcf9e6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -57,6 +57,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s.h"
 
@@ -96,6 +97,26 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
+   int *ip)
+{
+   int i = *ip;
+   struct kvm_vcpu *vcpu;
+
+   while (++i < MAX_SMT_THREADS) {
+   vcpu = READ_ONCE(vc->runnable_threads[i]);
+   if (vcpu) {
+   *ip = i;
+   return vcpu;
+   }
+   }
+   return NULL;
+}
+
+/* Used to traverse the list of runnable threads for a given vcore */
+#define for_each_runnable_thread(i, vcpu, vc) \
for (i = -1; (vcpu = next_runnable_thread(vc, &i)); )
+
 static bool kvmppc_ipi_thread(int cpu)
 {
/* On POWER8 for IPIs to threads in the same core, use msgsnd */
@@ -1492,7 +1513,6 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
if (vcore == NULL)
return NULL;
 
-   INIT_LIST_HEAD(&vcore->runnable_threads);
spin_lock_init(&vcore->lock);
spin_lock_init(&vcore->stoltb_lock);
init_swait_queue_head(&vcore->wq);
@@ -1801,7 +1821,7 @@ static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
spin_unlock_irq(&vcpu->arch.tbacct_lock);
--vc->n_runnable;
-   list_del(&vcpu->arch.run_list);
+   WRITE_ONCE(vc->runnable_threads[vcpu->arch.ptid], NULL);
 }
 
 static int kvmppc_grab_hwthread(int cpu)
@@ -2208,10 +2228,10 @@ static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
 
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
-   struct kvm_vcpu *vcpu, *vnext;
+   int i;
+   struct kvm_vcpu *vcpu;
 
-   list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-            arch.run_list) {
+   for_each_runnable_thread(i, vcpu, vc) {
if (signal_pending(vcpu->arch.run_task))
vcpu->arch.ret = -EINTR;
else if (vcpu->arch.vpa.update_pending ||
@@ -2258,15 +2278,14 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
 
 static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
-   int still_running = 0;
+   int still_running = 0, i;
u64 now;
long ret;
-   struct kvm_vcpu *vcpu, *vnext;
+   struct kvm_vcpu *vcpu;
 
spin_lock(&vc->lock);
now = get_tb();
-   list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-            arch.run_list) {
+   for_each_runnable_thread(i, vcpu, vc) {
/* cancel pending dec exception if dec is positive */
if (now < vcpu->arch.dec_expires &&

[PATCH V4 1/5] kvm/ppc/book3s: Move struct kvmppc_vcore from kvm_host.h to kvm_book3s.h

2016-07-19 Thread Suraj Jitindar Singh
The next commit will introduce a member to the kvmppc_vcore struct which
references MAX_SMT_THREADS, which is defined in kvm_book3s_asm.h; however,
this file isn't included in kvm_host.h directly. Thus compiling for
certain platforms such as pmac32_defconfig and ppc64e_defconfig with KVM
fails due to MAX_SMT_THREADS not being defined.

Move the struct kvmppc_vcore definition to kvm_book3s.h which explicitly
includes kvm_book3s_asm.h.

Signed-off-by: Suraj Jitindar Singh 

---
Change Log:

V1 -> V2:
- Added patch to series
---
 arch/powerpc/include/asm/kvm_book3s.h | 35 +++
 arch/powerpc/include/asm/kvm_host.h   | 35 ---
 2 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 8f39796..a50c5fe 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -69,6 +69,41 @@ struct hpte_cache {
int pagesize;
 };
 
+/*
+ * Struct for a virtual core.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
+ */
+struct kvmppc_vcore {
+   int n_runnable;
+   int num_threads;
+   int entry_exit_map;
+   int napping_threads;
+   int first_vcpuid;
+   u16 pcpu;
+   u16 last_cpu;
+   u8 vcore_state;
+   u8 in_guest;
+   struct kvmppc_vcore *master_vcore;
+   struct list_head runnable_threads;
+   struct list_head preempt_list;
+   spinlock_t lock;
+   struct swait_queue_head wq;
+   spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
+   u64 stolen_tb;
+   u64 preempt_tb;
+   struct kvm_vcpu *runner;
+   struct kvm *kvm;
+   u64 tb_offset;  /* guest timebase - host timebase */
+   ulong lpcr;
+   u32 arch_compat;
+   ulong pcr;
+   ulong dpdes;/* doorbell state (POWER8) */
+   ulong conferring_threads;
+};
+
 struct kvmppc_vcpu_book3s {
struct kvmppc_sid_map sid_map[SID_MAP_NUM];
struct {
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index ec35af3..19c6731 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -275,41 +275,6 @@ struct kvm_arch {
 #endif
 };
 
-/*
- * Struct for a virtual core.
- * Note: entry_exit_map combines a bitmap of threads that have entered
- * in the bottom 8 bits and a bitmap of threads that have exited in the
- * next 8 bits.  This is so that we can atomically set the entry bit
- * iff the exit map is 0 without taking a lock.
- */
-struct kvmppc_vcore {
-   int n_runnable;
-   int num_threads;
-   int entry_exit_map;
-   int napping_threads;
-   int first_vcpuid;
-   u16 pcpu;
-   u16 last_cpu;
-   u8 vcore_state;
-   u8 in_guest;
-   struct kvmppc_vcore *master_vcore;
-   struct list_head runnable_threads;
-   struct list_head preempt_list;
-   spinlock_t lock;
-   struct swait_queue_head wq;
-   spinlock_t stoltb_lock; /* protects stolen_tb and preempt_tb */
-   u64 stolen_tb;
-   u64 preempt_tb;
-   struct kvm_vcpu *runner;
-   struct kvm *kvm;
-   u64 tb_offset;  /* guest timebase - host timebase */
-   ulong lpcr;
-   u32 arch_compat;
-   ulong pcr;
-   ulong dpdes;/* doorbell state (POWER8) */
-   ulong conferring_threads;
-};
-
 #define VCORE_ENTRY_MAP(vc)((vc)->entry_exit_map & 0xff)
 #define VCORE_EXIT_MAP(vc) ((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)   (VCORE_EXIT_MAP(vc) != 0)
-- 
2.5.5


Re: [PATCH v2 1/2] crypto: vmx - Adding asm subroutines for XTS

2016-07-19 Thread Herbert Xu
On Mon, Jul 18, 2016 at 12:26:25PM -0300, Paulo Flabiano Smorigo wrote:
> This patch adds XTS subroutines using the VMX crypto driver.
> 
> It gives a speed-up of 20 times for XTS.
> 
> This code has been adapted from the OpenSSL project in collaboration
> with the original author (Andy Polyakov ).
> 
> Signed-off-by: Leonidas S. Barbosa 
> Signed-off-by: Paulo Flabiano Smorigo 

Both patches applied.  Thanks.
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt