[PATCH 25/31] mm, multi-arch: pass a protection key in to calc_vm_flag_bits()

2016-01-06 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This plumbs a protection key through calc_vm_flag_bits().  We
could have done this in calc_vm_prot_bits(), but I did not feel
super strongly which way to go.  It was pretty arbitrary which
one to use.
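
As a standalone illustration of where the new argument ends up, the sketch
below mirrors the x86 arch_calc_vm_prot_bits(prot, key) macro quoted later in
this archive; the VM_PKEY_BIT* values are placeholders, since the real ones
are high VMA flags whose numeric values are not shown in this thread.

    #include <stdio.h>

    /* Illustrative stand-ins for VM_PKEY_BIT0..3 (the real values are
     * arch-defined high VMA flags, not quoted here). */
    #define VM_PKEY_BIT0 (1ULL << 32)
    #define VM_PKEY_BIT1 (1ULL << 33)
    #define VM_PKEY_BIT2 (1ULL << 34)
    #define VM_PKEY_BIT3 (1ULL << 35)

    /* Same shape as the x86 arch_calc_vm_prot_bits(prot, key) macro:
     * each of the key's four bits selects one VM_PKEY_BIT flag. */
    static unsigned long long pkey_to_vm_flags(unsigned long long key)
    {
            return ((key & 0x1) ? VM_PKEY_BIT0 : 0) |
                   ((key & 0x2) ? VM_PKEY_BIT1 : 0) |
                   ((key & 0x4) ? VM_PKEY_BIT2 : 0) |
                   ((key & 0x8) ? VM_PKEY_BIT3 : 0);
    }

    int main(void)
    {
            /* pkey 11 = 0b1011 -> bits 0, 1 and 3 set. */
            printf("pkey 11 -> vm_flags bits 0x%llx\n", pkey_to_vm_flags(11));
            return 0;
    }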

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
Cc: linux-a...@vger.kernel.org
---

 b/arch/powerpc/include/asm/mman.h  |5 +++--
 b/drivers/char/agp/frontend.c  |2 +-
 b/drivers/staging/android/ashmem.c |4 ++--
 b/include/linux/mman.h |6 +++---
 b/mm/mmap.c|2 +-
 b/mm/mprotect.c|2 +-
 b/mm/nommu.c   |2 +-
 7 files changed, 12 insertions(+), 11 deletions(-)

diff -puN arch/powerpc/include/asm/mman.h~pkeys-70-calc_vm_prot_bits 
arch/powerpc/include/asm/mman.h
--- a/arch/powerpc/include/asm/mman.h~pkeys-70-calc_vm_prot_bits
2016-01-06 15:50:13.971532951 -0800
+++ b/arch/powerpc/include/asm/mman.h   2016-01-06 15:50:13.984533537 -0800
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+   unsigned long pkey)
 {
return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff -puN drivers/char/agp/frontend.c~pkeys-70-calc_vm_prot_bits 
drivers/char/agp/frontend.c
--- a/drivers/char/agp/frontend.c~pkeys-70-calc_vm_prot_bits2016-01-06 
15:50:13.972532995 -0800
+++ b/drivers/char/agp/frontend.c   2016-01-06 15:50:13.984533537 -0800
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(i
 {
unsigned long prot_bits;
 
-   prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
+   prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
return vm_get_page_prot(prot_bits);
 }
 
diff -puN drivers/staging/android/ashmem.c~pkeys-70-calc_vm_prot_bits 
drivers/staging/android/ashmem.c
--- a/drivers/staging/android/ashmem.c~pkeys-70-calc_vm_prot_bits   
2016-01-06 15:50:13.974533086 -0800
+++ b/drivers/staging/android/ashmem.c  2016-01-06 15:50:13.985533582 -0800
@@ -372,8 +372,8 @@ static int ashmem_mmap(struct file *file
}
 
/* requested protection bits must match our allowed protection mask */
-   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-calc_vm_prot_bits(PROT_MASK))) {
+   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+calc_vm_prot_bits(PROT_MASK, 0))) {
ret = -EPERM;
goto out;
}
diff -puN include/linux/mman.h~pkeys-70-calc_vm_prot_bits include/linux/mman.h
--- a/include/linux/mman.h~pkeys-70-calc_vm_prot_bits   2016-01-06 
15:50:13.976533176 -0800
+++ b/include/linux/mman.h  2016-01-06 15:50:13.985533582 -0800
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(uns
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
   _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
   _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-  arch_calc_vm_prot_bits(prot);
+  arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*
diff -puN mm/mmap.c~pkeys-70-calc_vm_prot_bits mm/mmap.c
--- a/mm/mmap.c~pkeys-70-calc_vm_prot_bits  2016-01-06 15:50:13.977533221 
-0800
+++ b/mm/mmap.c 2016-01-06 15:50:13.986533627 -0800
@@ -1309,7 +1309,7 @@ unsigned long do_mmap(struct file *file,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 */
-   vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+   vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
if (flags & MAP_LOCKED)
diff -puN mm/mprotect.c~pkeys-70-calc_vm_prot_bits mm/mprotect.c
--- a/mm/mprotect.c~pkeys-70-calc_vm_prot_bits  2016-01-06 15:50:13.979533311 
-0800
+++ b/mm/mprotect.c 2016-01-06 15:50:13.986533627 -0800
@@ -373,7 +373,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long,

[PATCH 25/32] mm, multi-arch: pass a protection key in to calc_vm_flag_bits()

2015-12-14 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This plumbs a protection key through calc_vm_flag_bits().  We
could have done this in calc_vm_prot_bits(), but I did not feel
super strongly which way to go.  It was pretty arbitrary which
one to use.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
Cc: linux-a...@vger.kernel.org
---

 b/arch/powerpc/include/asm/mman.h  |5 +++--
 b/drivers/char/agp/frontend.c  |2 +-
 b/drivers/staging/android/ashmem.c |4 ++--
 b/include/linux/mman.h |6 +++---
 b/mm/mmap.c|2 +-
 b/mm/mprotect.c|2 +-
 b/mm/nommu.c   |2 +-
 7 files changed, 12 insertions(+), 11 deletions(-)

diff -puN arch/powerpc/include/asm/mman.h~pkeys-70-calc_vm_prot_bits 
arch/powerpc/include/asm/mman.h
--- a/arch/powerpc/include/asm/mman.h~pkeys-70-calc_vm_prot_bits
2015-12-14 10:42:50.063128373 -0800
+++ b/arch/powerpc/include/asm/mman.h   2015-12-14 10:42:50.076128955 -0800
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+   unsigned long pkey)
 {
return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff -puN drivers/char/agp/frontend.c~pkeys-70-calc_vm_prot_bits 
drivers/char/agp/frontend.c
--- a/drivers/char/agp/frontend.c~pkeys-70-calc_vm_prot_bits2015-12-14 
10:42:50.064128418 -0800
+++ b/drivers/char/agp/frontend.c   2015-12-14 10:42:50.076128955 -0800
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(i
 {
unsigned long prot_bits;
 
-   prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
+   prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
return vm_get_page_prot(prot_bits);
 }
 
diff -puN drivers/staging/android/ashmem.c~pkeys-70-calc_vm_prot_bits 
drivers/staging/android/ashmem.c
--- a/drivers/staging/android/ashmem.c~pkeys-70-calc_vm_prot_bits   
2015-12-14 10:42:50.066128507 -0800
+++ b/drivers/staging/android/ashmem.c  2015-12-14 10:42:50.077129000 -0800
@@ -372,8 +372,8 @@ static int ashmem_mmap(struct file *file
}
 
/* requested protection bits must match our allowed protection mask */
-   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-calc_vm_prot_bits(PROT_MASK))) {
+   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+calc_vm_prot_bits(PROT_MASK, 0))) {
ret = -EPERM;
goto out;
}
diff -puN include/linux/mman.h~pkeys-70-calc_vm_prot_bits include/linux/mman.h
--- a/include/linux/mman.h~pkeys-70-calc_vm_prot_bits   2015-12-14 
10:42:50.068128597 -0800
+++ b/include/linux/mman.h  2015-12-14 10:42:50.077129000 -0800
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(uns
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
   _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
   _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-  arch_calc_vm_prot_bits(prot);
+  arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*
diff -puN mm/mmap.c~pkeys-70-calc_vm_prot_bits mm/mmap.c
--- a/mm/mmap.c~pkeys-70-calc_vm_prot_bits  2015-12-14 10:42:50.069128642 
-0800
+++ b/mm/mmap.c 2015-12-14 10:42:50.078129045 -0800
@@ -1309,7 +1309,7 @@ unsigned long do_mmap(struct file *file,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 */
-   vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+   vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
if (flags & MAP_LOCKED)
diff -puN mm/mprotect.c~pkeys-70-calc_vm_prot_bits mm/mprotect.c
--- a/mm/mprotect.c~pkeys-70-calc_vm_prot_bits  2015-12-14 10:42:50.071128731 
-0800
+++ b/mm/mprotect.c 2015-12-14 10:42:50.078129045 -0800
@@ -373,7 +373,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long,

[PATCH 00/32] x86: Memory Protection Keys (v7)

2015-12-14 Thread Dave Hansen
Memory Protection Keys for User pages is a CPU feature which will
first appear on Skylake Servers, but will also be supported on
future non-server parts (there is also a QEMU implementation).  It
provides a mechanism for enforcing page-based protections, but
without requiring modification of the page tables when an
application changes protection domains.  See the Documentation/
patch for more details.

This set enables two things in the end:
1. Allows "execute-only" memory
2. Enables KVM to run Protection-Key-enabled guests

Changes from v6:
 * fix up ??'s showing up in smaps' VmFlags field
 * added execute-only support
 * removed all the new syscalls from this set.  We can discuss
   them in detail after this is merged.

Changes from v5:

 * make types in read_pkru() u32's, not ints
 * rework VM_* bits to avoid using __ffsl() and clean up
   vma_pkey()
 * rework pte_allows_gup() to use p??_val() instead of passing
   around p{te,md,ud}_t types.
 * Fix up some inconsistent bool vs. int usage
 * corrected name of ARCH_VM_PKEY_FLAGS in patch description
 * remove NR_PKEYS... config option.  Just define it directly

Changes from v4:

 * Made "allow setting of XSAVE state" safe if we got preempted
   between when we saved our FPU state and when we restore it.
   (I would appreciate a look from Ingo on this patch).
 * Fixed up a few things from Thomas's latest comments: split up
   siginfo into x86 and generic, removed extra 'eax' variable
   in rdpkru function, reworked vm_flags assignment, reworded
   a comment in pte_allows_gup()
 * Add missing DISABLED/REQUIRED_MASK14 in cpufeature.h
 * Added comment about compile optimization in fault path
 * Left get_user_pages_locked() alone.  Andrea thinks we need it.

Changes from RFCv3:

 * Added 'current' and 'foreign' variants of get_user_pages() to
   help indicate whether protection keys should be enforced.
   Thanks to Jerome Glisse for pointing out this issue.
 * Added "allocation" and set/get system calls so that we can do
   management of protection keys in the kernel.  This opens the
   door to use of specific protection keys for kernel use in the
   future, such as for execute-only memory.
 * Removed the kselftest code for the moment.  It will be
   submitted separately.

Changes from RFCv2 (Thanks Ingo and Thomas for most of these):

 * few minor compile warnings
 * changed 'nopku' interaction with cpuid bits.  Now, we do not
   clear the PKU cpuid bit, we just skip enabling it.
 * changed __pkru_allows_write() to also check access disable bit
 * removed the unused write_pkru()
 * made si_pkey a u64 and added some patch description details.
   Also made it share space in siginfo with MPX and clarified
   comments.
 * give some real text for the Processor Trace xsave state
 * made vma_pkey() less ugly (and much more optimized actually)
 * added SEGV_PKUERR to copy_siginfo_to_user()
 * remove page table walk when filling in si_pkey, added some
   big fat comments about it being inherently racy.
 * added self test code

This code is not runnable by anyone outside of Intel unless they
have some special hardware or a fancy simulator.  There is a qemu
model to emulate the feature, but it is not currently implemented
fully enough to be usable.  If you are interested in running this
for real, please get in touch with me.  Hardware is available to a
very small but nonzero number of people.

This set is also available here:

git://git.kernel.org/pub/scm/linux/kernel/git/daveh/x86-pkeys.git 
pkeys-v018

=== diffstat ===

Dave Hansen (32):
  mm, gup: introduce concept of "foreign" get_user_pages()
  x86, fpu: add placeholder for Processor Trace XSAVE state
  x86, pkeys: Add Kconfig option
  x86, pkeys: cpuid bit definition
  x86, pkeys: define new CR4 bit
  x86, pkeys: add PKRU xsave fields and data structure(s)
  x86, pkeys: PTE bits for storing protection key
  x86, pkeys: new page fault error code bit: PF_PK
  x86, pkeys: store protection in high VMA flags
  x86, pkeys: arch-specific protection bits
  x86, pkeys: pass VMA down in to fault signal generation code
  signals, pkeys: notify userspace about protection key faults
  x86, pkeys: fill in pkey field in siginfo
  x86, pkeys: add functions to fetch PKRU
  mm: factor out VMA fault permission checking
  x86, mm: simplify get_user_pages() PTE bit handling
  x86, pkeys: check VMAs and PTEs for protection keys
  mm: add gup flag to indicate "foreign" mm access
  x86, pkeys: optimize fault handling in access_error()
  x86, pkeys: differentiate instruction fetches
  x86, pkeys: dump PKRU with other kernel registers
  x86, pkeys: dump PTE pkey in /proc/pid/smaps
  x86, pkeys: add Kconfig prompt to existing config option
  x86, pkeys: actually enable Memory Protection Keys in CPU
  mm, multi-arch: pass a protection k

Re: [PATCH 26/34] mm: implement new mprotect_key() system call

2015-12-09 Thread Dave Hansen
On 12/09/2015 08:45 AM, Michael Kerrisk (man-pages) wrote:
>>> >> * Explanation of what a protection domain is.
>> >
>> > A protection domain is a unique view of memory and is represented by the
>> > value in the PKRU register.
> Put something about this in pkey(7), but explain what you mean by a
> "unique view of memory".

Let's say there are only two protection keys: 0 and 1.  There are two
disable bits per protection key (Access and Write Disable), so a two-key
PKRU looks like:

|   PKEY0   |   PKEY1   |
| AD0 | WD0 | AD1 | WD1 |

In this example, there are 16 possible protection domains, one for each
possible combination of the 4 rights-disable bits.

"Changing a protection domain" would mean changing (setting or clearing)
the value of any of those 4 bits.  Each unique value of PKRU represents
a view of memory, or unique protection domain.
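
A tiny standalone C sketch of the same arithmetic, assuming the Access-Disable
bit is the low bit of each per-key pair as the diagram suggests (the
PKRU_BITS_PER_PKEY name matches the xstate patches quoted further down):

    #include <stdio.h>

    #define PKRU_BITS_PER_PKEY 2        /* AD + WD per key, as above */
    #define PKRU_AD_BIT 0x1             /* Access Disable */
    #define PKRU_WD_BIT 0x2             /* Write Disable  */

    /* Does this PKRU value deny reads through 'pkey'? */
    static int pkru_denies_access(unsigned int pkru, int pkey)
    {
            return !!(pkru & (PKRU_AD_BIT << (pkey * PKRU_BITS_PER_PKEY)));
    }

    int main(void)
    {
            int nr_keys = 2;
            /* 4 disable bits -> 1 << 4 = 16 distinct protection domains. */
            printf("%d domains\n", 1 << (nr_keys * PKRU_BITS_PER_PKEY));
            /* PKRU = 0x4 sets AD1: key 1 is access-disabled, key 0 is not. */
            printf("key0 denied: %d, key1 denied: %d\n",
                   pkru_denies_access(0x4, 0), pkru_denies_access(0x4, 1));
            return 0;
    }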


Re: [PATCH 28/34] x86: wire up mprotect_key() system call

2015-12-08 Thread Dave Hansen
On 12/08/2015 10:44 AM, Thomas Gleixner wrote:
> On Thu, 3 Dec 2015, Dave Hansen wrote:
>>  #include 
>> diff -puN mm/Kconfig~pkeys-16-x86-mprotect_key mm/Kconfig
>> --- a/mm/Kconfig~pkeys-16-x86-mprotect_key   2015-12-03 16:21:31.114920208 
>> -0800
>> +++ b/mm/Kconfig 2015-12-03 16:21:31.119920435 -0800
>> @@ -679,4 +679,5 @@ config NR_PROTECTION_KEYS
>>  # Everything supports a _single_ key, so allow folks to
>>  # at least call APIs that take keys, but require that the
>>  # key be 0.
>> +default 16 if X86_INTEL_MEMORY_PROTECTION_KEYS
>>  default 1
> 
> What happens if I set that to 42?
> 
> I think we want to make this a runtime evaluated thingy. If pkeys are
> compiled in, but the machine does not support it then we don't support
> 16 keys, or do we?

We do have runtime evaluation:

#define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ?  \
 CONFIG_NR_PROTECTION_KEYS : 1)

The config option really just sets the architectural limit for how many
are supported.  So it probably needs a better name at least.  Let me
take a look at getting rid of this config option entirely.


Re: [PATCH 26/34] mm: implement new mprotect_key() system call

2015-12-07 Thread Dave Hansen
On 12/04/2015 10:50 PM, Michael Kerrisk (man-pages) wrote:
> On 12/04/2015 02:15 AM, Dave Hansen wrote:
>> From: Dave Hansen <dave.han...@linux.intel.com>
>>
>> mprotect_key() is just like mprotect, except it also takes a
>> protection key as an argument.  On systems that do not support
>> protection keys, it still works, but requires that key=0.
>> Otherwise it does exactly what mprotect does.
> 
> Is there a man page for this API?

Yep.  Patch to man-pages source is attached.  I actually broke it up
into a few separate pages.  I was planning on submitting these after the
patches themselves go upstream.
commit ebb12643876810931ed23992f92b7c77c2c36883
Author: Dave Hansen <dave.han...@intel.com>
Date:   Mon Dec 7 08:42:57 2015 -0800

pkeys

diff --git a/man2/mprotect.2 b/man2/mprotect.2
index ae305f6..a3c1e62 100644
--- a/man2/mprotect.2
+++ b/man2/mprotect.2
@@ -38,16 +38,19 @@
 .\"
 .TH MPROTECT 2 2015-07-23 "Linux" "Linux Programmer's Manual"
 .SH NAME
-mprotect \- set protection on a region of memory
+mprotect, mprotect_key \- set protection on a region of memory
 .SH SYNOPSIS
 .nf
 .B #include 
 .sp
 .BI "int mprotect(void *" addr ", size_t " len ", int " prot );
+.BI "int mprotect_key(void *" addr ", size_t " len ", int " prot , " int " key);
 .fi
 .SH DESCRIPTION
 .BR mprotect ()
-changes protection for the calling process's memory page(s)
+and
+.BR mprotect_key ()
+change protection for the calling process's memory page(s)
 containing any part of the address range in the
 interval [\fIaddr\fP,\ \fIaddr\fP+\fIlen\fP\-1].
 .I addr
@@ -74,10 +77,17 @@ The memory can be modified.
 .TP
 .B PROT_EXEC
 The memory can be executed.
+.PP
+.I key
+is the protection or storage key to assign to the memory.
+A key must be allocated with pkey_alloc () before it is
+passed to pkey_mprotect ().
 .SH RETURN VALUE
 On success,
 .BR mprotect ()
-returns zero.
+and
+.BR mprotect_key ()
+return zero.
 On error, \-1 is returned, and
 .I errno
 is set appropriately.
diff --git a/man2/pkey_alloc.2 b/man2/pkey_alloc.2
new file mode 100644
index 000..980ce3e
--- /dev/null
+++ b/man2/pkey_alloc.2
@@ -0,0 +1,72 @@
+.\" Copyright (C) 2007 Michael Kerrisk <mtk.manpa...@gmail.com>
+.\" and Copyright (C) 1995 Michael Shields <shie...@tembel.org>.
+.\"
+.\" %%%LICENSE_START(VERBATIM)
+.\" Permission is granted to make and distribute verbatim copies of this
+.\" manual provided the copyright notice and this permission notice are
+.\" preserved on all copies.
+.\"
+.\" Permission is granted to copy and distribute modified versions of this
+.\" manual under the conditions for verbatim copying, provided that the
+.\" entire resulting derived work is distributed under the terms of a
+.\" permission notice identical to this one.
+.\"
+.\" Since the Linux kernel and libraries are constantly changing, this
+.\" manual page may be incorrect or out-of-date.  The author(s) assume no
+.\" responsibility for errors or omissions, or for damages resulting from
+.\" the use of the information contained herein.  The author(s) may not
+.\" have taken the same level of care in the production of this manual,
+.\" which is licensed free of charge, as they might when working
+.\" professionally.
+.\"
+.\" Formatted or processed versions of this manual, if unaccompanied by
+.\" the source, must acknowledge the copyright and author of this work.
+.\" %%%LICENSE_END
+.\"
+.\" Modified 2015-12-04 by Dave Hansen <d...@sr71.net>
+.\"
+.\"
+.TH PKEY_ALLOC 2 2015-12-04 "Linux" "Linux Programmer's Manual"
+.SH NAME
+pkey_alloc, pkey_free \- allocate or free a protection key
+.SH SYNOPSIS
+.nf
+.B #include 
+.sp
+.BI "int pkey_alloc(unsigned long" flags ", unsigned long " init_val);
+.BI "int pkey_free(int " pkey);
+.fi
+.SH DESCRIPTION
+.BR pkey_alloc ()
+and
+.BR pkey_free ()
+allow or disallow the calling process to use the given
+protection key for all protection-key-related operations.
+
+.PP
+.I flags
+may contain zero or more disable operations:
+.B PKEY_DISABLE_ACCESS
+and/or
+.B PKEY_DISABLE_WRITE
+.SH RETURN VALUE
+On success,
+.BR pkey_alloc ()
+and
+.BR pkey_free ()
+return zero.
+On error, \-1 is returned, and
+.I errno
+is set appropriately.
+.SH ERRORS
+.TP
+.B EINVAL
+An invalid protection key, flag, or init_val was specified.
+.TP
+.B ENOSPC
+All protection keys available for the current process have
+been allocated.
+.SH SEE ALSO
+.BR mprotect_pkey (2),
+.BR pkey_get (2),
+.BR pkey_set (2),
diff --git a/man2/pkey_get.2 b/man2/pkey_get.2
new file mode 100644
index 000..4cfdea9
--- /dev/null
+++ b/man2/pkey_get.2
@@ -0,0 +1,76 @@
+.\" Copyright (C) 2007 Michael Kerrisk <

Re: [PATCH 00/34] x86: Memory Protection Keys (v5)

2015-12-04 Thread Dave Hansen
On 12/04/2015 03:31 PM, Andy Lutomirski wrote:
> On Thu, Dec 3, 2015 at 5:14 PM, Dave Hansen <d...@sr71.net> wrote:
>> Memory Protection Keys for User pages is a CPU feature which will
>> first appear on Skylake Servers, but will also be supported on
>> future non-server parts.  It provides a mechanism for enforcing
>> page-based protections, but without requiring modification of the
>> page tables when an application changes protection domains.  See
>> the Documentation/ patch for more details.
> 
> What, if anything, happened to the signal handling parts?

Patches 12 and 13 contain most of it:

x86, pkeys: fill in pkey field in siginfo
signals, pkeys: notify userspace about protection key faults

I decided to just not try to preserve the pkey_get/set() semantics
across entering and returning from signals, fwiw.

> Also, do you have a git tree for this somewhere?  I can't actually
> enable it (my laptop, while very shiny, is not a Skylake server), but
> I can poke around a bit.

http://git.kernel.org/cgit/linux/kernel/git/daveh/x86-pkeys.git/

Thanks for taking a look!


[PATCH 26/34] mm: implement new mprotect_key() system call

2015-12-03 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

mprotect_key() is just like mprotect, except it also takes a
protection key as an argument.  On systems that do not support
protection keys, it still works, but requires that key=0.
Otherwise it does exactly what mprotect does.

I expect it to get used like this, if you want to guarantee that
any mapping you create can *never* be accessed without the right
protection keys set up.

pkey_deny_access(11); // random pkey
int real_prot = PROT_READ|PROT_WRITE;
ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
0);
ret = mprotect_key(ptr, PAGE_SIZE, real_prot, 11);

This way, there is *no* window where the mapping is accessible
since it was always either PROT_NONE or had a protection key set.

We settled on 'unsigned long' for the type of the key here.  We
only need 4 bits on x86 today, but I figured that other
architectures might need some more space.
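
Purely as an illustration, here is a standalone sketch of the same
no-accessible-window pattern driven through raw syscall(2).  The x86_64
syscall numbers are taken from the syscall-table patches quoted elsewhere in
this archive, pkey_alloc() stands in for the pkey_deny_access() helper above,
and the 0x1 value for PKEY_DISABLE_ACCESS is an assumption (the constant's
definition is not quoted in this thread):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    /* x86_64 numbers from this patch series' syscall tables (not
     * guaranteed to match what was eventually merged upstream). */
    #define __NR_pkey_mprotect 326
    #define __NR_pkey_alloc    327
    #define PKEY_DISABLE_ACCESS 0x1     /* assumed value */

    int main(void)
    {
            long page = sysconf(_SC_PAGESIZE);
            /* Allocate a key whose PKRU bits already deny all access. */
            int pkey = syscall(__NR_pkey_alloc, 0, PKEY_DISABLE_ACCESS);
            if (pkey < 0) {
                    perror("pkey_alloc");
                    return 1;
            }
            /* Map PROT_NONE first, then attach real protections plus the
             * key, so the mapping is never reachable without the key. */
            void *ptr = mmap(NULL, page, PROT_NONE,
                             MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
            if (ptr == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            if (syscall(__NR_pkey_mprotect, ptr, page,
                        PROT_READ | PROT_WRITE, pkey)) {
                    perror("pkey_mprotect");
                    return 1;
            }
            printf("mapping tagged with pkey %d\n", pkey);
            return 0;
    }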

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/include/asm/mmu_context.h |   10 +++--
 b/include/linux/pkeys.h  |7 +-
 b/mm/Kconfig |7 ++
 b/mm/mprotect.c  |   36 +--
 4 files changed, 51 insertions(+), 9 deletions(-)

diff -puN arch/x86/include/asm/mmu_context.h~pkeys-85-mprotect_pkey 
arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkeys-85-mprotect_pkey 2015-12-03 
16:21:30.181877894 -0800
+++ b/arch/x86/include/asm/mmu_context.h2015-12-03 16:21:30.190878302 
-0800
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -243,10 +244,14 @@ static inline void arch_unmap(struct mm_
mpx_notify_unmap(mm, vma, start, end);
 }
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * If the config option is off, we get the generic version from
+ * include/linux/pkeys.h.
+ */
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
u16 pkey = 0;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
  VM_PKEY_BIT2 | VM_PKEY_BIT3;
/*
@@ -259,9 +264,10 @@ static inline int vma_pkey(struct vm_are
 */
pkey = (vma->vm_flags >> vm_pkey_shift) &
   (vma_pkey_mask >> vm_pkey_shift);
-#endif
+
return pkey;
 }
+#endif
 
 static inline bool __pkru_allows_pkey(u16 pkey, bool write)
 {
diff -puN include/linux/pkeys.h~pkeys-85-mprotect_pkey include/linux/pkeys.h
--- a/include/linux/pkeys.h~pkeys-85-mprotect_pkey  2015-12-03 
16:21:30.183877985 -0800
+++ b/include/linux/pkeys.h 2015-12-03 16:21:30.190878302 -0800
@@ -2,10 +2,10 @@
 #define _LINUX_PKEYS_H
 
 #include 
-#include 
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
 #include 
+#include 
 #else /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
@@ -17,6 +17,11 @@ static inline bool arch_validate_pkey(in
 {
return true;
 }
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+   return 0;
+}
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */
diff -puN mm/Kconfig~pkeys-85-mprotect_pkey mm/Kconfig
--- a/mm/Kconfig~pkeys-85-mprotect_pkey 2015-12-03 16:21:30.185878075 -0800
+++ b/mm/Kconfig2015-12-03 16:21:30.190878302 -0800
@@ -673,3 +673,10 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
 config ARCH_HAS_PKEYS
bool
+
+config NR_PROTECTION_KEYS
+   int
+   # Everything supports a _single_ key, so allow folks to
+   # at least call APIs that take keys, but require that the
+   # key be 0.
+   default 1
diff -puN mm/mprotect.c~pkeys-85-mprotect_pkey mm/mprotect.c
--- a/mm/mprotect.c~pkeys-85-mprotect_pkey  2015-12-03 16:21:30.186878121 
-0800
+++ b/mm/mprotect.c 2015-12-03 16:21:30.191878347 -0800
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -344,10 +345,13 @@ fail:
return error;
 }
 
-SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
-   unsigned long, prot)
+/*
+ * pkey=-1 when doing a legacy mprotect()
+ */
+static int do_mprotect_pkey(unsigned long start, size_t len,
+   unsigned long prot, int pkey)
 {
-   unsigned long vm_flags, nstart, end, tmp, reqprot;
+   unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
@@ -373,8 +377,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
 
-   vm_flags = calc_vm_prot_bits(prot, 0);
-
down_write(&current->mm->mmap_sem);
 
vma = find_vma(current->mm, start);
@@ -407,7 +409,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
 
   

[PATCH 24/34] mm, multi-arch: pass a protection key in to calc_vm_flag_bits()

2015-12-03 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This plumbs a protection key through calc_vm_flag_bits().  We
could have done this in calc_vm_prot_bits(), but I did not feel
super strongly which way to go.  It was pretty arbitrary which
one to use.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
Cc: linux-a...@vger.kernel.org
---

 b/arch/powerpc/include/asm/mman.h  |5 +++--
 b/drivers/char/agp/frontend.c  |2 +-
 b/drivers/staging/android/ashmem.c |4 ++--
 b/include/linux/mman.h |6 +++---
 b/mm/mmap.c|2 +-
 b/mm/mprotect.c|2 +-
 b/mm/nommu.c   |2 +-
 7 files changed, 12 insertions(+), 11 deletions(-)

diff -puN arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits 
arch/powerpc/include/asm/mman.h
--- a/arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits
2015-12-03 16:21:29.142830772 -0800
+++ b/arch/powerpc/include/asm/mman.h   2015-12-03 16:21:29.155831361 -0800
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+   unsigned long pkey)
 {
return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff -puN drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits 
drivers/char/agp/frontend.c
--- a/drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits2015-12-03 
16:21:29.143830817 -0800
+++ b/drivers/char/agp/frontend.c   2015-12-03 16:21:29.155831361 -0800
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(i
 {
unsigned long prot_bits;
 
-   prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
+   prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
return vm_get_page_prot(prot_bits);
 }
 
diff -puN drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits 
drivers/staging/android/ashmem.c
--- a/drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits   
2015-12-03 16:21:29.145830908 -0800
+++ b/drivers/staging/android/ashmem.c  2015-12-03 16:21:29.156831407 -0800
@@ -372,8 +372,8 @@ static int ashmem_mmap(struct file *file
}
 
/* requested protection bits must match our allowed protection mask */
-   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-calc_vm_prot_bits(PROT_MASK))) {
+   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+calc_vm_prot_bits(PROT_MASK, 0))) {
ret = -EPERM;
goto out;
}
diff -puN include/linux/mman.h~pkeys-84-calc_vm_prot_bits include/linux/mman.h
--- a/include/linux/mman.h~pkeys-84-calc_vm_prot_bits   2015-12-03 
16:21:29.147830999 -0800
+++ b/include/linux/mman.h  2015-12-03 16:21:29.156831407 -0800
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(uns
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
   _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
   _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-  arch_calc_vm_prot_bits(prot);
+  arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*
diff -puN mm/mmap.c~pkeys-84-calc_vm_prot_bits mm/mmap.c
--- a/mm/mmap.c~pkeys-84-calc_vm_prot_bits  2015-12-03 16:21:29.148831044 
-0800
+++ b/mm/mmap.c 2015-12-03 16:21:29.157831452 -0800
@@ -1309,7 +1309,7 @@ unsigned long do_mmap(struct file *file,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 */
-   vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+   vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
if (flags & MAP_LOCKED)
diff -puN mm/mprotect.c~pkeys-84-calc_vm_prot_bits mm/mprotect.c
--- a/mm/mprotect.c~pkeys-84-calc_vm_prot_bits  2015-12-03 16:21:29.150831135 
-0800
+++ b/mm/mprotect.c 2015-12-03 16:21:29.158831497 -0800
@@ -373,7 +373,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long,

[PATCH 28/34] x86: wire up mprotect_key() system call

2015-12-03 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This is all that we need to get the new system call itself
working on x86.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |1 +
 b/arch/x86/entry/syscalls/syscall_64.tbl |1 +
 b/arch/x86/include/uapi/asm/mman.h   |6 ++
 b/mm/Kconfig |1 +
 4 files changed, 9 insertions(+)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkeys-16-x86-mprotect_key 
arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkeys-16-x86-mprotect_key  
2015-12-03 16:21:31.109919982 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl2015-12-03 16:21:31.118920390 
-0800
@@ -383,3 +383,4 @@
 374  i386  userfaultfd     sys_userfaultfd
 375  i386  membarrier      sys_membarrier
 376  i386  mlock2          sys_mlock2
+377  i386  pkey_mprotect   sys_pkey_mprotect
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkeys-16-x86-mprotect_key 
arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkeys-16-x86-mprotect_key  
2015-12-03 16:21:31.111920072 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl2015-12-03 16:21:31.118920390 
-0800
@@ -332,6 +332,7 @@
 323  common  userfaultfd     sys_userfaultfd
 324  common  membarrier      sys_membarrier
 325  common  mlock2          sys_mlock2
+326  common  pkey_mprotect   sys_pkey_mprotect
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/uapi/asm/mman.h~pkeys-16-x86-mprotect_key 
arch/x86/include/uapi/asm/mman.h
--- a/arch/x86/include/uapi/asm/mman.h~pkeys-16-x86-mprotect_key
2015-12-03 16:21:31.113920163 -0800
+++ b/arch/x86/include/uapi/asm/mman.h  2015-12-03 16:21:31.118920390 -0800
@@ -20,6 +20,12 @@
((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
+   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
+   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
+   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
 #endif
 
 #include 
diff -puN mm/Kconfig~pkeys-16-x86-mprotect_key mm/Kconfig
--- a/mm/Kconfig~pkeys-16-x86-mprotect_key  2015-12-03 16:21:31.114920208 
-0800
+++ b/mm/Kconfig2015-12-03 16:21:31.119920435 -0800
@@ -679,4 +679,5 @@ config NR_PROTECTION_KEYS
# Everything supports a _single_ key, so allow folks to
# at least call APIs that take keys, but require that the
# key be 0.
+   default 16 if X86_INTEL_MEMORY_PROTECTION_KEYS
default 1
_


[PATCH 00/34] x86: Memory Protection Keys (v5)

2015-12-03 Thread Dave Hansen
Memory Protection Keys for User pages is a CPU feature which will
first appear on Skylake Servers, but will also be supported on
future non-server parts.  It provides a mechanism for enforcing
page-based protections, but without requiring modification of the
page tables when an application changes protection domains.  See
the Documentation/ patch for more details.

Changes from v4:

 * Made "allow setting of XSAVE state" safe if we got preempted
   between when we saved our FPU state and when we restore it.
   (I would appreciate a look from Ingo on this patch).
 * Fixed up a few things from Thomas's latest comments: split up
   siginfo into x86 and generic, removed extra 'eax' variable
   in rdpkru function, reworked vm_flags assignment, reworded
   a comment in pte_allows_gup()
 * Add missing DISABLED/REQUIRED_MASK14 in cpufeature.h
 * Added comment about compile optimization in fault path
 * Left get_user_pages_locked() alone.  Andrea thinks we need it.

Changes from RFCv3:

 * Added 'current' and 'foreign' variants of get_user_pages() to
   help indicate whether protection keys should be enforced.
   Thanks to Jerome Glisse for pointing out this issue.
 * Added "allocation" and set/get system calls so that we can do
   management of protection keys in the kernel.  This opens the
   door to use of specific protection keys for kernel use in the
   future, such as for execute-only memory.
 * Removed the kselftest code for the moment.  It will be
   submitted separately.

Changes from RFCv2 (Thanks Ingo and Thomas for most of these):

 * few minor compile warnings
 * changed 'nopku' interaction with cpuid bits.  Now, we do not
   clear the PKU cpuid bit, we just skip enabling it.
 * changed __pkru_allows_write() to also check access disable bit
 * removed the unused write_pkru()
 * made si_pkey a u64 and added some patch description details.
   Also made it share space in siginfo with MPX and clarified
   comments.
 * give some real text for the Processor Trace xsave state
 * made vma_pkey() less ugly (and much more optimized actually)
 * added SEGV_PKUERR to copy_siginfo_to_user()
 * remove page table walk when filling in si_pkey, added some
   big fat comments about it being inherently racy.
 * added self test code

This code is not runnable by anyone outside of Intel unless they
have some special hardware or a fancy simulator.  If you are
interested in running this for real, please get in touch with me.
Hardware is available to a very small but nonzero number of
people.

This set is also available here (with the new syscall):

git://git.kernel.org/pub/scm/linux/kernel/git/daveh/x86-pkeys.git 
pkeys-v014

=== diffstat ===

Dave Hansen (34):
  mm, gup: introduce concept of "foreign" get_user_pages()
  x86, fpu: add placeholder for Processor Trace XSAVE state
  x86, pkeys: Add Kconfig option
  x86, pkeys: cpuid bit definition
  x86, pkeys: define new CR4 bit
  x86, pkeys: add PKRU xsave fields and data structure(s)
  x86, pkeys: PTE bits for storing protection key
  x86, pkeys: new page fault error code bit: PF_PK
  x86, pkeys: store protection in high VMA flags
  x86, pkeys: arch-specific protection bits
  x86, pkeys: pass VMA down in to fault signal generation code
  signals, pkeys: notify userspace about protection key faults
  x86, pkeys: fill in pkey field in siginfo
  x86, pkeys: add functions to fetch PKRU
  mm: factor out VMA fault permission checking
  x86, mm: simplify get_user_pages() PTE bit handling
  x86, pkeys: check VMAs and PTEs for protection keys
  mm: add gup flag to indicate "foreign" mm access
  x86, pkeys: optimize fault handling in access_error()
  x86, pkeys: differentiate instruction fetches
  x86, pkeys: dump PKRU with other kernel registers
  x86, pkeys: dump PTE pkey in /proc/pid/smaps
  x86, pkeys: add Kconfig prompt to existing config option
  mm, multi-arch: pass a protection key in to calc_vm_flag_bits()
  x86, pkeys: add arch_validate_pkey()
  mm: implement new mprotect_key() system call
  x86, pkeys: make mprotect_key() mask off additional vm_flags
  x86: wire up mprotect_key() system call
  x86: separate out LDT init from context init
  x86, fpu: allow setting of XSAVE state
  x86, pkeys: allocation/free syscalls
  x86, pkeys: add pkey set/get syscalls
  x86, pkeys: actually enable Memory Protection Keys in CPU
  x86, pkeys: Documentation

 Documentation/kernel-parameters.txt |   3 +
 Documentation/x86/protection-keys.txt   |  53 +
 arch/mips/mm/gup.c  |   3 +-
 arch/powerpc/include/asm/mman.h |   5 +-
 arch/powerpc/include/asm/mmu_context.h  |  12 +
 arch/s390/include/asm/mmu_context.h |  12 +
 arch/s390/mm/gup.c  |   3 +-
 arch/sh/mm/gup.c 

[PATCH 31/34] x86, pkeys: allocation/free syscalls

2015-12-03 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This patch adds two new system calls:

int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
int pkey_free(int pkey);

These establish which protection keys are valid for use by
userspace.  A key which was not obtained by pkey_alloc() may not
be passed to pkey_mprotect().

In addition, the 'init_access_rights' argument to pkey_alloc() specifies
the rights that will be established for the returned pkey.  For instance

pkey = pkey_alloc(flags, PKEY_DENY_WRITE);

will return with the bits set in PKRU such that writing to 'pkey' is
already denied.  This keeps userspace from needing to have knowledge
about manipulating PKRU.  It is still free to do so if it wishes, but
it is no longer required.

The kernel does _not_ enforce that this interface must be used for
changes to PKRU, even for keys it does not control.
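
A minimal standalone sketch of the allocate/free lifecycle described above;
the x86_64 syscall numbers are the ones added by this patch, and the 0x2
value for the write-deny bit is an assumption, since the constant's
definition is not quoted here:

    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define __NR_pkey_alloc 327         /* from this patch's x86_64 table */
    #define __NR_pkey_free  328
    #define PKEY_DISABLE_WRITE 0x2      /* assumed value */

    int main(void)
    {
            int keys[32], n = 0;

            /* Keep allocating write-denied keys until the kernel reports
             * an error (e.g. ENOSPC when no keys are left). */
            while (n < 32) {
                    int pkey = syscall(__NR_pkey_alloc, 0, PKEY_DISABLE_WRITE);
                    if (pkey < 0)
                            break;
                    keys[n++] = pkey;
            }
            printf("allocated %d protection keys\n", n);

            while (n > 0)
                    syscall(__NR_pkey_free, keys[--n]);
            return 0;
    }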

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |2 
 b/arch/x86/entry/syscalls/syscall_64.tbl |2 
 b/arch/x86/include/asm/mmu.h |7 ++
 b/arch/x86/include/asm/mmu_context.h |8 +++
 b/arch/x86/include/asm/pgtable.h |5 +-
 b/arch/x86/include/asm/pkeys.h   |   55 ++
 b/arch/x86/kernel/fpu/xstate.c   |   75 +++
 b/include/linux/pkeys.h  |   23 +
 b/include/uapi/asm-generic/mman-common.h |5 ++
 b/mm/mprotect.c  |   59 +++-
 10 files changed, 238 insertions(+), 3 deletions(-)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkey-allocation-syscalls 
arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkey-allocation-syscalls   
2015-12-03 16:21:32.484982342 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl2015-12-03 16:21:32.502983159 
-0800
@@ -384,3 +384,5 @@
 375  i386  membarrier      sys_membarrier
 376  i386  mlock2          sys_mlock2
 377  i386  pkey_mprotect   sys_pkey_mprotect
+378  i386  pkey_alloc      sys_pkey_alloc
+379  i386  pkey_free       sys_pkey_free
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkey-allocation-syscalls 
arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkey-allocation-syscalls   
2015-12-03 16:21:32.485982388 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl2015-12-03 16:21:32.502983159 
-0800
@@ -333,6 +333,8 @@
 324  common  membarrier      sys_membarrier
 325  common  mlock2          sys_mlock2
 326  common  pkey_mprotect   sys_pkey_mprotect
+327  common  pkey_alloc      sys_pkey_alloc
+328  common  pkey_free       sys_pkey_free
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/asm/mmu_context.h~pkey-allocation-syscalls 
arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkey-allocation-syscalls   
2015-12-03 16:21:32.487982478 -0800
+++ b/arch/x86/include/asm/mmu_context.h2015-12-03 16:21:32.503983204 
-0800
@@ -108,7 +108,12 @@ static inline void enter_lazy_tlb(struct
 static inline int init_new_context(struct task_struct *tsk,
   struct mm_struct *mm)
 {
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+   /* pkey 0 is the default and always allocated */
+   mm->context.pkey_allocation_map = 0x1;
+#endif
init_new_context_ldt(tsk, mm);
+
return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
@@ -333,4 +338,7 @@ static inline bool arch_pte_access_permi
return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
 }
 
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+   unsigned long init_val);
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff -puN arch/x86/include/asm/mmu.h~pkey-allocation-syscalls 
arch/x86/include/asm/mmu.h
--- a/arch/x86/include/asm/mmu.h~pkey-allocation-syscalls   2015-12-03 
16:21:32.489982569 -0800
+++ b/arch/x86/include/asm/mmu.h2015-12-03 16:21:32.503983204 -0800
@@ -22,6 +22,13 @@ typedef struct {
void __user *vdso;
 
atomic_t perf_rdpmc_allowed;/* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+   /*
+* One bit per protection key says whether userspace can
+* use it or not.  protected by mmap_sem.
+*/
+   u16 pkey_allocation_map;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff -puN arch/x86/include/asm/pgtable.h~pkey-allocation-syscalls 
arch/x86/include/asm/pgtable.h
--- a/arch/x86/include/asm/pgtable.h~pkey-allocation-syscalls   2015-12-03 
16:21:32.490982614 -0800
+++ b/arch/x86/include/asm/pgtable.h2015-12-03 16:21:32.503983204 -0800
@@ -912,16 +912,17 @@ static inline pte_t pte_s

[PATCH 32/34] x86, pkeys: add pkey set/get syscalls

2015-12-03 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This establishes two more system calls for protection key management:

unsigned long pkey_get(int pkey);
int pkey_set(int pkey, unsigned long access_rights);

The return value from pkey_get() and the 'access_rights' passed
to pkey_set() are the same format: a bitmask containing
PKEY_DENY_WRITE and/or PKEY_DENY_ACCESS, or nothing set at all.

These replace userspace's direct use of rdpkru/wrpkru.

With current hardware, the kernel can not enforce that it has
control over a given key.  But, this at least allows the kernel
to indicate to userspace that userspace does not control a given
protection key.

The kernel does _not_ enforce that this interface must be used for
changes to PKRU, even for keys it has not "allocated".
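
For illustration only, a standalone sketch of the proposed get/set interface;
the x86_64 numbers are from this patch's syscall tables, the access-rights
constants are assumed values, and (per the v7 cover letter earlier in this
archive) these syscalls were dropped from later postings of the series:

    #include <stdio.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    #define __NR_pkey_alloc 327
    #define __NR_pkey_free  328
    #define __NR_pkey_get   329
    #define __NR_pkey_set   330
    #define PKEY_DENY_WRITE 0x2         /* assumed value */

    int main(void)
    {
            int pkey = syscall(__NR_pkey_alloc, 0, 0);
            if (pkey < 0) {
                    perror("pkey_alloc");
                    return 1;
            }
            /* Deny writes through this key, then read the rights back. */
            if (syscall(__NR_pkey_set, pkey, PKEY_DENY_WRITE) < 0)
                    perror("pkey_set");
            long rights = syscall(__NR_pkey_get, pkey);
            printf("pkey %d rights bitmask: 0x%lx\n", pkey, rights);

            syscall(__NR_pkey_free, pkey);
            return 0;
    }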

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |2 +
 b/arch/x86/entry/syscalls/syscall_64.tbl |2 +
 b/arch/x86/include/asm/mmu_context.h |2 +
 b/arch/x86/include/asm/pkeys.h   |2 -
 b/arch/x86/kernel/fpu/xstate.c   |   55 +--
 b/include/linux/pkeys.h  |8 
 b/mm/mprotect.c  |   34 +++
 7 files changed, 102 insertions(+), 3 deletions(-)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkey-syscalls-set-get 
arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkey-syscalls-set-get  
2015-12-03 16:21:33.139012003 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl2015-12-03 16:21:33.151012548 
-0800
@@ -386,3 +386,5 @@
 377  i386  pkey_mprotect   sys_pkey_mprotect
 378  i386  pkey_alloc      sys_pkey_alloc
 379  i386  pkey_free       sys_pkey_free
+380  i386  pkey_get        sys_pkey_get
+381  i386  pkey_set        sys_pkey_set
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkey-syscalls-set-get 
arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkey-syscalls-set-get  
2015-12-03 16:21:33.141012094 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl2015-12-03 16:21:33.152012593 
-0800
@@ -335,6 +335,8 @@
 326  common  pkey_mprotect   sys_pkey_mprotect
 327  common  pkey_alloc      sys_pkey_alloc
 328  common  pkey_free       sys_pkey_free
+329  common  pkey_get        sys_pkey_get
+330  common  pkey_set        sys_pkey_set
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/asm/mmu_context.h~pkey-syscalls-set-get 
arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkey-syscalls-set-get  2015-12-03 
16:21:33.142012139 -0800
+++ b/arch/x86/include/asm/mmu_context.h2015-12-03 16:21:33.152012593 
-0800
@@ -340,5 +340,7 @@ static inline bool arch_pte_access_permi
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
+extern unsigned long arch_get_user_pkey_access(struct task_struct *tsk,
+   int pkey);
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff -puN arch/x86/include/asm/pkeys.h~pkey-syscalls-set-get 
arch/x86/include/asm/pkeys.h
--- a/arch/x86/include/asm/pkeys.h~pkey-syscalls-set-get2015-12-03 
16:21:33.144012230 -0800
+++ b/arch/x86/include/asm/pkeys.h  2015-12-03 16:21:33.152012593 -0800
@@ -16,7 +16,7 @@
 } while (0)
 
 static inline
-bool mm_pkey_is_allocated(struct mm_struct *mm, unsigned long pkey)
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
if (!arch_validate_pkey(pkey))
return true;
diff -puN arch/x86/kernel/fpu/xstate.c~pkey-syscalls-set-get 
arch/x86/kernel/fpu/xstate.c
--- a/arch/x86/kernel/fpu/xstate.c~pkey-syscalls-set-get2015-12-03 
16:21:33.145012275 -0800
+++ b/arch/x86/kernel/fpu/xstate.c  2015-12-03 16:21:33.153012638 -0800
@@ -687,7 +687,7 @@ void fpu__resume_cpu(void)
  *
  * Note: does not work for compacted buffers.
  */
-void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+static void *__raw_xsave_addr(struct xregs_state *xsave, int 
xstate_feature_mask)
 {
int feature_nr = fls64(xstate_feature_mask) - 1;
 
@@ -862,6 +862,7 @@ out:
 
 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+#define PKRU_INIT_STATE 0
 
 /*
  * This will go out and modify the XSAVE buffer so that PKRU is
@@ -880,6 +881,9 @@ int arch_set_user_pkey_access(struct tas
int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
u32 new_pkru_bits = 0;
 
+   /* Only support manipulating current task for now */
+   if (tsk != current)
+   return -EINVAL;
if (!arch_validate_pkey(pkey))
return -EINVAL;
/*
@@ -907,7 +911,7 @@ int arch_set_user_

[PATCH 34/37] x86, pkeys: allocation/free syscalls

2015-11-16 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This patch adds two new system calls:

int pkey_alloc(unsigned long flags, unsigned long init_access_rights)
int pkey_free(int pkey);

These establish which protection keys are valid for use by
userspace.  A key which was not obtained by pkey_alloc() may not
be passed to pkey_mprotect().

In addition, the 'init_access_rights' argument to pkey_alloc() specifies
the rights that will be established for the returned pkey.  For instance

pkey = pkey_alloc(flags, PKEY_DENY_WRITE);

will return with the bits set in PKRU such that writing to 'pkey' is
already denied.  This keeps userspace from needing to have knowledge
about manipulating PKRU.  It is still free to do so if it wishes, but
it is no longer required.

The kernel does _not_ enforce that this interface must be used for
changes to PKRU, even for keys it does not control.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |2 
 b/arch/x86/entry/syscalls/syscall_64.tbl |2 
 b/arch/x86/include/asm/mmu.h |7 ++
 b/arch/x86/include/asm/mmu_context.h |8 +++
 b/arch/x86/include/asm/pgtable.h |5 +-
 b/arch/x86/include/asm/pkeys.h   |   55 ++
 b/arch/x86/kernel/fpu/xstate.c   |   75 +++
 b/include/asm-generic/mm_hooks.h |1 
 b/include/linux/pkeys.h  |   23 +
 b/include/uapi/asm-generic/mman-common.h |5 ++
 b/mm/mprotect.c  |   55 ++
 11 files changed, 236 insertions(+), 2 deletions(-)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkey-allocation-syscalls 
arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkey-allocation-syscalls   
2015-11-16 12:36:32.972787782 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl2015-11-16 19:21:52.519973697 
-0800
@@ -384,3 +384,5 @@
 375  i386  membarrier      sys_membarrier
 376  i386  mlock2          sys_mlock2
 377  i386  pkey_mprotect   sys_pkey_mprotect
+378  i386  pkey_alloc      sys_pkey_alloc
+379  i386  pkey_free       sys_pkey_free
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkey-allocation-syscalls 
arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkey-allocation-syscalls   
2015-11-16 12:36:32.973787828 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl2015-11-16 19:21:52.520973742 
-0800
@@ -333,6 +333,8 @@
 324  common  membarrier      sys_membarrier
 325  common  mlock2          sys_mlock2
 326  common  pkey_mprotect   sys_pkey_mprotect
+327  common  pkey_alloc      sys_pkey_alloc
+328  common  pkey_free       sys_pkey_free
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/asm/mmu_context.h~pkey-allocation-syscalls 
arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkey-allocation-syscalls   
2015-11-16 12:36:32.975787919 -0800
+++ b/arch/x86/include/asm/mmu_context.h2015-11-16 19:21:52.520973742 
-0800
@@ -108,7 +108,12 @@ static inline void enter_lazy_tlb(struct
 static inline int init_new_context(struct task_struct *tsk,
   struct mm_struct *mm)
 {
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+   /* pkey 0 is the default and always allocated */
+   mm->context.pkey_allocation_map = 0x1;
+#endif
init_new_context_ldt(tsk, mm);
+
return 0;
 }
 static inline void destroy_context(struct mm_struct *mm)
@@ -333,4 +338,7 @@ static inline bool arch_pte_access_permi
return __pkru_allows_pkey(pte_flags_pkey(pte_flags(pte)), write);
 }
 
+extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
+   unsigned long init_val);
+
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff -puN arch/x86/include/asm/mmu.h~pkey-allocation-syscalls 
arch/x86/include/asm/mmu.h
--- a/arch/x86/include/asm/mmu.h~pkey-allocation-syscalls   2015-11-16 
12:36:32.976787964 -0800
+++ b/arch/x86/include/asm/mmu.h2015-11-16 12:36:32.992788690 -0800
@@ -22,6 +22,13 @@ typedef struct {
void __user *vdso;
 
atomic_t perf_rdpmc_allowed;/* nonzero if rdpmc is allowed */
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+   /*
+* One bit per protection key says whether userspace can
+* use it or not.  protected by mmap_sem.
+*/
+   u16 pkey_allocation_map;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff -puN arch/x86/include/asm/pgtable.h~pkey-allocation-syscalls 
arch/x86/include/asm/pgtable.h
--- a/arch/x86/include/asm/pgtable.h~pkey-allocation-syscalls   2015-11-16 
12:36:32.978788055 -0800
+++ b/arch/x86/include/asm/pgtable.h2015-11-16 12:36:32.992788690 -0800
@@ 

[PATCH 29/37] mm: implement new mprotect_key() system call

2015-11-16 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

mprotect_key() is just like mprotect, except it also takes a
protection key as an argument.  On systems that do not support
protection keys, it still works, but requires that key=0.
Otherwise it does exactly what mprotect does.

I expect it to get used like this, if you want to guarantee that
any mapping you create can *never* be accessed without the right
protection keys set up.

pkey_deny_access(11); // random pkey
int real_prot = PROT_READ|PROT_WRITE;
ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
0);
ret = mprotect_key(ptr, PAGE_SIZE, real_prot, 11);

This way, there is *no* window where the mapping is accessible
since it was always either PROT_NONE or had a protection key set.

We settled on 'unsigned long' for the type of the key here.  We
only need 4 bits on x86 today, but I figured that other
architectures might need some more space.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/include/asm/mmu_context.h |   10 +++--
 b/include/linux/pkeys.h  |7 +-
 b/mm/Kconfig |7 ++
 b/mm/mprotect.c  |   36 +--
 4 files changed, 51 insertions(+), 9 deletions(-)

diff -puN arch/x86/include/asm/mmu_context.h~pkeys-85-mprotect_pkey 
arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkeys-85-mprotect_pkey 2015-11-16 
12:36:28.811598913 -0800
+++ b/arch/x86/include/asm/mmu_context.h2015-11-16 19:22:01.920396860 
-0800
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -243,10 +244,14 @@ static inline void arch_unmap(struct mm_
mpx_notify_unmap(mm, vma, start, end);
 }
 
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * If the config option is off, we get the generic version from
+ * include/linux/pkeys.h.
+ */
 static inline int vma_pkey(struct vm_area_struct *vma)
 {
u16 pkey = 0;
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 |
  VM_PKEY_BIT2 | VM_PKEY_BIT3;
/*
@@ -259,9 +264,10 @@ static inline int vma_pkey(struct vm_are
 */
pkey = (vma->vm_flags >> vm_pkey_shift) &
   (vma_pkey_mask >> vm_pkey_shift);
-#endif
+
return pkey;
 }
+#endif
 
 static inline bool __pkru_allows_pkey(u16 pkey, bool write)
 {
diff -puN include/linux/pkeys.h~pkeys-85-mprotect_pkey include/linux/pkeys.h
--- a/include/linux/pkeys.h~pkeys-85-mprotect_pkey  2015-11-16 
12:36:28.812598958 -0800
+++ b/include/linux/pkeys.h 2015-11-16 19:22:05.259547173 -0800
@@ -2,10 +2,10 @@
 #define _LINUX_PKEYS_H
 
 #include 
-#include 
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
 #include 
+#include 
 #else /* ! CONFIG_ARCH_HAS_PKEYS */
 
 /*
@@ -17,6 +17,11 @@ static inline bool arch_validate_pkey(in
 {
return true;
 }
+
+static inline int vma_pkey(struct vm_area_struct *vma)
+{
+   return 0;
+}
 #endif /* ! CONFIG_ARCH_HAS_PKEYS */
 
 #endif /* _LINUX_PKEYS_H */
diff -puN mm/Kconfig~pkeys-85-mprotect_pkey mm/Kconfig
--- a/mm/Kconfig~pkeys-85-mprotect_pkey 2015-11-16 12:36:28.814599049 -0800
+++ b/mm/Kconfig2015-11-16 19:22:03.862484284 -0800
@@ -673,3 +673,10 @@ config ARCH_USES_HIGH_VMA_FLAGS
bool
 config ARCH_HAS_PKEYS
bool
+
+config NR_PROTECTION_KEYS
+   int
+   # Everything supports a _single_ key, so allow folks to
+   # at least call APIs that take keys, but require that the
+   # key be 0.
+   default 1
diff -puN mm/mprotect.c~pkeys-85-mprotect_pkey mm/mprotect.c
--- a/mm/mprotect.c~pkeys-85-mprotect_pkey  2015-11-16 12:36:28.816599140 
-0800
+++ b/mm/mprotect.c 2015-11-16 19:22:05.259547173 -0800
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -344,10 +345,13 @@ fail:
return error;
 }
 
-SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
-   unsigned long, prot)
+/*
+ * pkey=-1 when doing a legacy mprotect()
+ */
+static int do_mprotect_pkey(unsigned long start, size_t len,
+   unsigned long prot, int pkey)
 {
-   unsigned long vm_flags, nstart, end, tmp, reqprot;
+   unsigned long nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
int error = -EINVAL;
const int grows = prot & (PROT_GROWSDOWN|PROT_GROWSUP);
@@ -373,8 +377,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
 
-   vm_flags = calc_vm_prot_bits(prot, 0);
-
	down_write(&current->mm->mmap_sem);
 
vma = find_vma(current->mm, start);
@@ -407,7 +409,14 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
 
   

[PATCH 35/37] x86, pkeys: add pkey set/get syscalls

2015-11-16 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This establishes two more system calls for protection key management:

unsigned long pkey_get(int pkey);
int pkey_set(int pkey, unsigned long access_rights);

The return value from pkey_get() and the 'access_rights' passed
to pkey_set() are the same format: a bitmask containing
PKEY_DENY_WRITE and/or PKEY_DENY_ACCESS, or nothing set at all.

These replace userspace's direct use of rdpkru/wrpkru.

With current hardware, the kernel can not enforce that it has
control over a given key.  But, this at least allows the kernel
to indicate to userspace that userspace does not control a given
protection key.

The kernel does _not_ enforce that this interface must be used for
changes to PKRU, even for keys it has not "allocated".
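As a rough usage sketch (not part of this patch), assuming __NR_pkey_get and
__NR_pkey_set are generated from the syscall tables above and that
PKEY_DENY_WRITE ends up being 0x2 in the uapi header:

/* Sketch only: thin syscall wrappers plus a trivial usage example. */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#define PKEY_DENY_WRITE	0x2	/* assumed value */

static unsigned long pkey_get(int pkey)
{
	return syscall(__NR_pkey_get, pkey);
}

static int pkey_set(int pkey, unsigned long access_rights)
{
	return syscall(__NR_pkey_set, pkey, access_rights);
}

int main(void)
{
	int pkey = 11;	/* arbitrary, matching the earlier example */

	/* make writes through this key fault, leave reads alone */
	if (pkey_set(pkey, PKEY_DENY_WRITE) < 0)
		perror("pkey_set");

	/* expect PKEY_DENY_WRITE back */
	printf("access rights for key %d: %#lx\n", pkey, pkey_get(pkey));
	return 0;
}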

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |2 +
 b/arch/x86/entry/syscalls/syscall_64.tbl |2 +
 b/arch/x86/include/asm/mmu_context.h |2 +
 b/arch/x86/include/asm/pkeys.h   |2 -
 b/arch/x86/kernel/fpu/xstate.c   |   55 +--
 b/include/linux/pkeys.h  |8 
 b/mm/mprotect.c  |   34 +++
 7 files changed, 102 insertions(+), 3 deletions(-)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkey-syscalls-set-get 
arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkey-syscalls-set-get  
2015-11-16 12:36:34.851873071 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl2015-11-16 12:36:34.863873616 
-0800
@@ -386,3 +386,5 @@
 377	i386	pkey_mprotect		sys_pkey_mprotect
 378	i386	pkey_alloc		sys_pkey_alloc
 379	i386	pkey_free		sys_pkey_free
+380	i386	pkey_get		sys_pkey_get
+381	i386	pkey_set		sys_pkey_set
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkey-syscalls-set-get 
arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkey-syscalls-set-get  
2015-11-16 12:36:34.852873116 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl2015-11-16 12:36:34.864873661 
-0800
@@ -335,6 +335,8 @@
 326	common	pkey_mprotect		sys_pkey_mprotect
 327	common	pkey_alloc		sys_pkey_alloc
 328	common	pkey_free		sys_pkey_free
+329	common	pkey_get		sys_pkey_get
+330	common	pkey_set		sys_pkey_set
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/asm/mmu_context.h~pkey-syscalls-set-get 
arch/x86/include/asm/mmu_context.h
--- a/arch/x86/include/asm/mmu_context.h~pkey-syscalls-set-get  2015-11-16 
12:36:34.854873207 -0800
+++ b/arch/x86/include/asm/mmu_context.h2015-11-16 12:36:34.864873661 
-0800
@@ -340,5 +340,7 @@ static inline bool arch_pte_access_permi
 
 extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
unsigned long init_val);
+extern unsigned long arch_get_user_pkey_access(struct task_struct *tsk,
+   int pkey);
 
 #endif /* _ASM_X86_MMU_CONTEXT_H */
diff -puN arch/x86/include/asm/pkeys.h~pkey-syscalls-set-get 
arch/x86/include/asm/pkeys.h
--- a/arch/x86/include/asm/pkeys.h~pkey-syscalls-set-get2015-11-16 
12:36:34.855873253 -0800
+++ b/arch/x86/include/asm/pkeys.h  2015-11-16 19:14:09.231117816 -0800
@@ -16,7 +16,7 @@
 } while (0)
 
 static inline
-bool mm_pkey_is_allocated(struct mm_struct *mm, unsigned long pkey)
+bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey)
 {
if (!arch_validate_pkey(pkey))
return true;
diff -puN arch/x86/kernel/fpu/xstate.c~pkey-syscalls-set-get 
arch/x86/kernel/fpu/xstate.c
--- a/arch/x86/kernel/fpu/xstate.c~pkey-syscalls-set-get2015-11-16 
12:36:34.857873343 -0800
+++ b/arch/x86/kernel/fpu/xstate.c  2015-11-16 12:36:34.865873706 -0800
@@ -687,7 +687,7 @@ void fpu__resume_cpu(void)
  *
  * Note: does not work for compacted buffers.
  */
-void *__raw_xsave_addr(struct xregs_state *xsave, int xstate_feature_mask)
+static void *__raw_xsave_addr(struct xregs_state *xsave, int 
xstate_feature_mask)
 {
int feature_nr = fls64(xstate_feature_mask) - 1;
 
@@ -871,6 +871,7 @@ static void fpu__xfeature_set_state(int
 
 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
+#define PKRU_INIT_STATE	0
 
 /*
  * This will go out and modify the XSAVE buffer so that PKRU is
@@ -889,6 +890,9 @@ int arch_set_user_pkey_access(struct tas
int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
u32 new_pkru_bits = 0;
 
+   /* Only support manipulating current task for now */
+   if (tsk != current)
+   return -EINVAL;
if (!arch_validate_pkey(pkey))
return -EINVAL;
/*
@

[PATCH 31/37] x86: wire up mprotect_key() system call

2015-11-16 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This is all that we need to get the new system call itself
working on x86.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/arch/x86/entry/syscalls/syscall_32.tbl |1 +
 b/arch/x86/entry/syscalls/syscall_64.tbl |1 +
 b/arch/x86/include/uapi/asm/mman.h   |6 ++
 b/mm/Kconfig |1 +
 4 files changed, 9 insertions(+)

diff -puN arch/x86/entry/syscalls/syscall_32.tbl~pkeys-16-x86-mprotect_key 
arch/x86/entry/syscalls/syscall_32.tbl
--- a/arch/x86/entry/syscalls/syscall_32.tbl~pkeys-16-x86-mprotect_key  
2015-11-16 12:36:30.650682386 -0800
+++ b/arch/x86/entry/syscalls/syscall_32.tbl2015-11-16 19:21:56.328145123 
-0800
@@ -383,3 +383,4 @@
 374	i386	userfaultfd		sys_userfaultfd
 375	i386	membarrier		sys_membarrier
 376	i386	mlock2			sys_mlock2
+377	i386	pkey_mprotect		sys_pkey_mprotect
diff -puN arch/x86/entry/syscalls/syscall_64.tbl~pkeys-16-x86-mprotect_key 
arch/x86/entry/syscalls/syscall_64.tbl
--- a/arch/x86/entry/syscalls/syscall_64.tbl~pkeys-16-x86-mprotect_key  
2015-11-16 12:36:30.652682477 -0800
+++ b/arch/x86/entry/syscalls/syscall_64.tbl2015-11-16 19:21:56.328145123 
-0800
@@ -332,6 +332,7 @@
 323	common	userfaultfd		sys_userfaultfd
 324	common	membarrier		sys_membarrier
 325	common	mlock2			sys_mlock2
+326	common	pkey_mprotect		sys_pkey_mprotect
 
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
diff -puN arch/x86/include/uapi/asm/mman.h~pkeys-16-x86-mprotect_key 
arch/x86/include/uapi/asm/mman.h
--- a/arch/x86/include/uapi/asm/mman.h~pkeys-16-x86-mprotect_key
2015-11-16 12:36:30.653682522 -0800
+++ b/arch/x86/include/uapi/asm/mman.h  2015-11-16 12:36:30.659682794 -0800
@@ -20,6 +20,12 @@
((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
+   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
+   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
+   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
 #endif
 
 #include 
diff -puN mm/Kconfig~pkeys-16-x86-mprotect_key mm/Kconfig
--- a/mm/Kconfig~pkeys-16-x86-mprotect_key  2015-11-16 12:36:30.655682613 
-0800
+++ b/mm/Kconfig2015-11-16 12:36:30.659682794 -0800
@@ -679,4 +679,5 @@ config NR_PROTECTION_KEYS
# Everything supports a _single_ key, so allow folks to
# at least call APIs that take keys, but require that the
# key be 0.
+   default 16 if X86_INTEL_MEMORY_PROTECTION_KEYS
default 1
_


[PATCH 27/37] mm, multi-arch: pass a protection key in to calc_vm_flag_bits()

2015-11-16 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This plumbs a protection key through calc_vm_flag_bits().  We
could have done this in calc_vm_prot_bits(), but I did not feel
super strongly which way to go.  It was pretty arbitrary which
one to use.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
Cc: linux-a...@vger.kernel.org
---

 b/arch/powerpc/include/asm/mman.h  |5 +++--
 b/drivers/char/agp/frontend.c  |2 +-
 b/drivers/staging/android/ashmem.c |4 ++--
 b/include/linux/mman.h |6 +++---
 b/mm/mmap.c|2 +-
 b/mm/mprotect.c|2 +-
 b/mm/nommu.c   |2 +-
 7 files changed, 12 insertions(+), 11 deletions(-)

diff -puN arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits 
arch/powerpc/include/asm/mman.h
--- a/arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits
2015-11-16 12:36:26.421490430 -0800
+++ b/arch/powerpc/include/asm/mman.h   2015-11-16 12:36:26.434491019 -0800
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+   unsigned long pkey)
 {
return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff -puN drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits 
drivers/char/agp/frontend.c
--- a/drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits2015-11-16 
12:36:26.423490520 -0800
+++ b/drivers/char/agp/frontend.c   2015-11-16 12:36:26.435491065 -0800
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(i
 {
unsigned long prot_bits;
 
-   prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
+   prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
return vm_get_page_prot(prot_bits);
 }
 
diff -puN drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits 
drivers/staging/android/ashmem.c
--- a/drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits   
2015-11-16 12:36:26.424490566 -0800
+++ b/drivers/staging/android/ashmem.c  2015-11-16 12:36:26.435491065 -0800
@@ -372,8 +372,8 @@ static int ashmem_mmap(struct file *file
}
 
/* requested protection bits must match our allowed protection mask */
-   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-calc_vm_prot_bits(PROT_MASK))) {
+   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+calc_vm_prot_bits(PROT_MASK, 0))) {
ret = -EPERM;
goto out;
}
diff -puN include/linux/mman.h~pkeys-84-calc_vm_prot_bits include/linux/mman.h
--- a/include/linux/mman.h~pkeys-84-calc_vm_prot_bits   2015-11-16 
12:36:26.426490656 -0800
+++ b/include/linux/mman.h  2015-11-16 12:36:26.436491110 -0800
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(uns
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
   _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
   _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-  arch_calc_vm_prot_bits(prot);
+  arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*
diff -puN mm/mmap.c~pkeys-84-calc_vm_prot_bits mm/mmap.c
--- a/mm/mmap.c~pkeys-84-calc_vm_prot_bits  2015-11-16 12:36:26.428490747 
-0800
+++ b/mm/mmap.c 2015-11-16 12:36:26.437491156 -0800
@@ -1309,7 +1309,7 @@ unsigned long do_mmap(struct file *file,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 */
-   vm_flags |= calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) |
+   vm_flags |= calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
if (flags & MAP_LOCKED)
diff -puN mm/mprotect.c~pkeys-84-calc_vm_prot_bits mm/mprotect.c
--- a/mm/mprotect.c~pkeys-84-calc_vm_prot_bits  2015-11-16 12:36:26.429490793 
-0800
+++ b/mm/mprotect.c 2015-11-16 19:22:07.781660707 -0800
@@ -373,7 +373,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long,

[PATCH 00/37] x86: Memory Protection Keys

2015-11-16 Thread Dave Hansen
Memory Protection Keys for User pages is a CPU feature which will
first appear on Skylake Servers, but will also be supported on
future non-server parts.  It provides a mechanism for enforcing
page-based protections, but without requiring modification of the
page tables when an application changes protection domains.  See
the Documentation/ patch for more details.

Changes from RFCv3:

 * Added 'current' and 'foreign' variants of get_user_pages() to
   help indicate whether protection keys should be enforced.
   Thanks to Jerome Glisse for pointing out this issue.
 * Added "allocation" and set/get system calls so that we can do
   management of protection keys in the kernel.  This opens the
   door to use of specific protection keys for kernel use in the
   future, such as for execute-only memory.
 * Removed the kselftest code for the moment.  It will be
   submitted separately.

Changes from RFCv2 (Thanks Ingo and Thomas for most of these):

 * few minor compile warnings
 * changed 'nopku' interaction with cpuid bits.  Now, we do not
   clear the PKU cpuid bit, we just skip enabling it.
 * changed __pkru_allows_write() to also check access disable bit
 * removed the unused write_pkru()
 * made si_pkey a u64 and added some patch description details.
   Also made it share space in siginfo with MPX and clarified
   comments.
 * give some real text for the Processor Trace xsave state
 * made vma_pkey() less ugly (and much more optimized actually)
 * added SEGV_PKUERR to copy_siginfo_to_user()
 * remove page table walk when filling in si_pkey, added some
   big fat comments about it being inherently racy.
 * added self test code

This code is not runnable by anyone outside of Intel unless they
have some special hardware or a fancy simulator.  If you are
interested in running this for real, please get in touch with me.
Hardware is available to a very small but nonzero number of
people.

This set is also available here (with the new syscall):

git://git.kernel.org/pub/scm/linux/kernel/git/daveh/x86-pkeys.git 
pkeys-v010

=== diffstat ===

 Documentation/kernel-parameters.txt |   3 +
 Documentation/x86/protection-keys.txt   |  53 +
 arch/mips/mm/gup.c  |   3 +-
 arch/powerpc/include/asm/mman.h |   5 +-
 arch/powerpc/include/asm/mmu_context.h  |  12 +
 arch/s390/include/asm/mmu_context.h |  12 +
 arch/s390/mm/gup.c  |   3 +-
 arch/sh/mm/gup.c|   2 +-
 arch/sparc/mm/gup.c |   2 +-
 arch/unicore32/include/asm/mmu_context.h|  12 +
 arch/x86/Kconfig|  16 ++
 arch/x86/entry/syscalls/syscall_32.tbl  |   5 +
 arch/x86/entry/syscalls/syscall_64.tbl  |   5 +
 arch/x86/include/asm/cpufeature.h   |  54 +++--
 arch/x86/include/asm/disabled-features.h|  12 +
 arch/x86/include/asm/fpu/types.h|  12 +
 arch/x86/include/asm/fpu/xstate.h   |   4 +-
 arch/x86/include/asm/mmu.h  |   7 +
 arch/x86/include/asm/mmu_context.h  | 110 -
 arch/x86/include/asm/pgtable.h  |  38 +++
 arch/x86/include/asm/pgtable_types.h|  34 ++-
 arch/x86/include/asm/pkeys.h|  68 ++
 arch/x86/include/asm/required-features.h|   4 +
 arch/x86/include/asm/special_insns.h|  20 ++
 arch/x86/include/uapi/asm/mman.h|  22 ++
 arch/x86/include/uapi/asm/processor-flags.h |   2 +
 arch/x86/kernel/cpu/common.c|  42 
 arch/x86/kernel/fpu/xstate.c| 250 +++-
 arch/x86/kernel/ldt.c   |   4 +-
 arch/x86/kernel/process_64.c|   2 +
 arch/x86/kernel/setup.c |   9 +
 arch/x86/mm/fault.c | 153 ++--
 arch/x86/mm/gup.c   |  52 ++--
 arch/x86/mm/mpx.c   |   4 +-
 drivers/char/agp/frontend.c |   2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |   4 +-
 drivers/gpu/drm/i915/i915_gem_userptr.c |   2 +-
 drivers/gpu/drm/radeon/radeon_ttm.c |   4 +-
 drivers/gpu/drm/via/via_dmablit.c   |   3 +-
 drivers/infiniband/core/umem.c  |   2 +-
 drivers/infiniband/core/umem_odp.c  |   8 +-
 drivers/infiniband/hw/mthca/mthca_memfree.c |   3 +-
 drivers/infiniband/hw/qib/qib_user_pages.c  |   3 +-
 drivers/infiniband/hw/usnic/usnic_uiom.c|   2 +-
 drivers/iommu/amd_iommu_v2.c|   8 +-
 drivers/media/pci/ivtv/ivtv-udma.c  |   4 +-
 drivers/media/pci/ivtv/ivtv-yuv.c   |  10 +-
 drivers/media/v4l2-core/videobuf-dma-sg.c   |   3 +-
 drivers/misc/sgi-gru/grufault.c |   3 +-
 drivers/scsi/st.c   |   2 -
 drivers/staging/android/ashmem.c|   4 +-
 drivers/video/fbdev/pvr2fb.c|   4 +-
 drivers/virt/fsl_hypervisor.c 

Re: [PATCH 21/25] mm: implement new mprotect_key() system call

2015-09-29 Thread Dave Hansen
On 09/28/2015 11:39 PM, Michael Ellerman wrote:
> On Mon, 2015-09-28 at 12:18 -0700, Dave Hansen wrote:
>> From: Dave Hansen <dave.han...@linux.intel.com>
>>
>> mprotect_key() is just like mprotect, except it also takes a
>> protection key as an argument.  On systems that do not support
>> protection keys, it still works, but requires that key=0.
> 
> I'm not sure how userspace is going to use the key=0 feature? ie. userspace
> will still have to detect that keys are not supported and use key 0 
> everywhere.
> At that point it could just as well skip the mprotect_key() syscalls entirely
> couldn't it?

Yep.

Or, a new architecture could just skip mprotect() itself entirely and
only wire up mprotect_pkey().  I don't see this pkey=0 thing as an
important feature or anything.  I just wanted to call out the behavior.

>> I expect it to get used like this, if you want to guarantee that
>> any mapping you create can *never* be accessed without the right
>> protection keys set up.
>>
>>  pkey_deny_access(11); // random pkey
>>  int real_prot = PROT_READ|PROT_WRITE;
>>  ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
>> 0);
>>  ret = mprotect_key(ptr, PAGE_SIZE, real_prot, 11);
>>
>> This way, there is *no* window where the mapping is accessible
>> since it was always either PROT_NONE or had a protection key set.
>>
>> We settled on 'unsigned long' for the type of the key here.  We
>> only need 4 bits on x86 today, but I figured that other
>> architectures might need some more space.
> 
> If the existing mprotect() syscall had a flags argument you could have just
> used that. So is it worth just adding mprotect2() now and using it for this? 
> ie:
> 
> int mprotect2(unsigned long start, size_t len, unsigned long prot, unsigned 
> long flags) ..
> 
> And then you define bit zero of flags to say you're passing a pkey, and it's 
> in
> bits 1-63?
> 
> That way if other arches need to do something different you at least have the
> flags available?

But what problem does that solve?

mprotect() itself has plenty of space in prot.  Do any of the other
architectures need to pass in more than just an integer key to implement
storage/protection keys?

I'd much rather have a set of (relatively) arch-specific system calls
implementing protection keys rather than a single one with one
arch-specific argument.
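For reference, the two shapes being compared are roughly the following
(mprotect2() is hypothetical, reconstructed from Michael's description; only
mprotect_key() was actually posted):

#include <stddef.h>

/* as posted in this series: the pkey is a first-class argument */
int mprotect_key(unsigned long start, size_t len,
		 unsigned long prot, unsigned long key);

/* Michael's suggestion (never implemented): a generic flags word,
 * bit 0 saying "a pkey is present", the key itself in bits 1-63 */
int mprotect2(unsigned long start, size_t len,
	      unsigned long prot, unsigned long flags);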


[PATCH 21/25] mm: implement new mprotect_key() system call

2015-09-28 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

mprotect_key() is just like mprotect, except it also takes a
protection key as an argument.  On systems that do not support
protection keys, it still works, but requires that key=0.
Otherwise it does exactly what mprotect does.

I expect it to get used like this, if you want to guarantee that
any mapping you create can *never* be accessed without the right
protection keys set up.

pkey_deny_access(11); // random pkey
int real_prot = PROT_READ|PROT_WRITE;
ptr = mmap(NULL, PAGE_SIZE, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 
0);
ret = mprotect_key(ptr, PAGE_SIZE, real_prot, 11);

This way, there is *no* window where the mapping is accessible
since it was always either PROT_NONE or had a protection key set.

We settled on 'unsigned long' for the type of the key here.  We
only need 4 bits on x86 today, but I figured that other
architectures might need some more space.
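Until a libc wrapper exists, calling it from userspace would go through
syscall(2).  A sketch only, assuming __NR_mprotect_key is generated from the
syscall table once the call is wired up:

/* Sketch -- not part of this patch. */
#include <unistd.h>
#include <sys/syscall.h>

static int mprotect_key(void *start, size_t len,
			unsigned long prot, unsigned long key)
{
	return syscall(__NR_mprotect_key, start, len, prot, key);
}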

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
---

 b/mm/Kconfig|7 +++
 b/mm/mprotect.c |   20 +---
 2 files changed, 24 insertions(+), 3 deletions(-)

diff -puN mm/Kconfig~pkeys-85-mprotect_pkey mm/Kconfig
--- a/mm/Kconfig~pkeys-85-mprotect_pkey 2015-09-28 11:39:50.527391162 -0700
+++ b/mm/Kconfig2015-09-28 11:39:50.532391390 -0700
@@ -683,3 +683,10 @@ config FRAME_VECTOR
 
 config ARCH_USES_HIGH_VMA_FLAGS
bool
+
+config NR_PROTECTION_KEYS
+   int
+   # Everything supports a _single_ key, so allow folks to
+   # at least call APIs that take keys, but require that the
+   # key be 0.
+   default 1
diff -puN mm/mprotect.c~pkeys-85-mprotect_pkey mm/mprotect.c
--- a/mm/mprotect.c~pkeys-85-mprotect_pkey  2015-09-28 11:39:50.529391253 
-0700
+++ b/mm/mprotect.c 2015-09-28 11:39:50.532391390 -0700
@@ -344,8 +344,8 @@ fail:
return error;
 }
 
-SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
-   unsigned long, prot)
+static int do_mprotect_key(unsigned long start, size_t len,
+   unsigned long prot, unsigned long key)
 {
unsigned long vm_flags, nstart, end, tmp, reqprot;
struct vm_area_struct *vma, *prev;
@@ -365,6 +365,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
return -ENOMEM;
if (!arch_validate_prot(prot))
return -EINVAL;
+   if (key >= CONFIG_NR_PROTECTION_KEYS)
+   return -EINVAL;
 
reqprot = prot;
/*
@@ -373,7 +375,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long,
if ((prot & PROT_READ) && (current->personality & READ_IMPLIES_EXEC))
prot |= PROT_EXEC;
 
-   vm_flags = calc_vm_prot_bits(prot, 0);
+   vm_flags = calc_vm_prot_bits(prot, key);
 
	down_write(&current->mm->mmap_sem);
 
@@ -443,3 +445,15 @@ out:
	up_write(&current->mm->mmap_sem);
return error;
 }
+
+SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
+   unsigned long, prot)
+{
+   return do_mprotect_key(start, len, prot, 0);
+}
+
+SYSCALL_DEFINE4(mprotect_key, unsigned long, start, size_t, len,
+   unsigned long, prot, unsigned long, key)
+{
+   return do_mprotect_key(start, len, prot, key);
+}
_


[PATCH 20/25] mm, multi-arch: pass a protection key in to calc_vm_flag_bits()

2015-09-28 Thread Dave Hansen

From: Dave Hansen <dave.han...@linux.intel.com>

This plumbs a protection key through calc_vm_flag_bits().
We could have done this in calc_vm_prot_bits(), but I did not
feel super strongly which way to go.  It was pretty arbitrary
which one to use.

Signed-off-by: Dave Hansen <dave.han...@linux.intel.com>
Cc: linux-api@vger.kernel.org
Cc: linux-a...@vger.kernel.org
---

 b/arch/powerpc/include/asm/mman.h  |5 +++--
 b/drivers/char/agp/frontend.c  |2 +-
 b/drivers/staging/android/ashmem.c |9 +
 b/include/linux/mman.h |6 +++---
 b/mm/mmap.c|2 +-
 b/mm/mprotect.c|2 +-
 b/mm/nommu.c   |2 +-
 7 files changed, 15 insertions(+), 13 deletions(-)

diff -puN arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits 
arch/powerpc/include/asm/mman.h
--- a/arch/powerpc/include/asm/mman.h~pkeys-84-calc_vm_prot_bits
2015-09-28 11:39:49.962365460 -0700
+++ b/arch/powerpc/include/asm/mman.h   2015-09-28 11:39:49.976366097 -0700
@@ -18,11 +18,12 @@
  * This file is included by linux/mman.h, so we can't use cacl_vm_prot_bits()
  * here.  How important is the optimization?
  */
-static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot)
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+   unsigned long pkey)
 {
return (prot & PROT_SAO) ? VM_SAO : 0;
 }
-#define arch_calc_vm_prot_bits(prot) arch_calc_vm_prot_bits(prot)
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
 
 static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
 {
diff -puN drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits 
drivers/char/agp/frontend.c
--- a/drivers/char/agp/frontend.c~pkeys-84-calc_vm_prot_bits2015-09-28 
11:39:49.964365551 -0700
+++ b/drivers/char/agp/frontend.c   2015-09-28 11:39:49.977366142 -0700
@@ -156,7 +156,7 @@ static pgprot_t agp_convert_mmap_flags(i
 {
unsigned long prot_bits;
 
-   prot_bits = calc_vm_prot_bits(prot) | VM_SHARED;
+   prot_bits = calc_vm_prot_bits(prot, 0) | VM_SHARED;
return vm_get_page_prot(prot_bits);
 }
 
diff -puN drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits 
drivers/staging/android/ashmem.c
--- a/drivers/staging/android/ashmem.c~pkeys-84-calc_vm_prot_bits   
2015-09-28 11:39:49.966365642 -0700
+++ b/drivers/staging/android/ashmem.c  2015-09-28 11:39:49.977366142 -0700
@@ -351,7 +351,8 @@ out:
return ret;
 }
 
-static inline vm_flags_t calc_vm_may_flags(unsigned long prot)
+static inline vm_flags_t calc_vm_may_flags(unsigned long prot,
+   unsigned long pkey)
 {
return _calc_vm_trans(prot, PROT_READ,  VM_MAYREAD) |
   _calc_vm_trans(prot, PROT_WRITE, VM_MAYWRITE) |
@@ -372,12 +373,12 @@ static int ashmem_mmap(struct file *file
}
 
/* requested protection bits must match our allowed protection mask */
-   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask)) &
-calc_vm_prot_bits(PROT_MASK))) {
+   if (unlikely((vma->vm_flags & ~calc_vm_prot_bits(asma->prot_mask, 0)) &
+calc_vm_prot_bits(PROT_MASK, 0))) {
ret = -EPERM;
goto out;
}
-   vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask);
+   vma->vm_flags &= ~calc_vm_may_flags(~asma->prot_mask, 0);
 
if (!asma->file) {
char *name = ASHMEM_NAME_DEF;
diff -puN include/linux/mman.h~pkeys-84-calc_vm_prot_bits include/linux/mman.h
--- a/include/linux/mman.h~pkeys-84-calc_vm_prot_bits   2015-09-28 
11:39:49.967365688 -0700
+++ b/include/linux/mman.h  2015-09-28 11:39:49.977366142 -0700
@@ -35,7 +35,7 @@ static inline void vm_unacct_memory(long
  */
 
 #ifndef arch_calc_vm_prot_bits
-#define arch_calc_vm_prot_bits(prot) 0
+#define arch_calc_vm_prot_bits(prot, pkey) 0
 #endif
 
 #ifndef arch_vm_get_page_prot
@@ -70,12 +70,12 @@ static inline int arch_validate_prot(uns
  * Combine the mmap "prot" argument into "vm_flags" used internally.
  */
 static inline unsigned long
-calc_vm_prot_bits(unsigned long prot)
+calc_vm_prot_bits(unsigned long prot, unsigned long pkey)
 {
return _calc_vm_trans(prot, PROT_READ,  VM_READ ) |
   _calc_vm_trans(prot, PROT_WRITE, VM_WRITE) |
   _calc_vm_trans(prot, PROT_EXEC,  VM_EXEC) |
-  arch_calc_vm_prot_bits(prot);
+  arch_calc_vm_prot_bits(prot, pkey);
 }
 
 /*
diff -puN mm/mmap.c~pkeys-84-calc_vm_prot_bits mm/mmap.c
--- a/mm/mmap.c~pkeys-84-calc_vm_prot_bits  2015-09-28 11:39:49.969365779 
-0700
+++ b/mm/mmap.c 2015-09-28 11:39:49.978366188 -0700
@@ -1311,7 +1311,7 @@ unsigned long do_mmap(struct file *file,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any 

[PATCH 00/25] x86: Memory Protection Keys

2015-09-28 Thread Dave Hansen
I have addressed all known issues and review comments.  I believe
they are ready to be pulled in to the x86 tree.  Note that this
is also the first time anyone has seen the new 'selftests' code.
If there are issues limited to it, I'd prefer to fix those up
separately post-merge.

Changes from RFCv2 (Thanks Ingo and Thomas for most of these):

 * few minor compile warnings
 * changed 'nopku' interaction with cpuid bits.  Now, we do not
   clear the PKU cpuid bit, we just skip enabling it.
 * changed __pkru_allows_write() to also check access disable bit
 * removed the unused write_pkru()
 * made si_pkey a u64 and added some patch description details.
   Also made it share space in siginfo with MPX and clarified
   comments.
 * give some real text for the Processor Trace xsave state
 * made vma_pkey() less ugly (and much more optimized actually)
 * added SEGV_PKUERR to copy_siginfo_to_user()
 * remove page table walk when filling in si_pkey, added some
   big fat comments about it being inherently racy.
 * added self test code

MM reviewers, if you are going to look at one thing, please look
at patch 14 which adds a bunch of additional vma/pte permission
checks.

This code contains a new system call: mprotect_key().  This needs
the usual amount of rigor around new interfaces.  Review there
would be much appreciated.

This code is not runnable by anyone outside of Intel unless they
have some special hardware or a fancy simulator.  If you are
interested in running this for real, please get in touch with me.
Hardware is available to a very small but nonzero number of
people.

This set is also available here (with the new syscall):

git://git.kernel.org/pub/scm/linux/kernel/git/daveh/x86-pkeys.git 
pkeys-v006

=== diffstat ===

(note that over half of this is kselftests)

 Documentation/kernel-parameters.txt   |3 
 Documentation/x86/protection-keys.txt |   54 +
 arch/powerpc/include/asm/mman.h   |5 
 arch/powerpc/include/asm/mmu_context.h|   11 
 arch/s390/include/asm/mmu_context.h   |   11 
 arch/unicore32/include/asm/mmu_context.h  |   11 
 arch/x86/Kconfig  |   15 
 arch/x86/entry/syscalls/syscall_32.tbl|1 
 arch/x86/entry/syscalls/syscall_64.tbl|1 
 arch/x86/include/asm/cpufeature.h |   54 +
 arch/x86/include/asm/disabled-features.h  |   12 
 arch/x86/include/asm/fpu/types.h  |   16 
 arch/x86/include/asm/fpu/xstate.h |4 
 arch/x86/include/asm/mmu_context.h|   71 ++
 arch/x86/include/asm/pgtable.h|   45 +
 arch/x86/include/asm/pgtable_types.h  |   34 -
 arch/x86/include/asm/required-features.h  |4 
 arch/x86/include/asm/special_insns.h  |   32 +
 arch/x86/include/uapi/asm/mman.h  |   23 
 arch/x86/include/uapi/asm/processor-flags.h   |2 
 arch/x86/kernel/cpu/common.c  |   42 +
 arch/x86/kernel/fpu/xstate.c  |7 
 arch/x86/kernel/process_64.c  |2 
 arch/x86/kernel/setup.c   |9 
 arch/x86/mm/fault.c   |  143 +++-
 arch/x86/mm/gup.c |   37 -
 drivers/char/agp/frontend.c   |2 
 drivers/staging/android/ashmem.c  |9 
 fs/proc/task_mmu.c|5 
 include/asm-generic/mm_hooks.h|   11 
 include/linux/mm.h|   13 
 include/linux/mman.h  |6 
 include/uapi/asm-generic/siginfo.h|   17 
 kernel/signal.c   |4 
 mm/Kconfig|   11 
 mm/gup.c  |   28 
 mm/memory.c   |4 
 mm/mmap.c |2 
 mm/mprotect.c |   20 
 mm/nommu.c|2 
 tools/testing/selftests/x86/Makefile  |3 
 tools/testing/selftests/x86/pkey-helpers.h|  182 +
 tools/testing/selftests/x86/protection_keys.c |  828 ++
 43 files changed, 1705 insertions(+), 91 deletions(-)

Cc: linux-api@vger.kernel.org
Cc: linux-a...@vger.kernel.org


Re: [PATCH 10/23] userfaultfd: add new syscall to provide memory externalization

2015-06-23 Thread Dave Hansen
On 05/14/2015 10:31 AM, Andrea Arcangeli wrote:
 +static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode,
 +  int wake_flags, void *key)
 +{
 + struct userfaultfd_wake_range *range = key;
 + int ret;
 + struct userfaultfd_wait_queue *uwq;
 + unsigned long start, len;
 +
 + uwq = container_of(wq, struct userfaultfd_wait_queue, wq);
 + ret = 0;
 + /* don't wake the pending ones to avoid reads to block */
 + if (uwq->pending && !ACCESS_ONCE(uwq->ctx->released))
 + goto out;
 + /* len == 0 means wake all */
 + start = range->start;
 + len = range->len;
 + if (len && (start > uwq->address || start + len <= uwq->address))
 + goto out;
 + ret = wake_up_state(wq->private, mode);
 + if (ret)
 + /* wake only once, autoremove behavior */
 + list_del_init(&wq->task_list);
 +out:
 + return ret;
 +}
...
 +static __always_inline int validate_range(struct mm_struct *mm,
 +   __u64 start, __u64 len)
 +{
 + __u64 task_size = mm->task_size;
 +
 + if (start & ~PAGE_MASK)
 + return -EINVAL;
 + if (len & ~PAGE_MASK)
 + return -EINVAL;
 + if (!len)
 + return -EINVAL;
 + if (start < mmap_min_addr)
 + return -EINVAL;
 + if (start >= task_size)
 + return -EINVAL;
 + if (len > task_size - start)
 + return -EINVAL;
 + return 0;
 +}

Hey Andrea,

Down in userfaultfd_wake_function(), it looks like you intended for a
len=0 to mean wake all.  But the validate_range() that we do from
userspace has a !len check in it, which keeps us from passing a len=0 in
from userspace.

Was that wake all for some internal use, or is the check too strict?

I was trying to use the wake ioctl after an madvise() (as opposed to
filling things in using a userfd copy).


Re: [PATCH v3 1/3] mm: introduce fincore()

2014-07-07 Thread Dave Hansen
 +/*
 + * You can control how the buffer in userspace is filled with this mode
 + * parameters:

I agree that we don't have any good mechanisms for looking at the page
cache from userspace.  I've hacked some things up using mincore() and
they weren't pretty, so I welcome _something_ like this.

But, is this trying to do too many things at once?  Do we have solid use
cases spelled out for each of these modes?  Have we thought out how they
will be used in practice?

The biggest question for me, though, is whether we want to start
designing these per-page interfaces to consider different page sizes, or
whether we're going to just continue to pretend that the entire world is
4k pages.  Using FINCORE_BMAP on 1GB hugetlbfs files would be a bit
silly, for instance.

 + * - FINCORE_BMAP:
 + * the page status is returned in a vector of bytes.
 + * The least significant bit of each byte is 1 if the referenced page
 + * is in memory, otherwise it is zero.

I know this is consistent with mincore(), but it did always bother me
that mincore() was so sparse.  Seems like it is wasting 7/8 of its bits.

 + * - FINCORE_PGOFF:
 + * if this flag is set, fincore() doesn't store any information about
 + * holes. Instead each records per page has the entry of page offset,
 + * using 8 bytes. This mode is useful if we handle a large file and
 + * only few pages are on memory.

This bothers me a bit.  How would someone know how sparse a file was
before calling this?  If it's not sparse, and they use this, they'll end
up using 8x the memory they would have using FINCORE_BMAP.  If it *is*
sparse, and they use FINCORE_BMAP, they will either waste tons of memory
on buffers, or have to make a ton of calls.
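To put rough numbers on that, a back-of-the-envelope sketch (assuming 4k pages,
1 byte per page for FINCORE_BMAP and 8 bytes per *resident* page for
FINCORE_PGOFF):

#include <stdio.h>

int main(void)
{
	unsigned long long file_size = 1ULL << 30;	/* a 1GB file */
	unsigned long long pages = file_size / 4096;	/* 262144 pages */

	printf("FINCORE_BMAP:  %llu bytes, no matter what\n", pages);
	printf("FINCORE_PGOFF: %llu bytes if fully resident\n", pages * 8);
	printf("FINCORE_PGOFF: %llu bytes if ~1%% resident\n",
	       pages / 100 * 8);
	return 0;
}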

I guess this could also be used to do *searches*, which would let you
search out holes.  Let's say you have a 2TB file.  You could call this
with a buffer size of 1 entry and do searches, say 0-1TB.  If you get
your one entry back, you know it's not completely sparse.

But, that wouldn't work with it as-designed.  The length of the buffer
and the range of the file being checked are coupled together, so you
can't say:

vec = malloc(sizeof(long));
fincore(fd, 0, 1TB, FINCORE_PGOFF, vec, extra);

without overflowing vec.

Is it really right to say this is going to be 8 bytes?  Would we want it
to share types with something else, like be an loff_t?

 + * - FINCORE_PFN:
 + * stores pfn, using 8 bytes.

These are all an unprivileged operations from what I can tell.  I know
we're going to a lot of trouble to hide kernel addresses from being seen
in userspace.  This seems like it would be undesirable for the folks
that care about not leaking kernel addresses, especially for
unprivileged users.

This would essentially tell userspace where in the kernel's address
space some user-controlled data will be.

 + * We can use multiple flags among the flags in FINCORE_LONGENTRY_MASK.
 + * For example, when the mode is FINCORE_PFN|FINCORE_PAGEFLAGS, the per-page
 + * information is stored like this:

Instead of specifying the ordering in the manpages alone, would it be
smarter to just say that the ordering of the items is dependent on the
ordering of the flags?  In other words if FINCORE_PFN <
FINCORE_PAGEFLAGS, then its field comes first?




Re: [PATCH v3 3/3] man2/fincore.2: document general description about fincore(2)

2014-07-07 Thread Dave Hansen
On 07/07/2014 11:00 AM, Naoya Horiguchi wrote:
 +.SH RETURN VALUE
 +On success,
 +.BR fincore ()
 +returns 0.
 +On error, \-1 is returned, and
 +.I errno
 +is set appropriately.

Is this accurate?  From reading the syscall itself, it looked like it
did this:

 + * Return value is the number of pages whose data is stored in fc->buffer.
 + */
 +static long do_fincore(struct fincore_control *fc, int nr_pages)

and:

 +SYSCALL_DEFINE6(fincore, int, fd, loff_t, start, long, nr_pages,
...
 + while (fc.nr_pages > 0) {
 + memset(fc.buffer, 0, fc.buffer_size);
 + ret = do_fincore(&fc, min(step, fc.nr_pages));
 + /* Reached the end of the file */
 + if (ret == 0)
 + break;
 + if (ret < 0)
 + break;
...
 + }
...
 + return ret;
 +}

Which seems that for a given loop of do_fincore(), you might end up
returning the result of that *single* iteration of do_fincore() instead
of the aggregate of the entire syscall.

So, it can return <0 on failure, 0 on success, or also an essentially
random >0 number on success too.

Why not just use the return value for something useful instead of
hacking in the extras->nr_entries stuff?  Oh, and what if that

 + if (extra)
 + __put_user(nr, &extra->nr_entries);

fails?  It seems like we might silently forget to tell userspace how
many entries we filled.


Re: [PATCH v3 1/3] mm: introduce fincore()

2014-07-07 Thread Dave Hansen
On 07/07/2014 01:21 PM, Naoya Horiguchi wrote:
 On Mon, Jul 07, 2014 at 12:01:41PM -0700, Dave Hansen wrote:
 But, is this trying to do too many things at once?  Do we have solid use
 cases spelled out for each of these modes?  Have we thought out how they
 will be used in practice?
 
 tools/vm/page-types.c will be an in-kernel user after this base code is
 accepted. The idea of doing fincore() thing comes up during the discussion
 with Konstantin over file cache mode of this tool.
 pfn and page flag are needed there, so I think it's one clear usecase.

I'm going to take that as a no. :)

The whole FINCORE_PGOFF vs. FINCORE_BMAP issue is something that will
come up in practice.  We just don't have the interfaces for an end user
to pick which one they want to use.
 Is it really right to say this is going to be 8 bytes?  Would we want it
 to share types with something else, like be an loff_t?
 
 Could you elaborate it more?

We specify file offsets in other system calls, like the lseek family.  I
was just thinking that this type should match up with those calls since
they are expressing the same data type with the same ranges and limitations.

 + * - FINCORE_PFN:
 + * stores pfn, using 8 bytes.

 These are all an unprivileged operations from what I can tell.  I know
 we're going to a lot of trouble to hide kernel addresses from being seen
 in userspace.  This seems like it would be undesirable for the folks
 that care about not leaking kernel addresses, especially for
 unprivileged users.

 This would essentially tell userspace where in the kernel's address
 space some user-controlled data will be.
 
 OK, so this and FINCORE_PAGEFLAGS will be limited for privileged users.

Then I'd just question their usefulness outside of a debugging
environment, especially when you can get at them in other (more
roundabout) ways in a debugging environment.

This is really looking to me like two system calls.  The bitmap-based
one, and another more extensible one.  I don't think there's any harm in
having two system calls, especially when they're trying to glue together
two disparate interfaces.
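Purely as an illustration of that split -- names and signatures invented here,
nothing like this was actually posted:

#include <sys/types.h>

/* the simple, mincore()-style call: one byte per page, always dense */
long fincore_bmap(int fd, off_t start, long nr_pages, unsigned char *vec);

/* the extensible call: 'mode' picks which fields (pgoff, pfn, page
 * flags, ...) get packed into each fixed-size record in 'buf' */
long fincore_records(int fd, off_t start, long nr_pages,
		     unsigned int mode, void *buf, long buf_entries);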



Re: [PATCH v3 3/3] man2/fincore.2: document general description about fincore(2)

2014-07-07 Thread Dave Hansen
On 07/07/2014 01:59 PM, Naoya Horiguchi wrote:
 On Mon, Jul 07, 2014 at 12:08:12PM -0700, Dave Hansen wrote:
 On 07/07/2014 11:00 AM, Naoya Horiguchi wrote:
 +.SH RETURN VALUE
 +On success,
 +.BR fincore ()
 +returns 0.
 +On error, \-1 is returned, and
 +.I errno
 +is set appropriately.

 Is this accurate?  From reading the syscall itself, it looked like it
 did this:

 + * Return value is the number of pages whose data is stored in fc->buffer.
 + */
 +static long do_fincore(struct fincore_control *fc, int nr_pages)

 and:

 +SYSCALL_DEFINE6(fincore, int, fd, loff_t, start, long, nr_pages,
 ...
 +   while (fc.nr_pages > 0) {
 +   memset(fc.buffer, 0, fc.buffer_size);
 +   ret = do_fincore(&fc, min(step, fc.nr_pages));
 +   /* Reached the end of the file */
 +   if (ret == 0)
 +   break;
 +   if (ret < 0)
 +   break;
 ...
 +   }
 ...
 +   return ret;
 +}

 Which seems that for a given loop of do_fincore(), you might end up
 returning the result of that *single* iteration of do_fincore() instead
 of the aggregate of the entire syscall.

 So, it can return <0 on failure, 0 on success, or also an essentially
 random >0 number on success too.
 
 We don't break this while loop if do_fincore() returned a positive value
 unless copy_to_user() fails. And in that case ret is set to -EFAULT.
 So I think sys_fincore() never returns a positive value.

OK, that makes sense as I'm reading it again.

 Why not just use the return value for something useful instead of
 hacking in the extras->nr_entries stuff?
 
 Hmm, I got the opposite complaint previously, where we shouldn't
 interpret the return value differently depending on the flag.
 And I'd like to keep the extra argument for future extensibility.
 For example, if we want to collect pages only with a specific
 set of page flags, this extra argument will be necessary.

Couldn't it simply be the number of elements that it wrote in to the
buffer, or even the number of bytes?



Re: [PATCH v21 001/100] eclone (1/11): Factor out code to allocate pidmap page

2010-05-03 Thread Dave Hansen
On Sat, 2010-05-01 at 15:10 -0700, David Miller wrote:
 NO WAY, there is no way in the world you should post 100 patches
 at a time to any mailing list, especially those at vger.kernel.org
 that have thousands upon thousands of subscribers.
 
 Post only small, well contained, sets of patches at a time.  At most
 10 or so in one go.

Hi Dave,

I really do apologize if these caused undue traffic on vger.  It
certainly wasn't our intention to cause any problems.

We've had a fear all along that we'll just go back into our little
contain...@lists.linux-foundation.org, and go astray of what the
community wants done with these patches.  It's also important to
remember that these really do affect the entire kernel.  Unfortunately,
it's not really a single feature or something that fits well on any
other mailing list.  It has implications _everywhere_.  I think Andrew
Morton also asked that these continue coming to LKML, although his
request probably came at a time when the set was a wee bit smaller.

 Do you realize how much mail traffic you generate by posting so many
 patches at one time, and how unlikely it is for anyone to actually
 sift through and review your patches after you've spammed them by
 posting so many at one time?

I honestly don't expect people to be reading the whole thing at once.
But, I do hope that people can take a look at their individual bits that
are touched.

 A second infraction and I will have no choice but to block you at the
 SMTP level at vger.kernel.org so please do not do it again.

I know these patches are certainly not at the level of importance as,
say the pata or -stable updates, but they're certainly not of
unprecedented scale.  I've seen a number of patchbombs of this size
recently.

I hope Andrew pulls this set into -mm so this doesn't even come up
again.  But, if it does, can you make some suggestions on how to be more
kind to vger in the process?

-- Dave



Re: How much of a mess does OpenVZ make? ;) Was: What can OpenVZ do?

2009-03-13 Thread Dave Hansen
On Fri, 2009-03-13 at 14:01 -0700, Linus Torvalds wrote:
 On Fri, 13 Mar 2009, Alexey Dobriyan wrote:
   Let's face it, we're not going to _ever_ checkpoint any kind of general 
   case process. Just TCP makes that fundamentally impossible in the general 
   case, and there are lots and lots of other cases too (just something as 
   totally _trivial_ as all the files in the filesystem that don't get 
   rolled 
   back).
  
  What do you mean here? Unlinked files?
 
 Or modified files, or anything else. External state is a pretty damn 
 wide net. It's not just TCP sequence numbers and another machine.

This is precisely the reason that we've focused so hard on containers,
and *didn't* just jump right into checkpoint/restart; we're trying
really hard to constrain the _truly_ external things that a process can
interact with.  

The approach so far has largely been to make things are external to a
process at least *internal* to a container.  Network, pid, ipc, and uts
namespaces, for example.  An ipc/sem.c semaphore may be external to a
process, so we'll just pick the whole namespace up and checkpoint it
along with the process.

In the OpenVZ case, they've at least demonstrated that the filesystem
can be moved largely with rsync.  Unlinked files need some in-kernel TLC
(or /proc mangling) but it isn't *that* bad.

We can also make the fs problem much easier by using things like dm or
btrfs snapshotting of the block device, or restricting to where on a fs
a container is allowed to write with stuff like r/o bind mounts.

-- Dave



Re: How much of a mess does OpenVZ make? ;) Was: What can OpenVZ do?

2009-02-27 Thread Dave Hansen
On Fri, 2009-02-27 at 01:31 +0300, Alexey Dobriyan wrote:
  I think the main question is: will we ever find ourselves in the 
  future saying that C/R sucks, nobody but a small minority uses 
  it, wish we had never merged it? I think the likelyhood of that 
  is very low. I think the current OpenVZ stuff already looks very 
  useful, and i dont think we've realized (let alone explored) all 
  the possibilities yet.
 
 This is collecting and start of dumping part of cleaned up OpenVZ C/R
 implementation, FYI.

Are you just posting this to show how you expect c/r to look eventually?
Or are you proposing this as an alternative to what Oren has been
posting?

-- Dave



Re: [RFC v13][PATCH 05/14] x86 support for checkpoint/restart

2009-02-24 Thread Dave Hansen
On Tue, 2009-02-24 at 01:47 -0600, Nathan Lynch wrote:
 But I think this has been pointed out before.  If I understand the
 justification for cr_hbuf_get correctly, the allocations it services
 are somehow known to be bounded in size and nesting.  But even if that
 is the case, it's not much of a reason to avoid using kmalloc, is it?

Oren wants this particular facility to be used for live migration.  To
support good live migration, we need to be able to return from the
syscall as fast as possible.  To do that, Oren proposed that we buffer
all the data needed for the checkpoint inside the kernel.

The current cr_hbuf_put/get() could easily be modified to support this
usage by basically making put() do nothing, then handing off a handle to
the cr_ctx structure elsewhere in the kernel.  When the time comes to
free up the in-memory image, you only have one simple structure to go
free (the hbuf) as opposed to a bunch of little kmalloc()'d objects.

I'm sure I'm missing something.  I'm also sure that this *will* work
eventually.  But, I don't think the code as it stands supports keeping
the abstraction in there.  It is virtually impossible to debate the
design or its alternatives in this state.

-- Dave



Re: Banning checkpoint (was: Re: What can OpenVZ do?)

2009-02-19 Thread Dave Hansen
On Thu, 2009-02-19 at 22:06 +0300, Alexey Dobriyan wrote:
 Inotify isn't supported yet? You do
 
 if (!list_empty(inode->inotify_watches))
 return -E;
 
 without hooking into inotify syscalls.
 
 ptrace(2) isn't supported -- look at struct task_struct::ptraced and
 friends.
 
 And so on.
 
 System call (or whatever) does something with some piece of kernel
 internals. We look at this something when walking data structures
 and
 abort if it's scary enough.
 
 Please, show at least one counter-example.

Alexey, I agree with you here.  I've been fighting myself internally
about these two somewhat opposing approaches.  Of *course* we can
determine the checkpointability at sys_checkpoint() time by checking
all the various bits of state.

The problem that I think Ingo is trying to address here is that doing it
then makes it hard to figure out _when_ you went wrong.  That's the
single most critical piece of finding out how to go address it.

I see where you are coming from.  Ingo's suggestion has the *huge*
downside that we've got to go muck with a lot of generic code and hook
into all the things we don't support.

I think what I posted is a decent compromise.  It gets you those
warnings at runtime and is a one-way trip for any given process.  But,
it does detect in certain cases (fork() and unshare(FILES)) when it is
safe to make the trip back to the I'm checkpointable state again.

-- Dave



Re: What can OpenVZ do?

2009-02-18 Thread Dave Hansen
On Wed, 2009-02-18 at 19:16 +0100, Ingo Molnar wrote:
 Nothing motivates more than app designers complaining about the 
 one-way flag.
 
 Furthermore, it's _far_ easier to make a one-way flag SMP-safe. 
 We just set it and that's it. When we unset it, what do we about 
 SMP races with other threads in the same MM installing another 
 non-linear vma, etc.

After looking at this for file descriptors, I have to really agree with
Ingo on this one, at least as far as the flag is concerned.  I want to
propose one teeny change, though:  I think the flag should be
per-resource.

We should have one flag in mm_struct, one in files_struct, etc...  The
task_is_checkpointable() function can just query task->mm, task->files,
etc...  This gives us nice behavior at clone() *and* fork that just
works.
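A hypothetical sketch of what that could look like (field and helper names
invented here, not from the posted patches):

/* one flag per shared resource, queried at checkpoint time */
static inline int task_is_checkpointable(struct task_struct *task)
{
	/*
	 * Tasks that share an mm or a files_struct automatically share
	 * its flag, so clone() and fork() need no special handling.
	 */
	return task->mm->checkpointable &&
	       task->files->checkpointable;
}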

I'll do this for files_struct and see how it comes out so you can take a
peek.

-- Dave



Re: What can OpenVZ do?

2009-02-17 Thread Dave Hansen
On Tue, 2009-02-17 at 23:23 +0100, Ingo Molnar wrote:
 * Dave Hansen d...@linux.vnet.ibm.com wrote:
  On Fri, 2009-02-13 at 11:53 +0100, Ingo Molnar wrote:
   In any case, by designing checkpointing to reuse the existing LSM
   callbacks, we'd hit multiple birds with the same stone. (One of
   which is the constant complaints about the runtime costs of the LSM
   callbacks - with checkpointing we get an independent, non-security
   user of the facility which is a nice touch.)
  
  There's a fundamental problem with using LSM that I'm seeing 
  now that I look at using it for file descriptors.  The LSM 
  hooks are there to say, "No, you can't do this" and abort 
  whatever kernel operation was going on.  That's good for 
  detecting when we do something that's bad for checkpointing.
  
  *But* it completely falls on its face when we want to find out 
  when we are doing things that are *good*.  For instance, let's 
  say that we open a network socket.  The LSM hook sees it and 
  marks us as uncheckpointable.  What about when we close it?  
  We've become checkpointable again.  But, there's no LSM hook 
  for the close side because we don't currently have a need for 
  it.
 
 Uncheckpointable should be a one-way flag anyway. We want this 
 to become usable, so uncheckpointable functionality should be as 
 painful as possible, to make sure it's getting fixed ...

Again, as these patches stand, we don't support checkpointing when
non-simple files are opened.  Basically, if a open()/lseek() pair won't
get you back where you were, we don't deal with them.

init does non-checkpointable things.  If the flag is a one-way trip,
we'll never be able to checkpoint because we'll always inherit init's !
checkpointable flag.  

To fix this, we could start working on making sure we can checkpoint
init, but that's practically worthless.

-- Dave



Re: [RFC v13][PATCH 00/14] Kernel based checkpoint/restart

2009-02-16 Thread Dave Hansen
On Fri, 2009-02-13 at 15:28 -0800, Andrew Morton wrote:
   For extra marks:
   
   - Will any of this involve non-trivial serialisation of kernel
 objects?  If so, that's getting into the
 unacceptably-expensive-to-maintain space, I suspect.
  
  We have some structures that are certainly tied to the kernel-internal
  ones.  However, we are certainly *not* simply writing kernel structures
  to userspace.  We could do that with /dev/mem.  We are carefully pulling
  out the minimal bits of information from the kernel structures that we
  *need* to recreate the function of the structure at restart.  There is a
  maintenance burden here but, so far, that burden is almost entirely in
  checkpoint/*.c.  We intend to test this functionality thoroughly to
  ensure that we don't regress once we have integrated it.
 
 I guess my question can be approximately simplified to: will it end up
 looking like openvz?  (I don't believe that we know of any other way
 of implementing this?)
 
 Because if it does then that's a concern, because my assessment when I
 looked at that code (a number of years ago) was that having code of
 that nature in mainline would be pretty costly to us, and rather
 unwelcome.

With the current path, my guess is that we will end up looking
*something* like OpenVZ.  But, with all the input from the OpenVZ folks
and at least three other projects, I bet we can come up with something
better.  I do wish the OpenVZ folks were being more vocal and
constructive about Oren's current code but I guess silence is the
greatest compliment...

 The broadest form of the question is will we end up regretting having
 done this.
 If we can arrange for the implementation to sit quietly over in a
 corner with a team of people maintaining it and not screwing up other
 people's work then I guess we'd be OK - if it breaks then the breakage
 is localised.
 
 And it's not just a matter of does the diffstat only affect a single
 subdirectory.  We also should watch out for the imposition of new
 rules which kernel code must follow.  you can't do that, because we
 can't serialise it, or something.
 
 Similar to the way in which perfectly correct and normal kernel
 sometimes has to be changed because it unexpectedly upsets the -rt
 patch.
 
 Do you expect that any restrictions of this type will be imposed?

Basically, yes.  But, practically, we haven't been thinking about
serializing stuff in the kernel, ever.  That's produced a few
difficult-to-serialize things like AF_UNIX sockets but absolutely
nothing that simply can't be done.  

Having this code in mainline and getting some of people's mindshare
should at least enable us to speak up if we see another thing like
AF_UNIX coming down the pipe.  We could hopefully catch it and at least
tweak it a bit to enhance how easily we can serialize it. 

Again, it isn't likely to be an all-or-nothing situation.  It is a
matter of how many hoops the checkpoint code itself has to jump
through. 

-- Dave



Re: What can OpenVZ do?

2009-02-16 Thread Dave Hansen
On Fri, 2009-02-13 at 11:53 +0100, Ingo Molnar wrote:
 In any case, by designing checkpointing to reuse the existing LSM
 callbacks, we'd hit multiple birds with the same stone. (One of
 which is the constant complaints about the runtime costs of the LSM
 callbacks - with checkpointing we get an independent, non-security
 user of the facility which is a nice touch.)

There's a fundamental problem with using LSM that I'm seeing now that I
look at using it for file descriptors.  The LSM hooks are there to say,
"No, you can't do this" and abort whatever kernel operation was going
on.  That's good for detecting when we do something that's bad for
checkpointing.

*But* it completely falls on its face when we want to find out when we
are doing things that are *good*.  For instance, let's say that we open
a network socket.  The LSM hook sees it and marks us as
uncheckpointable.  What about when we close it?  We've become
checkpointable again.  But, there's no LSM hook for the close side
because we don't currently have a need for it.
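
To make that concrete, here is a minimal sketch of what such a mark-on-open
hook could look like (illustrative only, not from the posted series; the
'checkpointable' flag in task_struct is a hypothetical field, and the body
piggybacks on the existing socket_create LSM operation):

/*
 * Sketch only: mark the opener uncheckpointable when a network socket
 * is created.  'checkpointable' is a hypothetical task_struct field.
 * Note the asymmetry: there is no equivalent hook on the close path,
 * which is exactly the problem described above.
 */
static int cr_socket_create(int family, int type, int protocol, int kern)
{
	if (family != AF_UNIX)
		current->checkpointable = 0;	/* one-way transition */
	return 0;	/* never reject the operation itself */
}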

We have a couple of options:

We can let uncheckpointable actions behave like security violations and
just abort the kernel calls.  The problem with this is that it makes it
difficult to do *anything* unless your application is 100% supported.
Pretty inconvenient, especially at first.  Might be useful later on
though.

We could just log the actions and let them proceed.  But the problem
with this is that we don't get a temporal picture of when an app transitions
between the "good" and "bad" states.  We would need to work on culling
the output in the logs since we'd be potentially getting a lot of
redundant data.

We could add to the set of security hooks.  Make sure that we cover all
the transitional states like close().

What I'm thinking about doing for now is what I have attached here.  We
allow the apps that we want to checkpoint to query some interface
that will use the same checks that sys_checkpoint() does internally.
Say:

# cat /proc/1072/checkpointable
mm: 1
files: 0
...

Then, when it realizes that its files can't be checkpointed, it can look
elsewhere:

/proc/1072/fdinfo/2:pos:0
/proc/1072/fdinfo/2:flags:  02
/proc/1072/fdinfo/2:checkpointable: 0 (special file)
/proc/1072/fdinfo/3:pos:0
/proc/1072/fdinfo/3:flags:  04000
/proc/1072/fdinfo/3:checkpointable: 0 (pipefs does not support checkpoint)
/proc/1072/fdinfo/4:pos:0
/proc/1072/fdinfo/4:flags:  04002
/proc/1072/fdinfo/4:checkpointable: 0 (sockfs does not support checkpoint)
/proc/1074/fdinfo/0:pos:0
/proc/1074/fdinfo/0:flags:  012
/proc/1074/fdinfo/0:checkpointable: 0 (devpts does not support checkpoint)

That requires zero overhead during runtime of the app.  It is also less
error-prone because we don't have any of the transitions to catch.

-- Dave

diff --git a/checkpoint/ckpt_file.c b/checkpoint/ckpt_file.c
index e3097ac..ebe776a 100644
--- a/checkpoint/ckpt_file.c
+++ b/checkpoint/ckpt_file.c
@@ -72,6 +72,32 @@ int cr_scan_fds(struct files_struct *files, int **fdtable)
return n;
 }
 
+int cr_can_checkpoint_file(struct file *file, char *explain, int left)
+{
+	char p[] = "checkpointable";
+	struct inode *inode = file->f_dentry->d_inode;
+	struct file_system_type *fs_type = inode->i_sb->s_type;
+
+	printk("%s() left: %d\n", __func__, left);
+
+	if (!(fs_type->fs_flags & FS_CHECKPOINTABLE)) {
+		if (explain)
+			snprintf(explain, left,
+				 "%s: 0 (%s does not support checkpoint)\n",
+				 p, fs_type->name);
+		return 0;
+	}
+
+	if (special_file(inode->i_mode)) {
+		if (explain)
+			snprintf(explain, left, "%s: 0 (special file)\n", p);
+		return 0;
+	}
+
+	if (explain)
+		snprintf(explain, left, "%s: 1\n", p);
+	return 1;
+}
+
+
 /* cr_write_fd_data - dump the state of a given file pointer */
 static int cr_write_fd_data(struct cr_ctx *ctx, struct file *file, int parent)
 {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d467760..2300353 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1597,7 +1597,19 @@ out:
return ~0U;
 }
 
-#define PROC_FDINFO_MAX 64
+#define PROC_FDINFO_MAX PAGE_SIZE
+
+static void proc_fd_write_info(struct file *file, char *info)
+{
+	int max = PROC_FDINFO_MAX;
+	int p = 0;
+
+	if (!info)
+		return;
+
+	p += snprintf(info+p, max-p, "pos:\t%lli\n", (long long) file->f_pos);
+	p += snprintf(info+p, max-p, "flags:\t0%o\n", file->f_flags);
+	cr_can_checkpoint_file(file, info+p, max-p);
+}
 
 static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 {
@@ -1622,12 +1634,7 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info)
 		*path = file->f_path;
 		path_get(&file->f_path);
 	}
-   if 

What can OpenVZ do?

2009-02-12 Thread Dave Hansen
On Thu, 2009-02-12 at 11:42 -0800, Andrew Morton wrote:
 On Thu, 12 Feb 2009 13:30:35 -0600
 Matt Mackall m...@selenic.com wrote:
 
  On Thu, 2009-02-12 at 10:11 -0800, Dave Hansen wrote:
  
- In bullet-point form, what features are missing, and should be added?
   
* support for more architectures than i386
* file descriptors:
 * sockets (network, AF_UNIX, etc...)
 * devices files
 * shmfs, hugetlbfs
 * epoll
 * unlinked files
  
* Filesystem state
 * contents of files
 * mount tree for individual processes
* flock
* threads and sessions
* CPU and NUMA affinity
* sys_remap_file_pages()
  
  I think the real question is: where are the dragons hiding? Some of
  these are known to be hard. And some of them are critical for checkpointing
  typical applications. If you have plans or theories for implementing all
  of the above, then great. But this list doesn't really give any sense of
  whether we should be scared of what lurks behind those doors.
 
 How close has OpenVZ come to implementing all of this?  I think the
 implementation is fairly complete?

I also believe it is fairly complete.  At least able to be used
practically.

 If so, perhaps that can be used as a guide.  Will the planned feature
 have a similar design?  If not, how will it differ?  To what extent can
 we use that implementation as a tool for understanding what this new
 implementation will look like?

Yes, we can certainly use it as a guide.  However, there are some
barriers to being able to do that:

d...@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... | diffstat | 
tail -1
 628 files changed, 59597 insertions(+), 2927 deletions(-)
d...@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... | wc 
  84887  290855 2308745

Unfortunately, the git tree doesn't have that great of a history.  It
appears that the forward-ports are just applications of huge single
patches which then get committed into git.  This tree has also
historically contained a bunch of stuff not directly related to
checkpoint/restart like resource management.

We'd be idiots not to take a hard look at what has been done in OpenVZ.
But, for the time being, we have absolutely no shortage of things that
we know are important and know have to be done.  Our largest problem is
not finding things to do, but is our large out-of-tree patch that is
growing by the day. :(

-- Dave



How much of a mess does OpenVZ make? ;) Was: What can OpenVZ do?

2009-02-12 Thread Dave Hansen
On Thu, 2009-02-12 at 14:10 -0800, Andrew Morton wrote:
 On Thu, 12 Feb 2009 13:51:23 -0800
 Dave Hansen d...@linux.vnet.ibm.com wrote:
 
  On Thu, 2009-02-12 at 11:42 -0800, Andrew Morton wrote:
   On Thu, 12 Feb 2009 13:30:35 -0600
   Matt Mackall m...@selenic.com wrote:
   
On Thu, 2009-02-12 at 10:11 -0800, Dave Hansen wrote:

  - In bullet-point form, what features are missing, and should be 
  added?
 
  * support for more architectures than i386
  * file descriptors:
   * sockets (network, AF_UNIX, etc...)
   * devices files
   * shmfs, hugetlbfs
   * epoll
   * unlinked files

  * Filesystem state
   * contents of files
   * mount tree for individual processes
  * flock
  * threads and sessions
  * CPU and NUMA affinity
  * sys_remap_file_pages()

I think the real question is: where are the dragons hiding? Some of
these are known to be hard. And some of them are critical for checkpointing
typical applications. If you have plans or theories for implementing all
of the above, then great. But this list doesn't really give any sense of
whether we should be scared of what lurks behind those doors.
   
   How close has OpenVZ come to implementing all of this?  I think the
   implementation is fairly complete?
  
  I also believe it is fairly complete.  At least able to be used
  practically.
  
   If so, perhaps that can be used as a guide.  Will the planned feature
   have a similar design?  If not, how will it differ?  To what extent can
   we use that implementation as a tool for understanding what this new
   implementation will look like?
  
  Yes, we can certainly use it as a guide.  However, there are some
  barriers to being able to do that:
  
  d...@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... | diffstat | 
  tail -1
   628 files changed, 59597 insertions(+), 2927 deletions(-)
  d...@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... | wc 
84887  290855 2308745
  
  Unfortunately, the git tree doesn't have that great of a history.  It
  appears that the forward-ports are just applications of huge single
  patches which then get committed into git.  This tree has also
  historically contained a bunch of stuff not directly related to
  checkpoint/restart like resource management.
  
  We'd be idiots not to take a hard look at what has been done in OpenVZ.
  But, for the time being, we have absolutely no shortage of things that
  we know are important and know have to be done.  Our largest problem is
  not finding things to do, but is our large out-of-tree patch that is
  growing by the day. :(
  
 
 Well we have a chicken-and-eggish thing.  The patchset will keep
 growing until we understand how much of this:
 
  d...@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... | diffstat | 
  tail -1
   628 files changed, 59597 insertions(+), 2927 deletions(-)
 
 we will be committed to if we were to merge the current patchset.

Here's the measurement that Alexey suggested:

d...@nimitz:~/kernels/linux-2.6-openvz$ git diff v2.6.27.10... kernel/cpt/ | 
diffstat 
 Makefile|   53 +
 cpt_conntrack.c |  365 
 cpt_context.c   |  257 
 cpt_context.h   |  215 +++
 cpt_dump.c  | 1250 ++
 cpt_dump.h  |   16 
 cpt_epoll.c |  113 +++
 cpt_exports.c   |   13 
 cpt_files.c | 1626 +++
 cpt_files.h |   71 ++
 cpt_fsmagic.h   |   16 
 cpt_inotify.c   |  144 
 cpt_kernel.c|  177 ++
 cpt_kernel.h|   99 +++
 cpt_mm.c|  923 +++
 cpt_mm.h|   35 +
 cpt_net.c   |  614 
 cpt_net.h   |7 
 cpt_obj.c   |  162 +
 cpt_obj.h   |   62 ++
 cpt_proc.c  |  595 
 cpt_process.c   | 1369 ++
 cpt_process.h   |   13 
 cpt_socket.c|  790 ++
 cpt_socket.h|   33 +
 cpt_socket_in.c |  450 +++
 cpt_syscalls.h  |  101 +++
 cpt_sysvipc.c   |  403 +
 cpt_tty.c   |  215 +++
 cpt_ubc.c   |  132 
 cpt_ubc.h   |   23 
 cpt_x8664.S |   67 ++
 rst_conntrack.c |  283 +
 rst_context.c   |  323 ++
 rst_epoll.c |  169 +
 rst_files.c | 1648 
 rst_inotify.c   |  196 ++
 rst_mm.c| 1151 +++
 rst_net.c   |  741 +
 rst_proc.c  |  580 +++
 rst_process.c   | 1640 +++
 rst_socket.c|  918 +++
 rst_socket_in.c |  489 
 rst_sysvipc.c   |  633 +
 rst_tty.c   |  384 +
 rst_ubc.c   |  131 
 rst_undump.c| 1007

Re: [RFC v13][PATCH 00/14] Kernel based checkpoint/restart

2009-02-10 Thread Dave Hansen
On Tue, 2009-01-27 at 12:07 -0500, Oren Laadan wrote:
 Checkpoint-restart (c/r): a couple of fixes in preparation for 64bit
 architectures, and a couple of fixes for bugss (comments from Serge
 Hallyn, Sudakvev Bhattiprolu and Nathan Lynch). Updated and tested
 against v2.6.28.
 
 Aiming for -mm.

Is there anything that we're waiting on before these can go into -mm?  I
think the discussion on the first few patches has died down to almost
nothing.  They're pretty reviewed-out.  Do they need a run in -mm?  I
don't think linux-next is quite appropriate since they're not _quite_
aimed at mainline yet.

-- Dave



Re: [RFC v11][PATCH 05/13] Dump memory address space

2008-12-18 Thread Dave Hansen
On Thu, 2008-12-18 at 06:10 -0500, Oren Laadan wrote:
  +	mutex_lock(&mm->context.lock);
  +
  +	hh->ldt_entry_size = LDT_ENTRY_SIZE;
  +	hh->nldt = mm->context.size;
  +
  +	cr_debug("nldt %d\n", hh->nldt);
  +
  +	ret = cr_write_obj(ctx, &h, hh);
  +	cr_hbuf_put(ctx, sizeof(*hh));
  +	if (ret < 0)
  +		goto out;
  +
  +	ret = cr_kwrite(ctx, mm->context.ldt,
  +			mm->context.size * LDT_ENTRY_SIZE);
  
  Do we really want to emit anything under lock?  I realize that this
  patch goes and does a ton of writes with mmap_sem held for read -- is
  this ok?
 
 Because all tasks in the container must be frozen during the checkpoint,
 there is no performance penalty for keeping the locks. Although the object
 should not change in the interim anyways, the locks protects us from, e.g.
 the task unfreezing somehow, or being killed by the OOM killer, or any
 other change incurred from the outside world (even future code).
 
 Put in other words - in the long run it is safer to assume that the
 underlying object may otherwise change.
 
 (If we want to drop the lock here before cr_kwrite(), we need to copy the
 data to a temporary buffer first. If we also want to drop mmap_sem(), we
 need to be more careful with following the vma's.)
 
 Do you see a reason to not keeping the locks ?

Mike, although we're doing writes of the checkpoint file here, the *mm*
access is read-only.  We only really need mmap_sem for write if we're
creating new VMAs, which we only do on restore.  Was there an action
taken on the mm that would require a write that we missed?

Oren, I never considered the locking overhead, either.  The fact that
the processes are frozen is very, very important here.  The code is fine
as it stands because this *is* a very simple way to do it.  But, this
probably deserves a comment. 
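
For reference, the copy-then-unlock variant Oren mentions would look roughly
like this (a sketch, not part of the posted patch; cr_kwrite() is from the
series, the rest of the names and the error handling are illustrative):

	void *buf;
	int nldt, ret;

	/* Sketch: snapshot the LDT under the lock, write it out unlocked. */
	mutex_lock(&mm->context.lock);
	nldt = mm->context.size;
	buf = kmemdup(mm->context.ldt, nldt * LDT_ENTRY_SIZE, GFP_KERNEL);
	mutex_unlock(&mm->context.lock);

	if (!buf)
		return -ENOMEM;
	ret = cr_kwrite(ctx, buf, nldt * LDT_ENTRY_SIZE);
	kfree(buf);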

-- Dave



Re: [RFC v11][PATCH 05/13] Dump memory address space

2008-12-18 Thread Dave Hansen
On Thu, 2008-12-18 at 10:15 -0800, Mike Waychison wrote:
 
  +	pgarr = kzalloc(sizeof(*pgarr), GFP_KERNEL);
  +	if (!pgarr)
  +		return NULL;
  +
  +	pgarr->vaddrs = kmalloc(CR_PGARR_TOTAL * sizeof(unsigned long),
  You used PAGE_SIZE / sizeof(void *) above.   Why not __get_free_page()?
  
  Hahaha .. well, it's a guaranteed method to keep Dave Hansen from
  barking about not using kmalloc ...
  
  Personally I prefer __get_free_page() here, but not enough to keep
  arguing with him. Let me know when the two of you settle it :)
 
 Alright, I just wasn't sure if it had been considered.

__get_free_page() sucks.  It doesn't do cool stuff like redzoning when
you have slab debugging turned on.  :)

I would personally suggest never using __get_free_page() unless you
truly need a *PAGE*.  That's an aligned, PAGE_SIZE chunk.  If you
don't need alignment, or don't literally need a 'struct page', don't use
it.
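
Or, in code form (illustrative comparison only; CR_PGARR_TOTAL is from
Oren's patch, the variable names are just examples):

	unsigned long *vaddrs;
	unsigned long addr;

	/* arbitrary-sized buffer: kmalloc() is fine, and slab debugging
	 * (redzoning, poisoning) still applies */
	vaddrs = kmalloc(CR_PGARR_TOTAL * sizeof(unsigned long), GFP_KERNEL);

	/* only when a page-aligned, PAGE_SIZE chunk is genuinely needed */
	addr = __get_free_page(GFP_KERNEL);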

-- Dave



Re: [RFC v11][PATCH 00/13] Kernel based checkpoint/restart

2008-12-16 Thread Dave Hansen
Andrew,

I just realized that you weren't cc'd on these when they were posted.

Can we give them a run in -mm?  As far as I know, all review comments
have been addressed and there's nothing outstanding.

On Fri, 2008-12-05 at 12:31 -0500, Oren Laadan wrote:
 Checkpoint-restart (c/r): fixed races in file handling (comments
 from Al Viro). Updated and tested against v2.6.28-rc7 (feaf384...)
 
 We'd like these to make it into -mm. This version addresses the
 last of the known bugs. Please pull at least the first 11 patches,
 as they are similar to before.
 
 Patches 1-11 are stable, providing self- and external- c/r of a
 single process.
 Patches 12 and 13 are newer, adding support for c/r of multiple
 processes.
 
 The git tree tracking v11, branch 'ckpt-v11' (and older versions):
   git://git.ncl.cs.columbia.edu/pub/git/linux-cr.git
 
 Restarting multiple processes requires 'mktree' userspace tool:
   git://git.ncl.cs.columbia.edu/pub/git/user-cr.git
 
 Oren.
 
 
 --
 Why do we want it?  It allows containers to be moved between physical
 machines' kernels in the same way that VMWare can move VMs between
 physical machines' hypervisors.  There are currently at least two
 out-of-tree implementations of this in the commercial world (IBM's
 Metacluster and Parallels' OpenVZ/Virtuozzo) and several in the academic
 world like Zap.
 
 Why do we need it in mainline now?  Because we already have plenty of
 out-of-tree ones, and we want to know what an in-tree one will be like.  :)
 What *I* want right now is the extra review and scrutiny that comes with
 a mainline submission to make sure we're not going in a direction
 contrary to the community.
 
 This only supports pretty simple apps.  But, I trust Ingo when he says:
"Generally, if something works for simple apps already (in a robust,
compatible and supportable way) and users find it very cool, then
support for more complex apps is not far in the future.  But if you
want to support more complex apps straight away, it takes forever and
gets ugly."
 
 We're *certainly* going to be changing the ABI (which is the format of
 the checkpoint).  I'd like to follow the model that we used for
 ext4-dev, which is to make it very clear that this is a development-only
 feature for now.  Perhaps we do that by making the interface only
 available through debugfs or something similar for now.  Or, reserving
 the syscall numbers but require some runtime switch to be thrown before
 they can be used.  I'm open to suggestions here.
 --
 
 --
 Todo:
 - Add support for x86-64 and improve ABI
 - Refine or change syscall interface
 - Handle multiple namespaces in a container (e.g. save the filesystem
   namespaces state with the file descriptors)
 - Security (without CAP_SYS_ADMIN files restore may fail)
 
 Changelog:
 
 [2008-Dec-05] v11:
   - Use contents of 'init->fs->root' instead of pointing to it
   - Ignore symlinks (there is no such thing as an open symlink)
   - cr_scan_fds() retries from scratch if it hits size limits
   - Add missing test for VM_MAYSHARE when dumping memory
   - Improve documentation about: behavior when tasks aren't frozen,
 life span of the object hash, references to objects in the hash
 
 [2008-Nov-26] v10:
   - Grab vfs root of container init, rather than current process
   - Acquire dcache_lock around call to __d_path() in cr_fill_name()
   - Force end-of-string in cr_read_string() (fix possible DoS)
   - Introduce cr_write_buffer(), cr_read_buffer() and cr_read_buf_type()
 
 [2008-Nov-10] v9:
   - Support multiple processes c/r
   - Extend checkpoint header with an architecture-dependent header
   - Misc bug fixes (see individual changelogs)
   - Rebase to v2.6.28-rc3.
 
 [2008-Oct-29] v8:
   - Support external checkpoint
   - Include Dave Hansen's 'deny-checkpoint' patch
   - Split docs in Documentation/checkpoint/..., and improve contents
 
 [2008-Oct-17] v7:
   - Fix save/restore state of FPU
   - Fix argument given to kunmap_atomic() in memory dump/restore
 
 [2008-Oct-07] v6:
   - Balance all calls to cr_hbuf_get() with matching cr_hbuf_put()
 (even though it's not really needed)
   - Add assumptions and what's-missing to documentation
   - Misc fixes and cleanups
 
 [2008-Sep-11] v5:
   - Config is now 'def_bool n' by default
   - Improve memory dump/restore code (following Dave Hansen's comments)
   - Change dump format (and code) to allow chunks of vaddrs, pages
 instead of one long list of each
   - Fix use of follow_page() to avoid faulting in non-present pages
   - Memory restore now maps user pages explicitly to copy data into them,
 instead of reading directly to user space; got rid of mprotect_fixup()
   - Remove preempt_disable() when restoring debug registers
   - Rename headers files s/ckpt/checkpoint/
   - Fix misc bugs in files dump/restore
   - Fixes and cleanups on some error paths
   - Fix misc coding style
 
 [2008-Sep-09] v4:
   - Various fixes and clean-ups
   - Fix calculation of 

Re: [RFC v11][PATCH 03/13] General infrastructure for checkpoint restart

2008-12-16 Thread Dave Hansen
On Tue, 2008-12-16 at 14:43 -0800, Mike Waychison wrote:
 Hmm, if I'm understanding you correctly, adding ref counts explicitly 
 (like you suggest below)  would be used to let a lower layer defer 
 writes.  Seems like this could be just as easily done with explicit
 kmallocs and transferring ownership of the allocated memory to the 
 in-kernel representation handling layer below (which in turn queues the 
 data structures for writes).

Yup, that's true.  We'd effectively shift the burden of freeing those
buffers into the cr_write() (or whatever we call it) function.  

But, I'm just thinking about the sys_checkpoint() side.  I need to go
look at the restart code too.

-- Dave



Re: [RFC v10][PATCH 05/13] Dump memory address space

2008-12-01 Thread Dave Hansen
On Fri, 2008-11-28 at 10:53 +, Al Viro wrote:
 
  +static int cr_ctx_checkpoint(struct cr_ctx *ctx, pid_t pid)
  +{
  +	ctx->root_pid = pid;
  +
  +	/*
  +	 * assume checkpointer is in container's root vfs
  +	 * FIXME: this works for now, but will change with real containers
  +	 */
  +	ctx->vfsroot = current->fs->root;
  +	path_get(&ctx->vfsroot);
 
 This is going to break as soon as you get another thread doing e.g. chroot(2)
 while you are in there.

Yeah, we do need at least a read_lock(&current->fs->lock) to keep people
from chroot()'ing underneath us.
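
Roughly (a sketch only, not the posted patch; the field names follow
struct fs_struct, and ctx->vfsroot is the struct path from Oren's series):

	struct fs_struct *fs = current->fs;

	read_lock(&fs->lock);		/* keeps chroot()/chdir() out */
	ctx->vfsroot = fs->root;	/* copy the struct path by value */
	path_get(&ctx->vfsroot);	/* pin the dentry and vfsmount */
	read_unlock(&fs->lock);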

 And it's a really, _really_ bad idea to take a
 pointer to shared object, increment refcount on the current *contents* of
 said object and assume that dropping refcount on the later contents of the
 same will balance out.

Absolutely.  I assume you mean get_fs_struct(current) instead of
path_get().

-- Dave



Re: [RFC v10][PATCH 02/13] Checkpoint/restart: initial documentation

2008-12-01 Thread Dave Hansen
On Fri, 2008-11-28 at 10:45 +, Al Viro wrote:
 On Wed, Nov 26, 2008 at 08:04:33PM -0500, Oren Laadan wrote:
  +Currently, namespaces are not saved or restored. They will be treated
  +as a class of a shared object. In particular, it is assumed that the
  +task's file system namespace is the root for the entire container.
  +It is also assumed that the same file system view is available for the
  +restart task(s). Otherwise, a file system snapshot is required.
 
 That is to say, bindings are not handled at all.

Sadly, no.  I'm trying to convince Oren that this is important, but I've
been unable to do so thus far.  I'd appreciate any particularly
pathological cases you can think of.

There are two cases here that worry me.  One is the case where we're
checkpointing a container that has been off in its own mount namespace
doing bind mounts to its little heart's content.  We want to checkpoint
and restore the sucker on the same machine.  In this case, we almost
certainly want the kernel to be doing the restoration of the binds.

The other case is when we're checkpointing, and moving to a completely
different machine.  The new machine may have a completely different disk
layout and *need* the admin doing the sys_restore() to set up those
binds differently because of space constraints or whatever.

For now, we're completely assuming the second case, where the admin
(userspace) is responsible for it, and punting.  Do you think this is
something we should take care of now, early in the process?

  +* What additional work needs to be done to it?
 
  +We know this design can work.  We have two commercial products and a
  +horde of academic projects doing it today using this basic design.
 
 May I use that for a t-shirt, please?  With that quote in foreground, and
 pus-yellow-greenish MACH serving as background.  With the names of products
 and projects dripping from it...

*Functionally* we know this design can work.  Practically, in Linux, I
have no freaking idea.  The hard part isn't getting it working, of
course.  The hard part is getting something that doesn't step on top of
everyone else's toes in the kernel and, at the same time, is actually
*maintainable*.

-- Dave



Re: [RFC v10][PATCH 09/13] Restore open file descriptors

2008-12-01 Thread Dave Hansen
On Mon, 2008-12-01 at 15:41 -0500, Oren Laadan wrote:
  +	fd = cr_attach_file(file);  /* no need to cleanup 'file' below */
  +	if (fd < 0) {
  +		filp_close(file, NULL);
  +		ret = fd;
  +		goto out;
  +	}
  +
  +	/* register new <objref, file> tuple in hash table */
  +	ret = cr_obj_add_ref(ctx, file, parent, CR_OBJ_FILE, 0);
  +	if (ret < 0)
  +		goto out;
  Who said that file still exists at that point?
 
 Correct. This call should move higher up before the call to cr_attach_file()

Is that sufficient?  It seems like we're depending on the fd's reference
to the 'struct file' to keep it valid in the hash.  If something happens
to the fd (like the other thread messing with it) the 'struct file' can
still go away.

Shouldn't we do another get_file() for the hash's reference?
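
In other words, something along these lines (a sketch only; cr_obj_add_ref()
and cr_attach_file() are from Oren's series, and the error handling is
abbreviated):

	/* Sketch: give the objhash its own reference up front. */
	get_file(file);
	ret = cr_obj_add_ref(ctx, file, parent, CR_OBJ_FILE, 0);
	if (ret < 0) {
		fput(file);		/* drop the hash's reference */
		goto out;
	}

	fd = cr_attach_file(file);	/* installs the fd's reference */
	if (fd < 0) {
		filp_close(file, NULL);
		ret = fd;
		goto out;
	}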

-- Dave



Re: [RFC v10][PATCH 09/13] Restore open file descriptors

2008-12-01 Thread Dave Hansen
On Mon, 2008-12-01 at 13:07 -0800, Dave Hansen wrote:
  When a shared object is inserted to the hash we automatically take another
  reference to it (according to its type) for as long as it remains in the
  hash. See:  'cr_obj_ref_grab()' and 'cr_obj_ref_drop()'.  So by moving that
  call higher up, we protect the struct file.
 
 That's kinda (and by kinda I mean really) disgusting.  Hiding that two
 levels deep in what is effectively the hash table code where no one will
 ever see it is really bad.  It also makes you lazy thinking that the
 hash code will just know how to take references on whatever you give to
 it.
 
 I think cr_obj_ref_grab() is hideous obfuscation and needs to die.
 Let's just do the get_file() directly, please.

Well, I at least see why you need it now.  The objhash thing is trying
to be a pretty generic hash implementation and it does need to free the
references up when it is destroyed.  Instead of keeping a hash of
files and a hash of pipes or other shared objects, there's just a
single hash for everything.

One alternative here would be to have an ops-style release function that
gets called instead of what we have now:

static void cr_obj_ref_drop(struct cr_objref *obj)
{
	switch (obj->type) {
	case CR_OBJ_FILE:
		fput((struct file *) obj->ptr);
		break;
	default:
		BUG();
	}
}

static void cr_obj_ref_grab(struct cr_objref *obj)
{
	switch (obj->type) {
	case CR_OBJ_FILE:
		get_file((struct file *) obj->ptr);
		break;
	default:
		BUG();
	}
}

That would make it something like:

struct cr_obj_ops {
	int type;
	void (*release)(struct cr_objref *obj);
};

void cr_release_file(struct cr_objref *obj)
{
	struct file *file = obj->ptr;

	fput(file);	/* the put side of get_file() */
}

struct cr_obj_ops cr_file_ops = {
	.type = CR_OBJ_FILE,
	.release = cr_release_file,
};


And the add operation becomes:

	get_file(file);
	new = cr_obj_add_ptr(ctx, file, objref, &cr_file_ops, 0);

with 'cr_file_ops' basically replacing the CR_OBJ_FILE that got passed
before.

I like that because it only obfuscates what truly needs to be abstracted
out: the release side.  Hiding that get_file() is really tricky.

But, I guess we could also just kill cr_obj_ref_grab(), do the
get_file() explicitly and still keep cr_obj_ref_drop() as it is now.

-- Dave



Re: [RFC v7][PATCH 2/9] General infrastructure for checkpoint restart

2008-10-27 Thread Dave Hansen
On Mon, 2008-10-27 at 07:03 -0400, Oren Laadan wrote:
  In our implementation, we simply refused to checkpoint setid
 programs.
 
 True. And this works very well for HPC applications.
 
 However, it doesn't work so well for server applications, for
 instance.
 
 Also, you could use file system snapshotting to ensure that the file
 system view does not change, and still face the same issue.
 
 So I'm perfectly ok with deferring this discussion to a later time :)

Oren, is this a good place to stick a process_deny_checkpoint()?  Both
so we refuse to checkpoint, and document this as something that has to
be addressed later?

-- Dave



Re: [RFC v7][PATCH 2/9] General infrastructure for checkpoint restart

2008-10-21 Thread Dave Hansen
On Tue, 2008-10-21 at 22:55 -0400, Daniel Jacobowitz wrote:
 I haven't been following - but why this whole container restriction?
 Checkpoint/restart of individual processes is very useful too.
 There are issues with e.g. IPC, but I'm not convinced they're
 substantially different than the issues already present for a
 container.

Containers provide isolation.  Once you have isolation, you have a
discrete set of resources which you can checkpoint/restart.

Let's say you have a process you want to checkpoint.  If it uses a
completely discrete IPC namespace, you *know* that nothing else depends
on those IPC ids.  We don't even have to worry about who might have been
using them and when.

Also think about pids.  Without containers, how can you guarantee a
restarted process that it can regain the same pid?

-- Dave
