[PATCH 2/2] s390/mm: enable fixup_user_fault retrying

2016-01-04 Thread Dominik Dingel
By passing a non-NULL unlocked pointer we allow fixup_user_fault to retry,
which enables userfaultfd.  As the mmap_sem may be dropped during these
retries, we need to check whether that happened and, if so, redo the complete
chain of actions.

Signed-off-by: Dominik Dingel <din...@linux.vnet.ibm.com>
---
 arch/s390/mm/pgtable.c | 29 ++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b15759c..3c5456d 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -578,17 +578,29 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
 {
unsigned long vmaddr;
int rc;
+   bool unlocked;
 
down_read(&gmap->mm->mmap_sem);
+
+retry:
+   unlocked = false;
vmaddr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(vmaddr)) {
rc = vmaddr;
goto out_up;
}
-   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, NULL)) {
+   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+     &unlocked)) {
rc = -EFAULT;
goto out_up;
}
+   /*
+* In the case that fixup_user_fault unlocked the mmap_sem during
+* faultin redo __gmap_translate to not race with a map/unmap_segment.
+*/
+   if (unlocked)
+   goto retry;
+
rc = __gmap_link(gmap, gaddr, vmaddr);
 out_up:
up_read(&gmap->mm->mmap_sem);
@@ -717,12 +729,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
spinlock_t *ptl;
pte_t *ptep, entry;
pgste_t pgste;
+   bool unlocked;
int rc = 0;
 
if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
while (len) {
+   unlocked = false;
/* Convert gmap address and connect the page tables */
addr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(addr)) {
@@ -731,10 +745,13 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
}
/* Get the page mapped */
if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-NULL)) {
+     &unlocked)) {
rc = -EFAULT;
break;
}
+   /* While trying to map mmap_sem got unlocked. Let us retry */
+   if (unlocked)
+   continue;
rc = __gmap_link(gmap, gaddr, addr);
if (rc)
break;
@@ -795,9 +812,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
spinlock_t *ptl;
pgste_t old, new;
pte_t *ptep;
+   bool unlocked;
 
down_read(&mm->mmap_sem);
 retry:
+   unlocked = false;
ptep = get_locked_pte(mm, addr, &ptl);
if (unlikely(!ptep)) {
up_read(&mm->mmap_sem);
@@ -806,8 +825,12 @@ retry:
if (!(pte_val(*ptep) & _PAGE_INVALID) &&
 (pte_val(*ptep) & _PAGE_PROTECT)) {
pte_unmap_unlock(ptep, ptl);
+   /*
+* We do not really care about unlocked. We will retry either
+* way. But this allows fixup_user_fault to enable userfaultfd.
+*/
if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
-NULL)) {
+     &unlocked)) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
-- 
2.3.0
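
For reference, a condensed sketch of gmap_fault() as it reads with the hunks
above applied (reconstructed from the diff for readability; whitespace and
surrounding code are approximated, this is not a substitute for the patch):

int gmap_fault(struct gmap *gmap, unsigned long gaddr,
               unsigned int fault_flags)
{
        unsigned long vmaddr;
        int rc;
        bool unlocked;

        down_read(&gmap->mm->mmap_sem);

retry:
        unlocked = false;
        vmaddr = __gmap_translate(gmap, gaddr);
        if (IS_ERR_VALUE(vmaddr)) {
                rc = vmaddr;
                goto out_up;
        }
        /* may drop and re-take the mmap_sem; reports that via unlocked */
        if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
                             &unlocked)) {
                rc = -EFAULT;
                goto out_up;
        }
        /*
         * If the mmap_sem was dropped while faulting in the page, the gmap
         * may have changed; redo __gmap_translate so we do not race with a
         * concurrent map/unmap_segment.
         */
        if (unlocked)
                goto retry;

        rc = __gmap_link(gmap, gaddr, vmaddr);
out_up:
        up_read(&gmap->mm->mmap_sem);
        return rc;
}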



[PATCH 1/2] mm: bring in additional flag for fixup_user_fault to signal unlock

2016-01-04 Thread Dominik Dingel
With the introduction of userfaultfd, kvm on s390 needs fixup_user_fault to
pass in FAULT_FLAG_ALLOW_RETRY and give feedback if during the faulting we
ever unlocked mmap_sem.

This patch brings in the logic to handle retries and also cleans up the
current documentation.  fixup_user_fault did not have the same semantics as
filemap_fault: it never indicated whether a retry happened, so a caller was
not able to handle that case.  We therefore change the behaviour so that,
when the caller allows it, fixup_user_fault retries on its own and re-takes
the mmap_sem when it was dropped.

Signed-off-by: Dominik Dingel <din...@linux.vnet.ibm.com>
---
 arch/s390/mm/pgtable.c |  8 +---
 include/linux/mm.h |  5 +++--
 kernel/futex.c |  2 +-
 mm/gup.c   | 30 +-
 4 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 54ef3bc..b15759c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -585,7 +585,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
rc = vmaddr;
goto out_up;
}
-   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
+   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, NULL)) {
rc = -EFAULT;
goto out_up;
}
@@ -730,7 +730,8 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
break;
}
/* Get the page mapped */
-   if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
+   if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
+NULL)) {
rc = -EFAULT;
break;
}
@@ -805,7 +806,8 @@ retry:
if (!(pte_val(*ptep) & _PAGE_INVALID) &&
 (pte_val(*ptep) & _PAGE_PROTECT)) {
pte_unmap_unlock(ptep, ptl);
-   if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
+   if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
+NULL)) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad77..7783073 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1163,7 +1163,8 @@ int invalidate_inode_page(struct page *page);
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-   unsigned long address, unsigned int fault_flags);
+   unsigned long address, unsigned int fault_flags,
+   bool *unlocked);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
@@ -1175,7 +1176,7 @@ static inline int handle_mm_fault(struct mm_struct *mm,
 }
 static inline int fixup_user_fault(struct task_struct *tsk,
struct mm_struct *mm, unsigned long address,
-   unsigned int fault_flags)
+   unsigned int fault_flags, bool *unlocked)
 {
/* should never happen if there's no MMU */
BUG();
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d754..fb640c5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -639,7 +639,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
-  FAULT_FLAG_WRITE);
+  FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
 
return ret < 0 ? ret : 0;
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c..493d543 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -564,6 +564,8 @@ EXPORT_SYMBOL(__get_user_pages);
  * @mm:mm_struct of target mm
  * @address:   user address
  * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked:  did we unlock the mmap_sem while retrying, maybe NULL if caller
+ * does not allow retry
  *
  * This is meant to be called in the specific scenario where for locking 
reasons
  * we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,22 +577,28 @@ EXPORT_SYMBOL(__get_user_pages);
  * The main difference with get_user_pages() is that this function will
  * unconditionally call handle_mm_fault() which will in turn perform all the
  * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
  *
  * This is important for some architectures where those bits also gate the
  * access permission to the page because they are maintained in s
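
The mm/gup.c hunk is truncated in this archive copy.  Based on the commit
message and the v2 -> v3 changelog (re-check the vma on every retry, account
major/minor faults only once), the retry logic inside fixup_user_fault()
roughly takes the shape sketched below; this is a reconstruction from those
descriptions, not the verbatim hunk:

int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
                     unsigned long address, unsigned int fault_flags,
                     bool *unlocked)
{
        struct vm_area_struct *vma;
        vm_flags_t vm_flags;
        int ret, major = 0;

        /* a non-NULL unlocked pointer is what allows retrying at all */
        if (unlocked)
                fault_flags |= FAULT_FLAG_ALLOW_RETRY;

retry:
        /* re-checked on every retry, the vma may have changed meanwhile */
        vma = find_extend_vma(mm, address);
        if (!vma || address < vma->vm_start)
                return -EFAULT;

        vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ;
        if (!(vm_flags & vma->vm_flags))
                return -EFAULT;

        ret = handle_mm_fault(mm, vma, address, fault_flags);
        major |= ret & VM_FAULT_MAJOR;
        if (ret & VM_FAULT_ERROR) {
                if (ret & VM_FAULT_OOM)
                        return -ENOMEM;
                if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
                        return -EHWPOISON;
                if (ret & VM_FAULT_SIGBUS)
                        return -EFAULT;
                BUG();
        }

        if (ret & VM_FAULT_RETRY) {
                /* handle_mm_fault() dropped the mmap_sem, take it again */
                down_read(&mm->mmap_sem);
                if (!(fault_flags & FAULT_FLAG_TRIED)) {
                        *unlocked = true;
                        fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
                        fault_flags |= FAULT_FLAG_TRIED;
                        goto retry;
                }
        }

        /* accounting happens once, after the fault finally succeeded */
        if (tsk) {
                if (major)
                        tsk->maj_flt++;
                else
                        tsk->min_flt++;
        }
        return 0;
}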

[PATCH v3 0/2] Allow gmap fault to retry

2016-01-04 Thread Dominik Dingel
Hello,

sorry for the delay since the last version.

During Jason's work with postcopy migration support for s390 a problem regarding
gmap faults was discovered.

The gmap code will call fixup_user_fault, which always ends up in
handle_mm_fault. Until now we never cared about retries, but as the
userfaultfd code relies on them, this needs a fix.

This patchset does not take care of the futex code. I will now look closer at
this.

Thanks,
Dominik

v2 -> v3:
- In case of retrying check vma again
- Do the accounting of major/minor faults once

v1 -> v2:
- Instead of passing VM_FAULT_RETRY back from fixup_user_fault we do retries
  within fixup_user_fault, like get_user_pages_locked does.
- gmap code will now take retry if fixup_user_fault drops the lock.

Dominik Dingel (2):
  mm: bring in additional flag for fixup_user_fault to signal unlock
  s390/mm: enable fixup_user_fault retrying

 arch/s390/mm/pgtable.c | 31 ---
 include/linux/mm.h |  5 +++--
 kernel/futex.c |  2 +-
 mm/gup.c   | 30 +-
 4 files changed, 57 insertions(+), 11 deletions(-)

-- 
2.3.0

[PATCH v2 0/2] Allow gmap fault to retry

2015-11-26 Thread Dominik Dingel
Hello,

during Jason's work with postcopy migration support for s390 a problem regarding
gmap faults was discovered.

The gmap code will call fixup_user_fault, which always ends up in
handle_mm_fault. Until now we never cared about retries, but as the
userfaultfd code relies on them, this needs a fix.

Thanks,
Dominik

v1 -> v2:
- Instead of passing VM_FAULT_RETRY back from fixup_user_fault we do retries
  within fixup_user_fault, like get_user_pages_locked does.
- gmap code will now take retry if fixup_user_fault drops the lock

Dominik Dingel (2):
  mm: bring in additional flag for fixup_user_fault to signal unlock
  s390/mm: enable fixup_user_fault retrying

 arch/s390/mm/pgtable.c | 31 ---
 include/linux/mm.h |  5 +++--
 kernel/futex.c |  2 +-
 mm/gup.c   | 25 +
 4 files changed, 53 insertions(+), 10 deletions(-)

-- 
2.3.9



[PATCH 1/2] mm: bring in additional flag for fixup_user_fault to signal unlock

2015-11-26 Thread Dominik Dingel
With the introduction of userfaultfd, kvm on s390 needs fixup_user_fault to
pass in FAULT_FLAG_ALLOW_RETRY and give feedback if during the faulting we
ever unlocked mmap_sem.

This patch brings in the logic to handle retries and also cleans up the
current documentation.  fixup_user_fault did not have the same semantics as
filemap_fault: it never indicated whether a retry happened, so a caller was
not able to handle that case.  We therefore change the behaviour so that,
when the caller allows it, fixup_user_fault retries on its own and re-takes
the mmap_sem when it was dropped.

Signed-off-by: Dominik Dingel <din...@linux.vnet.ibm.com>
---
 arch/s390/mm/pgtable.c |  8 +---
 include/linux/mm.h |  5 +++--
 kernel/futex.c |  2 +-
 mm/gup.c   | 25 +
 4 files changed, 30 insertions(+), 10 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 54ef3bc..b15759c 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -585,7 +585,7 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
rc = vmaddr;
goto out_up;
}
-   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
+   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, NULL)) {
rc = -EFAULT;
goto out_up;
}
@@ -730,7 +730,8 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
break;
}
/* Get the page mapped */
-   if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
+   if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
+NULL)) {
rc = -EFAULT;
break;
}
@@ -805,7 +806,8 @@ retry:
if (!(pte_val(*ptep) & _PAGE_INVALID) &&
 (pte_val(*ptep) & _PAGE_PROTECT)) {
pte_unmap_unlock(ptep, ptl);
-   if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
+   if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
+NULL)) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad77..7783073 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1163,7 +1163,8 @@ int invalidate_inode_page(struct page *page);
 extern int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, unsigned int flags);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
-   unsigned long address, unsigned int fault_flags);
+   unsigned long address, unsigned int fault_flags,
+   bool *unlocked);
 #else
 static inline int handle_mm_fault(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
@@ -1175,7 +1176,7 @@ static inline int handle_mm_fault(struct mm_struct *mm,
 }
 static inline int fixup_user_fault(struct task_struct *tsk,
struct mm_struct *mm, unsigned long address,
-   unsigned int fault_flags)
+   unsigned int fault_flags, bool *unlocked)
 {
/* should never happen if there's no MMU */
BUG();
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d754..fb640c5 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -639,7 +639,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
 
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
-  FAULT_FLAG_WRITE);
+  FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
 
return ret < 0 ? ret : 0;
diff --git a/mm/gup.c b/mm/gup.c
index deafa2c..4ed35a3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -564,6 +564,8 @@ EXPORT_SYMBOL(__get_user_pages);
  * @mm:mm_struct of target mm
  * @address:   user address
  * @fault_flags:flags to pass down to handle_mm_fault()
+ * @unlocked:  did we unlock the mmap_sem while retrying, maybe NULL if caller
+ * does not allow retry
  *
  * This is meant to be called in the specific scenario where for locking 
reasons
  * we try to access user memory in atomic context (within a pagefault_disable()
@@ -575,17 +577,19 @@ EXPORT_SYMBOL(__get_user_pages);
  * The main difference with get_user_pages() is that this function will
  * unconditionally call handle_mm_fault() which will in turn perform all the
  * necessary SW fixup of the dirty and young bits in the PTE, while
- * handle_mm_fault() only guarantees to update these in the struct page.
+ * get_user_pages() only guarantees to update these in the struct page.
  *
  * This is important for some architectures where those bits also gate the
  * access permission to the page because they are maintained in s

[PATCH 2/2] s390/mm: enable fixup_user_fault retrying

2015-11-26 Thread Dominik Dingel
By passing a non-NULL unlocked pointer we allow fixup_user_fault to retry,
which enables userfaultfd.  As the mmap_sem may be dropped during these
retries, we need to check whether that happened and, if so, redo the complete
chain of actions.

Signed-off-by: Dominik Dingel <din...@linux.vnet.ibm.com>
---
 arch/s390/mm/pgtable.c | 29 ++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b15759c..3c5456d 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -578,17 +578,29 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
 {
unsigned long vmaddr;
int rc;
+   bool unlocked;
 
down_read(&gmap->mm->mmap_sem);
+
+retry:
+   unlocked = false;
vmaddr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(vmaddr)) {
rc = vmaddr;
goto out_up;
}
-   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags, NULL)) {
+   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags,
+     &unlocked)) {
rc = -EFAULT;
goto out_up;
}
+   /*
+* In the case that fixup_user_fault unlocked the mmap_sem during
+* faultin redo __gmap_translate to not race with a map/unmap_segment.
+*/
+   if (unlocked)
+   goto retry;
+
rc = __gmap_link(gmap, gaddr, vmaddr);
 out_up:
up_read(&gmap->mm->mmap_sem);
@@ -717,12 +729,14 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
spinlock_t *ptl;
pte_t *ptep, entry;
pgste_t pgste;
+   bool unlocked;
int rc = 0;
 
if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
while (len) {
+   unlocked = false;
/* Convert gmap address and connect the page tables */
addr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(addr)) {
@@ -731,10 +745,13 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
}
/* Get the page mapped */
if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
-NULL)) {
+     &unlocked)) {
rc = -EFAULT;
break;
}
+   /* While trying to map mmap_sem got unlocked. Let us retry */
+   if (unlocked)
+   continue;
rc = __gmap_link(gmap, gaddr, addr);
if (rc)
break;
@@ -795,9 +812,11 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
spinlock_t *ptl;
pgste_t old, new;
pte_t *ptep;
+   bool unlocked;
 
down_read(&mm->mmap_sem);
 retry:
+   unlocked = false;
ptep = get_locked_pte(mm, addr, &ptl);
if (unlikely(!ptep)) {
up_read(&mm->mmap_sem);
@@ -806,8 +825,12 @@ retry:
if (!(pte_val(*ptep) & _PAGE_INVALID) &&
 (pte_val(*ptep) & _PAGE_PROTECT)) {
pte_unmap_unlock(ptep, ptl);
+   /*
+* We do not really care about unlocked. We will retry either
+* way. But this allows fixup_user_fault to enable userfaultfd.
+*/
if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE,
-NULL)) {
+     &unlocked)) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
-- 
2.3.9
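
For reference, the retry handling this patch adds to gmap_ipte_notify()'s main
loop, condensed from the hunks above; the pte walking and notifier setup after
__gmap_link() are unchanged by the patch and only hinted at here:

        down_read(&gmap->mm->mmap_sem);
        while (len) {
                unlocked = false;
                /* Convert gmap address and connect the page tables */
                addr = __gmap_translate(gmap, gaddr);
                if (IS_ERR_VALUE(addr)) {
                        rc = addr;
                        break;
                }
                /* Get the page mapped; may drop and re-take the mmap_sem */
                if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
                                     &unlocked)) {
                        rc = -EFAULT;
                        break;
                }
                /* mmap_sem was dropped while faulting in: retranslate first */
                if (unlocked)
                        continue;
                rc = __gmap_link(gmap, gaddr, addr);
                if (rc)
                        break;
                /*
                 * ... set the notification bit in the pte and advance
                 * gaddr/len as before (unchanged, not shown in the hunk) ...
                 */
        }
        up_read(&gmap->mm->mmap_sem);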

Re: [PATCH 2/2] s390/mm: allow gmap code to retry on faulting in guest memory

2015-11-19 Thread Dominik Dingel
On Thu, 19 Nov 2015 09:25:24 +0100
Christian Borntraeger  wrote:

> On 11/19/2015 09:18 AM, Martin Schwidefsky wrote:
> > On Thu, 19 Nov 2015 00:49:58 +0100
> > Dominik Dingel  wrote:
> > 
> >> The userfaultfd does need FAULT_FLAG_ALLOW_RETRY to not return
> >> VM_FAULT_SIGBUS.  So we improve the gmap code to handle one
> >> VM_FAULT_RETRY.
> >>
> >> Signed-off-by: Dominik Dingel 
> >> ---
> >>  arch/s390/mm/pgtable.c | 28 
> >>  1 file changed, 24 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
> >> index 54ef3bc..8a0025d 100644
> >> --- a/arch/s390/mm/pgtable.c
> >> +++ b/arch/s390/mm/pgtable.c
> >> @@ -577,15 +577,22 @@ int gmap_fault(struct gmap *gmap, unsigned long 
> >> gaddr,
> >>   unsigned int fault_flags)
> >>  {
> >>unsigned long vmaddr;
> >> -  int rc;
> >> +  int rc, fault;
> >>
> >> +  fault_flags |= FAULT_FLAG_ALLOW_RETRY;
> >> +retry:
> >>down_read(&gmap->mm->mmap_sem);
> >>vmaddr = __gmap_translate(gmap, gaddr);
> >>if (IS_ERR_VALUE(vmaddr)) {
> >>rc = vmaddr;
> >>goto out_up;
> >>}
> >> -  if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
> >> +  fault = fixup_user_fault(current, gmap->mm, vmaddr, fault_flags);
> >> +  if (fault & VM_FAULT_RETRY) {
> >> +  fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
> >> +  fault_flags |= FAULT_FLAG_TRIED;
> >> +  goto retry;
> >> +  } else if (fault) {
> >>rc = -EFAULT;
> >>goto out_up;
> >>}
> > 
> > Me thinks that you want to add the retry code into fixup_user_fault itself.
> > You basically have the same code around the three calls to fixup_user_fault.
> > Yes, it will be a common code patch but I guess that it will be acceptable
> > given userfaultfd as a reason.
> 
> That makes a lot of sense. In an earlier discussion (a followup of Jasons
> mm: Loosen MADV_NOHUGEPAGE to enable Qemu postcopy on s390) patch.
> 
> Andrea suggested the following:
> 
> It's probably better to add a fixup_user_fault_unlocked that will work
> like get_user_pages_unlocked. I.e. leaves the details of the mmap_sem
> locking internally to the function, and will handle VM_FAULT_RETRY
> automatically by re-taking the mmap_sem and repeating the
> fixup_user_fault after updating the FAULT_FLAG_ALLOW_RETRY to
> FAULT_FLAG_TRIED.

I know, I saw his mail. But within the gmap code we need to take the mmap_sem
before calling fixup_user_fault, and we also need to keep holding it for later
calls like __gmap_link.

We could introduce a new wrapper around fixup_user_fault, say
fixup_user_fault_retry, which would take care of the retry logic but would not
encapsulate the complete mmap_sem logic.

@Kirill would that be acceptable for you as well?
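
For illustration, Andrea's suggestion quoted above corresponds roughly to a
wrapper like the sketch below (hypothetical, never part of the posted series;
it assumes the v1 behaviour where fixup_user_fault() can pass VM_FAULT_RETRY
back to the caller).  As noted in the reply, it would not fit the gmap code
as-is, because gmap_fault() needs the mmap_sem held both before the call (for
__gmap_translate) and after it (for __gmap_link):

/* hypothetical; name and shape follow get_user_pages_unlocked() */
static int fixup_user_fault_unlocked(struct task_struct *tsk,
                                     struct mm_struct *mm,
                                     unsigned long address,
                                     unsigned int fault_flags)
{
        int fault;

        fault_flags |= FAULT_FLAG_ALLOW_RETRY;
        down_read(&mm->mmap_sem);
        fault = fixup_user_fault(tsk, mm, address, fault_flags);
        if (fault < 0) {                        /* -EFAULT, -ENOMEM, ... */
                up_read(&mm->mmap_sem);
                return fault;
        }
        if (fault & VM_FAULT_RETRY) {
                /* handle_mm_fault() already dropped the mmap_sem for us */
                down_read(&mm->mmap_sem);
                fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
                fault_flags |= FAULT_FLAG_TRIED;
                fault = fixup_user_fault(tsk, mm, address, fault_flags);
                if (fault < 0) {
                        up_read(&mm->mmap_sem);
                        return fault;
                }
        }
        up_read(&mm->mmap_sem);
        return 0;
}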

[PATCH 1/2] mm: fixup_userfault returns VM_FAULT_RETRY if asked

2015-11-18 Thread Dominik Dingel
When calling fixup_user_fault with FAULT_FLAG_ALLOW_RETRY, fixup_user_fault
didn't care about VM_FAULT_RETRY and returned 0.  Now, if the VM_FAULT_RETRY
flag is set, we return the complete result of handle_mm_fault.

Signed-off-by: Dominik Dingel <din...@linux.vnet.ibm.com>
---
 mm/gup.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/gup.c b/mm/gup.c
index deafa2c..2af3b31 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -609,6 +609,8 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -EFAULT;
BUG();
}
+   if (ret & VM_FAULT_RETRY)
+   return ret;
if (tsk) {
if (ret & VM_FAULT_MAJOR)
tsk->maj_flt++;
-- 
2.3.9



[PATCH 0/2] Allow gmap fault to retry

2015-11-18 Thread Dominik Dingel
Hello,

during Jason's work with postcopy migration support for s390 a problem regarding
gmap faults was discovered.

The gmap code will call fixup_user_fault, which always ends up in
handle_mm_fault. Until now we never cared about retries, but as the
userfaultfd code relies on them, this needed some fix. This patchset includes
the retry logic for gmap fault scenarios, as well as passing back
VM_FAULT_RETRY from fixup_user_fault.

Thanks,
Dominik

Dominik Dingel (2):
  mm: fixup_userfault returns VM_FAULT_RETRY if asked
  s390/mm: allow gmap code to retry on faulting in guest memory

 arch/s390/mm/pgtable.c | 28 
 mm/gup.c   |  2 ++
 2 files changed, 26 insertions(+), 4 deletions(-)

-- 
2.3.9



[PATCH 2/2] s390/mm: allow gmap code to retry on faulting in guest memory

2015-11-18 Thread Dominik Dingel
The userfaultfd code needs FAULT_FLAG_ALLOW_RETRY in order not to return
VM_FAULT_SIGBUS.  So we improve the gmap code to handle one
VM_FAULT_RETRY.

Signed-off-by: Dominik Dingel <din...@linux.vnet.ibm.com>
---
 arch/s390/mm/pgtable.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 54ef3bc..8a0025d 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -577,15 +577,22 @@ int gmap_fault(struct gmap *gmap, unsigned long gaddr,
   unsigned int fault_flags)
 {
unsigned long vmaddr;
-   int rc;
+   int rc, fault;
 
+   fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+retry:
down_read(&gmap->mm->mmap_sem);
vmaddr = __gmap_translate(gmap, gaddr);
if (IS_ERR_VALUE(vmaddr)) {
rc = vmaddr;
goto out_up;
}
-   if (fixup_user_fault(current, gmap->mm, vmaddr, fault_flags)) {
+   fault = fixup_user_fault(current, gmap->mm, vmaddr, fault_flags);
+   if (fault & VM_FAULT_RETRY) {
+   fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+   fault_flags |= FAULT_FLAG_TRIED;
+   goto retry;
+   } else if (fault) {
rc = -EFAULT;
goto out_up;
}
@@ -717,10 +724,13 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
spinlock_t *ptl;
pte_t *ptep, entry;
pgste_t pgste;
+   int fault, fault_flags;
int rc = 0;
 
+   fault_flags = FAULT_FLAG_WRITE | FAULT_FLAG_ALLOW_RETRY;
if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
return -EINVAL;
+retry:
down_read(&gmap->mm->mmap_sem);
while (len) {
/* Convert gmap address and connect the page tables */
@@ -730,7 +740,12 @@ int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
break;
}
/* Get the page mapped */
-   if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE)) {
+   fault = fixup_user_fault(current, gmap->mm, addr, fault_flags);
+   if (fault & VM_FAULT_RETRY) {
+   fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+   fault_flags |= FAULT_FLAG_TRIED;
+   goto retry;
+   } else if (fault) {
rc = -EFAULT;
break;
}
@@ -794,7 +809,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
spinlock_t *ptl;
pgste_t old, new;
pte_t *ptep;
+   int fault, fault_flags;
 
+   fault_flags = FAULT_FLAG_WRITE | FAULT_FLAG_ALLOW_RETRY;
down_read(&mm->mmap_sem);
 retry:
ptep = get_locked_pte(mm, addr, &ptl);
@@ -805,10 +822,13 @@ retry:
if (!(pte_val(*ptep) & _PAGE_INVALID) &&
 (pte_val(*ptep) & _PAGE_PROTECT)) {
pte_unmap_unlock(ptep, ptl);
-   if (fixup_user_fault(current, mm, addr, FAULT_FLAG_WRITE)) {
+   fault = fixup_user_fault(current, mm, addr, fault_flags);
+   if (fault && !(fault & VM_FAULT_RETRY)) {
up_read(&mm->mmap_sem);
return -EFAULT;
}
+   fault_flags &= ~FAULT_FLAG_ALLOW_RETRY;
+   fault_flags |= FAULT_FLAG_TRIED;
goto retry;
}
 
-- 
2.3.9

perf: Link error on non x86 with sample_reg_masks

2015-09-30 Thread Dominik Dingel
Greetings,

With 4.3-rc3, during the build of perf (on s390), I get the following error:

libperf.a(libperf-in.o): In function `parse_regs':
/home/dingel/GIT/linux/tools/perf/util/parse-regs-options.c:28: undefined 
reference to `sample_reg_masks'
/home/dingel/GIT/linux/tools/perf/util/parse-regs-options.c:45: undefined 
reference to `sample_reg_masks'
/home/dingel/GIT/linux/tools/perf/util/parse-regs-options.c:38: undefined 
reference to `sample_reg_masks'
collect2: error: ld returned 1 exit status
Makefile.perf:306: recipe for target 'perf' failed
make[1]: *** [perf] Error 1
Makefile:68: recipe for target 'all' failed
make: *** [all] Error 2

By doing the same thing as:

commit af4aeadd8c04303c0aa2d112145c3627e2ebd026
Author: Stephane Eranian 
Date:   Tue Sep 1 11:30:14 2015 +0200

perf tools: Fix link time error with sample_reg_masks on non x86

does:

--- a/tools/perf/util/parse-regs-options.c
+++ b/tools/perf/util/parse-regs-options.c
@@ -4,6 +4,10 @@
 #include "util/parse-options.h"
 #include "util/parse-regs-options.h"
 
+const struct sample_reg __weak sample_reg_masks[] = {
+   SMPL_REG_END
+};
+

I was able to fix the problem, but is it the right thing to do?

Thanks,
Dominik
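
For context on why the weak definition fixes the link error: a weak symbol is
only a fallback that the linker uses when no strong definition exists, so an
architecture without its own sample_reg_masks table gets the empty one, while
an architecture that provides a strong definition still wins.  A minimal,
stand-alone illustration of the mechanism (generic C with GCC/clang
attributes; this is not the actual perf source layout):

/* fallback.c */
#include <stdio.h>

struct sample_reg {
        const char *name;
        unsigned long mask;
};

/* table terminator, in the spirit of perf's SMPL_REG_END */
#define SMPL_REG_END { .name = NULL, .mask = 0 }

/* __attribute__((weak)) is what the kernel's __weak macro expands to */
const struct sample_reg __attribute__((weak)) sample_reg_masks[] = {
        SMPL_REG_END,
};

int main(void)
{
        const struct sample_reg *r;

        for (r = sample_reg_masks; r->name; r++)
                printf("%s: %#lx\n", r->name, r->mask);
        printf("(end of table)\n");
        return 0;
}

/*
 * arch.c -- optional strong definition; if an object file containing this is
 * linked in, it overrides the weak table above:
 *
 * const struct sample_reg sample_reg_masks[] = {
 *         { .name = "r0", .mask = 1UL << 0 },
 *         { .name = NULL, .mask = 0 },
 * };
 */

Without any definition at all, every reference fails at link time with exactly
the "undefined reference to `sample_reg_masks'" error above, which is why a
weak fallback definition resolves it.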

Re: [PATCH] sched: access local runqueue directly in single_task_running

2015-09-18 Thread Dominik Dingel
On Fri, 18 Sep 2015 13:26:53 +0200
Paolo Bonzini  wrote:

> 
> 
> On 18/09/2015 11:27, Dominik Dingel wrote:
> > Commit 2ee507c47293 ("sched: Add function single_task_running to let a task
> > check if it is the only task running on a cpu") referenced the current
> > runqueue with the smp_processor_id.  When CONFIG_DEBUG_PREEMPT is enabled,
> > that is only allowed if preemption is disabled or the currrent task is
> > bound to the local cpu (e.g. kernel worker).
> > 
> > With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM
> > calls single_task_running. If CONFIG_DEBUG_PREEMPT is enabled that
> > generates a lot of kernel messages.
> > 
> > To avoid adding preemption in that cases, as it would limit the usefulness,
> > we change single_task_running to access directly the cpu local runqueue.
> > 
> > Cc: Tim Chen 
> > Suggested-by: Peter Zijlstra 
> > Cc:  # 4.2.x
> > Signed-off-by: Dominik Dingel 
> > ---
> >  kernel/sched/core.c | 8 
> >  1 file changed, 4 insertions(+), 4 deletions(-)
> > 
> > diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> > index 78b4bad10..5bfad0b 100644
> > --- a/kernel/sched/core.c
> > +++ b/kernel/sched/core.c
> > @@ -2614,13 +2614,13 @@ unsigned long nr_running(void)
> >  
> >  /*
> >   * Check if only the current task is running on the cpu.
> > + *
> > + * Caution result is subject to time-of-check-to-time-of-use race,
> > + * every caller is responsible to set up additional fences if necessary.
> 
> Let's expand it a bit more:
> 
>  * Caution: this function does not check that the caller has disabled
>  * preemption, thus the result might have a time-of-check-to-time-of-use
>  * race.  The caller is responsible to use this correctly, for example:
>  *
>  * - use it from a non-preemptable section
>  *
>  * - use it from a thread that is bound to a single CPU
>  *
>  * - use it in a loop where each iteration takes very little time
>  *   (e.g. a polling loop)
>  */
> 
> I'll include it in my pull request.

Sounds really good.
Thank you!

> Paolo
> 
> >   */
> >  bool single_task_running(void)
> >  {
> > -   if (cpu_rq(smp_processor_id())->nr_running == 1)
> > -   return true;
> > -   else
> > -   return false;
> > +   return raw_rq()->nr_running == 1;
> >  }
> >  EXPORT_SYMBOL(single_task_running);
> >  
> > 
> 



[PATCH] sched: access local runqueue directly in single_task_running

2015-09-18 Thread Dominik Dingel
Commit 2ee507c47293 ("sched: Add function single_task_running to let a task
check if it is the only task running on a cpu") referenced the current
runqueue with the smp_processor_id.  When CONFIG_DEBUG_PREEMPT is enabled,
that is only allowed if preemption is disabled or the currrent task is
bound to the local cpu (e.g. kernel worker).

With commit f78195129963 ("kvm: add halt_poll_ns module parameter") KVM
calls single_task_running. If CONFIG_DEBUG_PREEMPT is enabled that
generates a lot of kernel messages.

To avoid adding preemption in those cases, as it would limit the usefulness,
we change single_task_running to directly access the CPU-local runqueue.

Cc: Tim Chen 
Suggested-by: Peter Zijlstra 
Cc:  # 4.2.x
Signed-off-by: Dominik Dingel 
---
 kernel/sched/core.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 78b4bad10..5bfad0b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2614,13 +2614,13 @@ unsigned long nr_running(void)
 
 /*
  * Check if only the current task is running on the cpu.
+ *
+ * Caution result is subject to time-of-check-to-time-of-use race,
+ * every caller is responsible to set up additional fences if necessary.
  */
 bool single_task_running(void)
 {
-   if (cpu_rq(smp_processor_id())->nr_running == 1)
-   return true;
-   else
-   return false;
+   return raw_rq()->nr_running == 1;
 }
 EXPORT_SYMBOL(single_task_running);
 
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: single_task_running() vs. preemption warnings (was Re: [PATCH] kvm: fix preemption warnings in kvm_vcpu_block)

2015-09-17 Thread Dominik Dingel
On Thu, 17 Sep 2015 18:45:00 +0200
Paolo Bonzini  wrote:

> 
> 
> On 17/09/2015 18:27, Dominik Dingel wrote:
> > +   preempt_disable();
> > +   solo = single_task_running();
> > +   preempt_enable();
> > +
> > cur = ktime_get();
> > -   } while (single_task_running() && ktime_before(cur, stop));
> 
> That's the obvious way to fix it, but the TOCTTOU problem (which was in
> the buggy code too) is obvious too. :)  And the only other user of
> single_task_running() in drivers/crypto/mcryptd.c has the same issue.

Right, worst case we go another round.

I am not sure about the case for mcryptd.c. I think it might be that the worker
there is bound to one cpu and will not be migrated.

I really need to look more into the details of what is happening with that worker.

> In fact, because of the way the function is used ("maybe I can do a
> little bit of work before going to sleep") it will likely be called many
> times in a loop.  This in turn means that:
> 
> - any wrong result due to a concurrent process migration would be
> rectified very soon
> 
> - preempt_disable()/preempt_enable() can actually be just as expensive
> or more expensive than single_task_running() itself.
> 
> Therefore, I wonder if single_task_running() should just use
> raw_smp_processor_id().  At least the TOCTTOU issue can be clearly
> documented in the function comment, instead of being hidden behind each
> of the callers.

Yes, to be useful it should probably call raw_smp_processor_id, and as a lot
of code already does just that, I do not really see many downsides.

@Tim, would it be okay if I change single_task_running and add a specific 
comment on top?
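
Just to make the idea concrete, a minimal sketch of what I have in mind
(reusing the existing raw_rq() helper; the exact comment wording is still
open, so this is not the final patch):

bool single_task_running(void)
{
	/*
	 * Caution: the result is only a snapshot and subject to a
	 * time-of-check-to-time-of-use race; callers must tolerate
	 * stale results.
	 */
	return raw_rq()->nr_running == 1;
}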

> Thanks,
> 
> Paolo
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] kvm: fix preemption warnings in kvm_vcpu_block

2015-09-17 Thread Dominik Dingel
Commit f78195129963 ("kvm: add halt_poll_ns module parameter") calls
single_task_running with preemption enabled. When CONFIG_DEBUG_PREEMPT is
enabled, that results in a debug_smp_processor_id() call and a warning.
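
The warning path, roughly (assuming CONFIG_DEBUG_PREEMPT; sketched from the
code involved rather than quoted verbatim):

	kvm_vcpu_block()
	  -> single_task_running()
	       -> cpu_rq(smp_processor_id())
	            -> debug_smp_processor_id()	/* complains: preemption enabled */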

Cc:   # 4.2.x
Signed-off-by: Dominik Dingel 
---
 virt/kvm/kvm_main.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 54534de..ce67dd6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1971,6 +1971,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 
start = cur = ktime_get();
if (vcpu->halt_poll_ns) {
+   bool solo;
ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
do {
@@ -1982,8 +1983,13 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
++vcpu->stat.halt_successful_poll;
goto out;
}
+
+   preempt_disable();
+   solo = single_task_running();
+   preempt_enable();
+
cur = ktime_get();
-   } while (single_task_running() && ktime_before(cur, stop));
+   } while (solo && ktime_before(cur, stop));
}
 
for (;;) {
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] Revert "s390/mm: change HPAGE_SHIFT type to int"

2015-07-03 Thread Dominik Dingel
This reverts commit cf54e2fce51c7ad2479fe8cf213a2ed618a8189b.
---
 arch/s390/include/asm/page.h | 2 +-
 arch/s390/mm/pgtable.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index dd34523..0844b78 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -20,7 +20,7 @@
 #include <asm/setup.h>
 #ifndef __ASSEMBLY__
 
-extern int HPAGE_SHIFT;
+extern unsigned int HPAGE_SHIFT;
 #define HPAGE_SIZE (1UL << HPAGE_SHIFT)
 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 33082d0..16154720 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -31,7 +31,7 @@
 #define ALLOC_ORDER	2
 #define FRAG_MASK  0x03
 
-int HPAGE_SHIFT;
+unsigned int HPAGE_SHIFT;
 
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] Revert "s390/mm: make hugepages_supported a boot time decision"

2015-07-03 Thread Dominik Dingel
This reverts commit bea41197ead3e03308bdd10c11db3ce91ae5c8ab.
---
 arch/s390/include/asm/page.h | 8 
 arch/s390/kernel/setup.c | 2 --
 arch/s390/mm/pgtable.c   | 2 --
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 0844b78..53eacbd 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -17,10 +17,7 @@
 #define PAGE_DEFAULT_ACC   0
 #define PAGE_DEFAULT_KEY   (PAGE_DEFAULT_ACC << 4)
 
-#include <asm/setup.h>
-#ifndef __ASSEMBLY__
-
-extern unsigned int HPAGE_SHIFT;
+#define HPAGE_SHIFT	20
 #define HPAGE_SIZE (1UL << HPAGE_SHIFT)
 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
@@ -30,6 +27,9 @@ extern unsigned int HPAGE_SHIFT;
 #define ARCH_HAS_PREPARE_HUGEPAGE
 #define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH
 
+#include <asm/setup.h>
+#ifndef __ASSEMBLY__
+
 static inline void storage_key_init_range(unsigned long start, unsigned long 
end)
 {
 #if PAGE_DEFAULT_KEY
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index f7f027c..ca070d2 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -885,8 +885,6 @@ void __init setup_arch(char **cmdline_p)
 */
setup_hwcaps();
 
-   HPAGE_SHIFT = MACHINE_HAS_HPAGE ? 20 : 0;
-
/*
 * Create kernel page tables and switch to virtual addressing.
 */
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 16154720..b33f661 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -31,8 +31,6 @@
 #define ALLOC_ORDER	2
 #define FRAG_MASK  0x03
 
-unsigned int HPAGE_SHIFT;
-
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/4] s390/mm: Fixup hugepage sw-emulated code removal

2015-07-03 Thread Dominik Dingel
Heiko noticed that the current check for hugepage support on s390 is a little
bit too harsh, as systems which do not support hugepages will crash.
The reason is that pageblock_order can now get negative when we set
HPAGE_SHIFT to 0.
To avoid all this, and to avoid opening another can of worms with enabling
HUGETLB_PAGE_SIZE_VARIABLE, I think it would be best to simply allow
architectures to define their own hugepages_supported().
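
For reference, a sketch of the arithmetic behind the crash (assuming the
usual s390 PAGE_SHIFT of 12, with pageblock_order falling back to
HUGETLB_PAGE_ORDER when HUGETLB_PAGE_SIZE_VARIABLE is off):

	HUGETLB_PAGE_ORDER = HPAGE_SHIFT - PAGE_SHIFT = 0 - 12 = -12
	pageblock_order    = HUGETLB_PAGE_ORDER            = -12

so every pageblock computation starts from a negative order.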

Thanks
Dominik

Dominik Dingel (4):
  Revert "s390/mm: change HPAGE_SHIFT type to int"
  Revert "s390/mm: make hugepages_supported a boot time decision"
  mm: hugetlb: allow hugepages_supported to be architecture specific
  s390/hugetlb: add hugepages_supported define

 arch/s390/include/asm/hugetlb.h |  1 +
 arch/s390/include/asm/page.h|  8 
 arch/s390/kernel/setup.c|  2 --
 arch/s390/mm/pgtable.c  |  2 --
 include/linux/hugetlb.h | 17 -
 5 files changed, 13 insertions(+), 17 deletions(-)

-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] s390/hugetlb: add hugepages_supported define

2015-07-03 Thread Dominik Dingel
On s390 we can only enable hugepages if the underlying hardware/hypervisor
also supports this. Common code would now assume this to be signaled
by setting HPAGE_SHIFT to 0. But on s390, where we only support one
hugepage size, there is a link between HPAGE_SHIFT and pageblock_order.

So instead of setting HPAGE_SHIFT to 0, we will implement the check for the
hardware capability.

Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/hugetlb.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index 0130d03..d9be7c0 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -14,6 +14,7 @@
 
 #define is_hugepage_only_range(mm, addr, len)  0
 #define hugetlb_free_pgd_range free_pgd_range
+#define hugepages_supported()  (MACHINE_HAS_HPAGE)
 
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte);
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] mm: hugetlb: allow hugepages_supported to be architecture specific

2015-07-03 Thread Dominik Dingel
s390 has a constant hugepage size; by setting HPAGE_SHIFT we also
change e.g. pageblock_order, which should be independent of
hugepage support.

With this patch every architecture is free to define how to check
for hugepage support.

Signed-off-by: Dominik Dingel 
---
 include/linux/hugetlb.h | 17 -
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 2050261..d891f94 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -460,15 +460,14 @@ static inline spinlock_t *huge_pte_lockptr(struct hstate 
*h,
return &mm->page_table_lock;
 }
 
-static inline bool hugepages_supported(void)
-{
-   /*
-* Some platform decide whether they support huge pages at boot
-* time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when
-* there is no such support
-*/
-   return HPAGE_SHIFT != 0;
-}
+#ifndef hugepages_supported
+/*
+ * Some platform decide whether they support huge pages at boot
+ * time. Some of them, such as powerpc, set HPAGE_SHIFT to 0
+ * when there is no such support
+ */
+#define hugepages_supported() (HPAGE_SHIFT != 0)
+#endif
 
 #else  /* CONFIG_HUGETLB_PAGE */
 struct hstate {};
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: Add error check after call to rmap_walk in the function page_referenced

2015-06-26 Thread Dominik Dingel
On Fri, 26 Jun 2015 10:47:39 -0400
nick  wrote:

> 
> 
> On 2015-06-26 09:56 AM, Dominik Dingel wrote:
> > On Thu, 25 Jun 2015 21:36:37 -0400
> > Nicholas Krause  wrote:
> > 
> >> This adds a return check after the call to the function rmap_walk
> >> in the function page_referenced as this function call can fail
> >> and thus should signal callers of page_referenced if this happens
> >> by returning the SWAP macro return value as returned by rmap_walk
> >> here. In addition also check if have locked the page pointer as
> >> passed to this particular and unlock it with unlock_page if this
> >> page is locked before returning our SWAP marco return code from
> >> rmap_walk.
> >>
> >> Signed-off-by: Nicholas Krause 
> >> ---
> >>  mm/rmap.c | 10 +-
> >>  1 file changed, 9 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/mm/rmap.c b/mm/rmap.c
> >> index 171b687..e4df848 100644
> >> --- a/mm/rmap.c
> >> +++ b/mm/rmap.c
> >> @@ -814,7 +814,9 @@ static bool invalid_page_referenced_vma(struct 
> >> vm_area_struct *vma, void *arg)
> >>   * @vm_flags: collect encountered vma->vm_flags who actually referenced 
> >> the page
> >>   *
> >>   * Quick test_and_clear_referenced for all mappings to a page,
> >> - * returns the number of ptes which referenced the page.
> >> + * returns the number of ptes which referenced the page.On
> >> + * error returns either zero or the error code returned from
> >> + * the failed call to rmap_walk.
> >>   */
> >>  int page_referenced(struct page *page,
> >>int is_locked,
> >> @@ -855,7 +857,13 @@ int page_referenced(struct page *page,
> >>rwc.invalid_vma = invalid_page_referenced_vma;
> >>}
> >>
> >> +
> > 
> > unnecessary empty line
> > 
> >>ret = rmap_walk(page, &rwc);
> >> +  if (!ret) {
> >> +  if (we_locked)
> >> +  unlock_page(page);
> >> +  return ret;
> >> +  }
> > 
> > I don't see why the function should propagate the rmap_walk return value.
> > rmap_walk will not set pra.referenced, so that both callers just skip.
> > 
> > What is the purpose of the given patch? Do you have any real case 
> > introducing such code,
> > which is imho incomplete as all callers need to take care of the changed 
> > return value!
> > 
> There is only one caller that needs to be moved over if this case is put in. 
> Further more 
> do we care if executing rmap_walk fails as if it does this means we were 
> unable to execute
> the function page_referenced one on the rmap_walk_control structure rwc and 
> this can be
> a issue in my option, if not then we can just remove the ret variable and 
> execute rmap_walk
> without checking it's return value.
> Cheers Nick 

Let me rephrase: what will happen after you return ret?
What will: 
- shrink_active_list 
- page_check_references 
now do?
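
They treat the return value as a count of referencing ptes, roughly like
this (a sketch of the caller side, not the literal mm/vmscan.c code;
keep_or_reactivate_page() is a made-up stand-in):

	int referenced_ptes;

	referenced_ptes = page_referenced(page, 1, memcg, &vm_flags);
	if (referenced_ptes)
		keep_or_reactivate_page();

so a SWAP_* status leaking out of rmap_walk would simply be misread as a
small reference count.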

For your second thought, it would be good to check how and why ret was
introduced:

git log -L '/int page_referenced(/',/^}/:mm/rmap.c

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH] mm: Add error check after call to rmap_walk in the function page_referenced

2015-06-26 Thread Dominik Dingel
On Thu, 25 Jun 2015 21:36:37 -0400
Nicholas Krause  wrote:

> This adds a return check after the call to the function rmap_walk
> in the function page_referenced as this function call can fail
> and thus should signal callers of page_referenced if this happens
> by returning the SWAP macro return value as returned by rmap_walk
> here. In addition also check if have locked the page pointer as
> passed to this particular and unlock it with unlock_page if this
> page is locked before returning our SWAP marco return code from
> rmap_walk.
> 
> Signed-off-by: Nicholas Krause 
> ---
>  mm/rmap.c | 10 +-
>  1 file changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 171b687..e4df848 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -814,7 +814,9 @@ static bool invalid_page_referenced_vma(struct 
> vm_area_struct *vma, void *arg)
>   * @vm_flags: collect encountered vma->vm_flags who actually referenced the 
> page
>   *
>   * Quick test_and_clear_referenced for all mappings to a page,
> - * returns the number of ptes which referenced the page.
> + * returns the number of ptes which referenced the page.On
> + * error returns either zero or the error code returned from
> + * the failed call to rmap_walk.
>   */
>  int page_referenced(struct page *page,
>   int is_locked,
> @@ -855,7 +857,13 @@ int page_referenced(struct page *page,
>   rwc.invalid_vma = invalid_page_referenced_vma;
>   }
> 
> +

unnecessary empty line

>   ret = rmap_walk(page, &rwc);
> + if (!ret) {
> + if (we_locked)
> + unlock_page(page);
> + return ret;
> + }

I don't see why the function should propagate the rmap_walk return value.
rmap_walk will not set pra.referenced, so that both callers just skip.

What is the purpose of the given patch? Do you have any real case introducing 
such code,
which is imho incomplete as all callers need to take care of the changed return 
value!

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] s390/mm: change HPAGE_SHIFT type to int

2015-06-25 Thread Dominik Dingel
By making HPAGE_SHIFT an unsigned integer we also accidentally changed
pageblock_order.
In order to avoid compiler warnings we make HPAGE_SHIFT an int again.
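
A sketch of the arithmetic (with PAGE_SHIFT == 12 and HPAGE_SHIFT == 0 on
machines without EDAT1):

	HUGETLB_PAGE_ORDER == HPAGE_SHIFT - PAGE_SHIFT
	  unsigned int HPAGE_SHIFT:  0u - 12  wraps to 4294967284
	  int HPAGE_SHIFT:           0  - 12  ==  -12

hence the signed/unsigned trouble around pageblock_order that the compiler
warns about.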

Suggested-by: Andrew Morton 
Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/page.h | 2 +-
 arch/s390/mm/pgtable.c   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 0844b78..dd34523 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -20,7 +20,7 @@
 #include <asm/setup.h>
 #ifndef __ASSEMBLY__
 
-extern unsigned int HPAGE_SHIFT;
+extern int HPAGE_SHIFT;
 #define HPAGE_SIZE (1UL << HPAGE_SHIFT)
 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index f76791e..1bae5dd 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -36,7 +36,7 @@
 #endif
 
 
-unsigned int HPAGE_SHIFT;
+int HPAGE_SHIFT;
 
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
-- 
2.3.8

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 0/5] Remove s390 sw-emulated hugepages and cleanup

2015-06-02 Thread Dominik Dingel
On Mon, 01 Jun 2015 09:35:57 +0200
Christian Borntraeger  wrote:

> Am 28.05.2015 um 13:52 schrieb Dominik Dingel:
> > Hi everyone,
> > 
> > there is a potential bug with KVM and hugetlbfs if the hardware does not
> > support hugepages (EDAT1).
> > We fix this by making EDAT1 a hard requirement for hugepages and 
> > therefore removing and simplifying code.
> 
> The cleanup itself is nice and probably the right thing to do. 
> Emulating large pages makes the code more complex and asks for
> trouble (as outlined above)
> 
> The only downside that I see is that z/VM as of today does not
> announce EDAT1 for its guests so the "emulated" large pages for
> hugetlbfs would be useful in that case. The current code allocates
> the page table only once and shares it for all mappers - which is
> useful for some big databases that spawn hundreds of processes with
> shared mappings of several hundred GBs. In these cases we do save
> a decent amount of page table memory. 

To limit the damage done, we could always allocate page tables with pgstes for
that case.
That would allow one guest to manipulate another guest's storage keys,
but at least it would prevent random memory overwrites in the host.

Another thing we could do is make software-emulated large pages a kernel
config option and allow it only if KVM is not selected, or vice versa
(a rough sketch follows).
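
Purely as an illustration of that second idea (CONFIG_S390_SW_HUGEPAGE_EMULATION
is a made-up Kconfig symbol that would depend on !KVM):

	int arch_prepare_hugepage(struct page *page)
	{
		if (MACHINE_HAS_HPAGE)
			return 0;
		if (!IS_ENABLED(CONFIG_S390_SW_HUGEPAGE_EMULATION))
			return -EOPNOTSUPP;	/* no EDAT1 and no emulation allowed */
		/* ... existing software-emulation setup as before ... */
		return 0;
	}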

@Martin what do you think?

Thanks,
Dominik

> Not sure if that case is actually important, though.
> 
> Christian
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] mm/hugetlb: remove unused arch hook prepare/release_hugepage

2015-05-28 Thread Dominik Dingel
With s390 dropping support for emulated hugepages, the last user of
arch_prepare_hugepage and arch_release_hugepage is gone.

Acked-by: Martin Schwidefsky 
Signed-off-by: Dominik Dingel 
---
 mm/hugetlb.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 290984b..a97958e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -917,7 +917,6 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
-   arch_release_hugepage(page);
__free_pages(page, huge_page_order(h));
}
 }
@@ -1102,10 +1101,6 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
if (page) {
-   if (arch_prepare_hugepage(page)) {
-   __free_pages(page, huge_page_order(h));
-   return NULL;
-   }
prep_new_huge_page(h, page, nid);
}
 
@@ -1257,11 +1252,6 @@ static struct page *alloc_buddy_huge_page(struct hstate 
*h, int nid)
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
-   if (page && arch_prepare_hugepage(page)) {
-   __free_pages(page, huge_page_order(h));
-   page = NULL;
-   }
-
spin_lock(&hugetlb_lock);
if (page) {
INIT_LIST_HEAD(&page->lru);
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/5] Remove s390 sw-emulated hugepages and cleanup

2015-05-28 Thread Dominik Dingel
Hi everyone,

there is a potential bug with KVM and hugetlbfs if the hardware does not
support hugepages (EDAT1).
We fix this by making EDAT1 a hard requirement for hugepages and 
therefore removing and simplifying code.

As s390, with the sw-emulated hugepages, was the only user of
arch_prepare/release_hugepage, I also removed these calls from common and
other architecture code.

Thanks,
Dominik

Dominik Dingel (5):
  s390/mm: make hugepages_supported a boot time decision
  mm/hugetlb: remove unused arch hook prepare/release_hugepage
  mm/hugetlb: remove arch_prepare/release_hugepage from arch headers
  s390/hugetlb: remove dead code for sw emulated huge pages
  s390/mm: forward check for huge pmds to pmd_large()

 arch/arm/include/asm/hugetlb.h |  9 --
 arch/arm64/include/asm/hugetlb.h   |  9 --
 arch/ia64/include/asm/hugetlb.h|  9 --
 arch/metag/include/asm/hugetlb.h   |  9 --
 arch/mips/include/asm/hugetlb.h|  9 --
 arch/powerpc/include/asm/hugetlb.h |  9 --
 arch/s390/include/asm/hugetlb.h|  3 --
 arch/s390/include/asm/page.h   |  8 ++---
 arch/s390/kernel/setup.c   |  2 ++
 arch/s390/mm/hugetlbpage.c | 65 +++---
 arch/s390/mm/pgtable.c |  2 ++
 arch/sh/include/asm/hugetlb.h  |  9 --
 arch/sparc/include/asm/hugetlb.h   |  9 --
 arch/tile/include/asm/hugetlb.h|  9 --
 arch/x86/include/asm/hugetlb.h |  9 --
 mm/hugetlb.c   | 10 --
 16 files changed, 12 insertions(+), 168 deletions(-)

-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] s390/hugetlb: remove dead code for sw emulated huge pages

2015-05-28 Thread Dominik Dingel
We now support hugepages only on hardware with EDAT1 support.
So we remove the prepare/release_hugepage hooks and
simplify set_huge_pte_at and huge_ptep_get.

Acked-by: Martin Schwidefsky 
Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/hugetlb.h |  3 ---
 arch/s390/mm/hugetlbpage.c  | 60 +++--
 2 files changed, 3 insertions(+), 60 deletions(-)

diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index dfb542a..0130d03 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -37,9 +37,6 @@ static inline int prepare_hugepage_range(struct file *file,
 
 #define arch_clear_hugepage_flags(page)do { } while (0)
 
-int arch_prepare_hugepage(struct page *page);
-void arch_release_hugepage(struct page *page);
-
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep)
 {
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index fa6e1bc..999616b 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -80,31 +80,16 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte)
 {
-   pmd_t pmd;
+   pmd_t pmd = __pte_to_pmd(pte);
 
-   pmd = __pte_to_pmd(pte);
-   if (!MACHINE_HAS_HPAGE) {
-   /* Emulated huge ptes loose the dirty and young bit */
-   pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
-   pmd_val(pmd) |= pte_page(pte)[1].index;
-   } else
-   pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
+   pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
*(pmd_t *) ptep = pmd;
 }
 
 pte_t huge_ptep_get(pte_t *ptep)
 {
-   unsigned long origin;
-   pmd_t pmd;
+   pmd_t pmd = *(pmd_t *) ptep;
 
-   pmd = *(pmd_t *) ptep;
-   if (!MACHINE_HAS_HPAGE && pmd_present(pmd)) {
-   origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN;
-   pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
-   pmd_val(pmd) |= *(unsigned long *) origin;
-   /* Emulated huge ptes are young and dirty by definition */
-   pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG | _SEGMENT_ENTRY_DIRTY;
-   }
return __pmd_to_pte(pmd);
 }
 
@@ -119,45 +104,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
 }
 
-int arch_prepare_hugepage(struct page *page)
-{
-   unsigned long addr = page_to_phys(page);
-   pte_t pte;
-   pte_t *ptep;
-   int i;
-
-   if (MACHINE_HAS_HPAGE)
-   return 0;
-
-   ptep = (pte_t *) pte_alloc_one(&init_mm, addr);
-   if (!ptep)
-   return -ENOMEM;
-
-   pte_val(pte) = addr;
-   for (i = 0; i < PTRS_PER_PTE; i++) {
-   set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte);
-   pte_val(pte) += PAGE_SIZE;
-   }
-   page[1].index = (unsigned long) ptep;
-   return 0;
-}
-
-void arch_release_hugepage(struct page *page)
-{
-   pte_t *ptep;
-
-   if (MACHINE_HAS_HPAGE)
-   return;
-
-   ptep = (pte_t *) page[1].index;
-   if (!ptep)
-   return;
-   clear_table((unsigned long *) ptep, _PAGE_INVALID,
-   PTRS_PER_PTE * sizeof(pte_t));
-   page_table_free(&init_mm, (unsigned long *) ptep);
-   page[1].index = 0;
-}
-
 pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
 {
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/5] mm/hugetlb: remove arch_prepare/release_hugepage from arch headers

2015-05-28 Thread Dominik Dingel
Nobody used these hooks so they were removed from common code,
and can now be removed from the architectures.

Acked-by: Martin Schwidefsky 
Signed-off-by: Dominik Dingel 
---
 arch/arm/include/asm/hugetlb.h | 9 -
 arch/arm64/include/asm/hugetlb.h   | 9 -
 arch/ia64/include/asm/hugetlb.h| 9 -
 arch/metag/include/asm/hugetlb.h   | 9 -
 arch/mips/include/asm/hugetlb.h| 9 -
 arch/powerpc/include/asm/hugetlb.h | 9 -
 arch/sh/include/asm/hugetlb.h  | 9 -
 arch/sparc/include/asm/hugetlb.h   | 9 -
 arch/tile/include/asm/hugetlb.h| 9 -
 arch/x86/include/asm/hugetlb.h | 9 -
 10 files changed, 90 deletions(-)

diff --git a/arch/arm/include/asm/hugetlb.h b/arch/arm/include/asm/hugetlb.h
index 31bb7dc..7d26f6c 100644
--- a/arch/arm/include/asm/hugetlb.h
+++ b/arch/arm/include/asm/hugetlb.h
@@ -63,15 +63,6 @@ static inline pte_t huge_pte_wrprotect(pte_t pte)
return pte_wrprotect(pte);
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h
index 734c17e..2fd9b14 100644
--- a/arch/arm64/include/asm/hugetlb.h
+++ b/arch/arm64/include/asm/hugetlb.h
@@ -96,15 +96,6 @@ static inline pte_t huge_pte_wrprotect(pte_t pte)
return pte_wrprotect(pte);
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/ia64/include/asm/hugetlb.h b/arch/ia64/include/asm/hugetlb.h
index ff1377b..ef65f02 100644
--- a/arch/ia64/include/asm/hugetlb.h
+++ b/arch/ia64/include/asm/hugetlb.h
@@ -65,15 +65,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/arch/metag/include/asm/hugetlb.h b/arch/metag/include/asm/hugetlb.h
index f730b39..905ed42 100644
--- a/arch/metag/include/asm/hugetlb.h
+++ b/arch/metag/include/asm/hugetlb.h
@@ -67,15 +67,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h
index 4a5bb54..982bc06 100644
--- a/arch/mips/include/asm/hugetlb.h
+++ b/arch/mips/include/asm/hugetlb.h
@@ -110,15 +110,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/arch/powerpc/include/asm/hugetlb.h 
b/arch/powerpc/include/asm/hugetlb.h
index 4bbd3c8..7eac89b 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -168,15 +168,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h
index b788a9b..ef489a5 100644
--- a/arch/sh/include/asm/hugetlb.h
+++ b/arch/sh/include/asm/hugetlb.h
@@ -79,15 +79,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
clear_bit(PG_dcache_clean, &page->flags);
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 3130d76..139e711 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -78,15 +78,6 @@ static inline pte_t huge_ptep_get(pte_t *ptep)
return *ptep;
 }
 
-static inline int arch_prepare_hugepage(struct page *page)
-{
-   return 0;
-}
-
-static inline void arch_release_hugepage(struct page *page)
-{
-}
-
 static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
diff --git a/arch/tile/include/asm/hugetlb.h b/arch/tile/include/asm/

[PATCH 5/5] s390/mm: forward check for huge pmds to pmd_large()

2015-05-28 Thread Dominik Dingel
We already do the check in pmd_large, so we can just forward the call.
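For reference, pmd_large() on s390 already tests the segment-entry large bit, which is why pmd_huge() can simply forward to it. A minimal sketch of the assumed s390 definition of that era (not part of this patch):

static inline int pmd_large(pmd_t pmd)
{
	return (pmd_val(pmd) & _SEGMENT_ENTRY_LARGE) != 0;
}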

Acked-by: Martin Schwidefsky 
Signed-off-by: Dominik Dingel 
---
 arch/s390/mm/hugetlbpage.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index 999616b..a4b2f5e 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -135,10 +135,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long 
addr)
 
 int pmd_huge(pmd_t pmd)
 {
-   if (!MACHINE_HAS_HPAGE)
-   return 0;
-
-   return !!(pmd_val(pmd) & _SEGMENT_ENTRY_LARGE);
+   return pmd_large(pmd);
 }
 
 int pud_huge(pud_t pud)
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/5] s390/mm: make hugepages_supported a boot time decision

2015-05-28 Thread Dominik Dingel
By dropping support for hugepages on machines which do not have
the hardware feature EDAT1, we fix a potential s390 KVM bug.

The bug would happen if a guest is backed by hugetlbfs (not supported 
currently),
but does not get pagetables with PGSTE.
This would lead to random memory overwrites.
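With HPAGE_SHIFT turned into a variable that stays 0 on machines without EDAT1, the generic hugetlb code can make hugepages_supported() a boot time decision as well. A sketch of the assumed generic fallback in include/linux/hugetlb.h that this series relies on:

#ifndef hugepages_supported
/*
 * Some platforms decide at boot time whether they support huge pages;
 * such platforms set HPAGE_SHIFT to 0 when there is no support.
 */
#define hugepages_supported() (HPAGE_SHIFT != 0)
#endif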

Acked-by: Martin Schwidefsky 
Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/page.h | 8 
 arch/s390/kernel/setup.c | 2 ++
 arch/s390/mm/pgtable.c   | 2 ++
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index 53eacbd..0844b78 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -17,7 +17,10 @@
 #define PAGE_DEFAULT_ACC   0
 #define PAGE_DEFAULT_KEY   (PAGE_DEFAULT_ACC << 4)
 
-#define HPAGE_SHIFT	20
+#include <asm/setup.h>
+#ifndef __ASSEMBLY__
+
+extern unsigned int HPAGE_SHIFT;
 #define HPAGE_SIZE (1UL << HPAGE_SHIFT)
 #define HPAGE_MASK (~(HPAGE_SIZE - 1))
 #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
@@ -27,9 +30,6 @@
 #define ARCH_HAS_PREPARE_HUGEPAGE
 #define ARCH_HAS_HUGEPAGE_CLEAR_FLUSH
 
-#include <asm/setup.h>
-#ifndef __ASSEMBLY__
-
 static inline void storage_key_init_range(unsigned long start, unsigned long 
end)
 {
 #if PAGE_DEFAULT_KEY
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index a5ea8bc..9ac282b 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -915,6 +915,8 @@ void __init setup_arch(char **cmdline_p)
 */
setup_hwcaps();
 
+   HPAGE_SHIFT = MACHINE_HAS_HPAGE ? 20 : 0;
+
/*
 * Create kernel page tables and switch to virtual addressing.
 */
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b2c1542..f76791e 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -36,6 +36,8 @@
 #endif
 
 
+unsigned int HPAGE_SHIFT;
+
 unsigned long *crst_table_alloc(struct mm_struct *mm)
 {
struct page *page = alloc_pages(GFP_KERNEL, ALLOC_ORDER);
-- 
2.3.7

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/5] s390/hugetlb: remove dead code for sw emulated huge pages

2015-05-28 Thread Dominik Dingel
We now support only hugepages on hardware with EDAT1 support.
So we remove the prepare/release_hugepage hooks and
simplify set_huge_pte_at and huge_ptep_get.

Acked-by: Martin Schwidefsky schwidef...@de.ibm.com
Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 arch/s390/include/asm/hugetlb.h |  3 ---
 arch/s390/mm/hugetlbpage.c  | 60 +++--
 2 files changed, 3 insertions(+), 60 deletions(-)

diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h
index dfb542a..0130d03 100644
--- a/arch/s390/include/asm/hugetlb.h
+++ b/arch/s390/include/asm/hugetlb.h
@@ -37,9 +37,6 @@ static inline int prepare_hugepage_range(struct file *file,
 
 #define arch_clear_hugepage_flags(page)do { } while (0)
 
-int arch_prepare_hugepage(struct page *page);
-void arch_release_hugepage(struct page *page);
-
 static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
  pte_t *ptep)
 {
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index fa6e1bc..999616b 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -80,31 +80,16 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
 pte_t *ptep, pte_t pte)
 {
-   pmd_t pmd;
+   pmd_t pmd = __pte_to_pmd(pte);
 
-   pmd = __pte_to_pmd(pte);
-   if (!MACHINE_HAS_HPAGE) {
-   /* Emulated huge ptes loose the dirty and young bit */
-   pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
-   pmd_val(pmd) |= pte_page(pte)[1].index;
-   } else
-   pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
+   pmd_val(pmd) |= _SEGMENT_ENTRY_LARGE;
*(pmd_t *) ptep = pmd;
 }
 
 pte_t huge_ptep_get(pte_t *ptep)
 {
-   unsigned long origin;
-   pmd_t pmd;
+   pmd_t pmd = *(pmd_t *) ptep;
 
-   pmd = *(pmd_t *) ptep;
-   if (!MACHINE_HAS_HPAGE && pmd_present(pmd)) {
-   origin = pmd_val(pmd) & _SEGMENT_ENTRY_ORIGIN;
-   pmd_val(pmd) &= ~_SEGMENT_ENTRY_ORIGIN;
-   pmd_val(pmd) |= *(unsigned long *) origin;
-   /* Emulated huge ptes are young and dirty by definition */
-   pmd_val(pmd) |= _SEGMENT_ENTRY_YOUNG | _SEGMENT_ENTRY_DIRTY;
-   }
return __pmd_to_pte(pmd);
 }
 
@@ -119,45 +104,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
return pte;
 }
 
-int arch_prepare_hugepage(struct page *page)
-{
-   unsigned long addr = page_to_phys(page);
-   pte_t pte;
-   pte_t *ptep;
-   int i;
-
-   if (MACHINE_HAS_HPAGE)
-   return 0;
-
-   ptep = (pte_t *) pte_alloc_one(&init_mm, addr);
-   if (!ptep)
-   return -ENOMEM;
-
-   pte_val(pte) = addr;
-   for (i = 0; i < PTRS_PER_PTE; i++) {
-   set_pte_at(&init_mm, addr + i * PAGE_SIZE, ptep + i, pte);
-   pte_val(pte) += PAGE_SIZE;
-   }
-   page[1].index = (unsigned long) ptep;
-   return 0;
-}
-
-void arch_release_hugepage(struct page *page)
-{
-   pte_t *ptep;
-
-   if (MACHINE_HAS_HPAGE)
-   return;
-
-   ptep = (pte_t *) page[1].index;
-   if (!ptep)
-   return;
-   clear_table((unsigned long *) ptep, _PAGE_INVALID,
-   PTRS_PER_PTE * sizeof(pte_t));
-   page_table_free(&init_mm, (unsigned long *) ptep);
-   page[1].index = 0;
-}
-
 pte_t *huge_pte_alloc(struct mm_struct *mm,
unsigned long addr, unsigned long sz)
 {
-- 
2.3.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/5] Remove s390 sw-emulated hugepages and cleanup

2015-05-28 Thread Dominik Dingel
Hi everyone,

there is a potential bug with KVM and hugetlbfs if the hardware does not
support hugepages (EDAT1).
We fix this by making EDAT1 a hard requirement for hugepages and 
therefore removing and simplifying code.

As s390, with the sw-emulated hugepages, was the only user of 
arch_prepare/release_hugepage
I also removed these calls from common and other architecture code.

Thanks,
Dominik

Dominik Dingel (5):
  s390/mm: make hugepages_supported a boot time decision
  mm/hugetlb: remove unused arch hook prepare/release_hugepage
  mm/hugetlb: remove arch_prepare/release_hugepage from arch headers
  s390/hugetlb: remove dead code for sw emulated huge pages
  s390/mm: forward check for huge pmds to pmd_large()

 arch/arm/include/asm/hugetlb.h |  9 --
 arch/arm64/include/asm/hugetlb.h   |  9 --
 arch/ia64/include/asm/hugetlb.h|  9 --
 arch/metag/include/asm/hugetlb.h   |  9 --
 arch/mips/include/asm/hugetlb.h|  9 --
 arch/powerpc/include/asm/hugetlb.h |  9 --
 arch/s390/include/asm/hugetlb.h|  3 --
 arch/s390/include/asm/page.h   |  8 ++---
 arch/s390/kernel/setup.c   |  2 ++
 arch/s390/mm/hugetlbpage.c | 65 +++---
 arch/s390/mm/pgtable.c |  2 ++
 arch/sh/include/asm/hugetlb.h  |  9 --
 arch/sparc/include/asm/hugetlb.h   |  9 --
 arch/tile/include/asm/hugetlb.h|  9 --
 arch/x86/include/asm/hugetlb.h |  9 --
 mm/hugetlb.c   | 10 --
 16 files changed, 12 insertions(+), 168 deletions(-)

-- 
2.3.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/5] mm/hugetlb: remove unused arch hook prepare/release_hugepage

2015-05-28 Thread Dominik Dingel
With s390 dropping support for emulated hugepages, the last user of
arch_prepare_hugepage and arch_release_hugepage is gone.

Acked-by: Martin Schwidefsky schwidef...@de.ibm.com
Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 mm/hugetlb.c | 10 --
 1 file changed, 10 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 290984b..a97958e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -917,7 +917,6 @@ static void update_and_free_page(struct hstate *h, struct 
page *page)
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
} else {
-   arch_release_hugepage(page);
__free_pages(page, huge_page_order(h));
}
 }
@@ -1102,10 +1101,6 @@ static struct page *alloc_fresh_huge_page_node(struct 
hstate *h, int nid)
__GFP_REPEAT|__GFP_NOWARN,
huge_page_order(h));
if (page) {
-   if (arch_prepare_hugepage(page)) {
-   __free_pages(page, huge_page_order(h));
-   return NULL;
-   }
prep_new_huge_page(h, page, nid);
}
 
@@ -1257,11 +1252,6 @@ static struct page *alloc_buddy_huge_page(struct hstate 
*h, int nid)
htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE|
__GFP_REPEAT|__GFP_NOWARN, huge_page_order(h));
 
-   if (page && arch_prepare_hugepage(page)) {
-   __free_pages(page, huge_page_order(h));
-   page = NULL;
-   }
-
spin_lock(&hugetlb_lock);
if (page) {
INIT_LIST_HEAD(&page->lru);
-- 
2.3.7

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH] KVM: trivial fix comment regarding __kvm_set_memory_region

2014-10-27 Thread Dominik Dingel
commit 72dc67a69690 ("KVM: remove the usage of the mmap_sem for the protection 
of the memory slots.")
changed the lock which will be taken. This should be reflected in the function
commentary.
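For context, the lock named in the corrected comment is the one taken by the non-underscore wrapper around this function; a condensed sketch of that caller (unchanged by this patch, shown only to illustrate the locking rule):

int kvm_set_memory_region(struct kvm *kvm,
			  struct kvm_userspace_memory_region *mem)
{
	int r;

	mutex_lock(&kvm->slots_lock);
	r = __kvm_set_memory_region(kvm, mem);
	mutex_unlock(&kvm->slots_lock);
	return r;
}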

Signed-off-by: Dominik Dingel 
---
 virt/kvm/kvm_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index d82ec25..8b13607 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -738,7 +738,7 @@ static struct kvm_memslots *install_new_memslots(struct kvm 
*kvm,
  *
  * Discontiguous memory is allowed, mostly for framebuffers.
  *
- * Must be called holding mmap_sem for write.
+ * Must be called holding kvm->slots_lock for write.
  */
 int __kvm_set_memory_region(struct kvm *kvm,
struct kvm_userspace_memory_region *mem)
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/4] mm: introduce mm_forbids_zeropage function

2014-10-22 Thread Dominik Dingel
On Wed, 22 Oct 2014 12:22:23 -0700
Andrew Morton  wrote:

> On Wed, 22 Oct 2014 13:09:28 +0200 Dominik Dingel  
> wrote:
> 
> > Add a new function stub to allow architectures to disable for
> > an mm_structthe backing of non-present, anonymous pages with
> > read-only empty zero pages.
> > 
> > ...
> >
> > --- a/include/linux/mm.h
> > +++ b/include/linux/mm.h
> > @@ -56,6 +56,10 @@ extern int sysctl_legacy_va_layout;
> >  #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
> >  #endif
> >  
> > +#ifndef mm_forbids_zeropage
> > +#define mm_forbids_zeropage(X)  (0)
> > +#endif
> 
> Can we document this please?  What it does, why it does it.  We should
> also specify precisely which arch header file is responsible for
> defining mm_forbids_zeropage.
> 

I will add a comment like:

/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This function should be implemented within <asm/pgtable.h>.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */

Okay?


> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majord...@kvack.org.  For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: em...@kvack.org
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 0/4] mm: new function to forbid zeropage mappings for a process

2014-10-22 Thread Dominik Dingel
s390 has the special notion of storage keys which are some sort of page flags
associated with physical pages and live outside of direct addressable memory.
These storage keys can be queried and changed with a special set of 
instructions.
The mentioned instructions behave quite nicely under virtualization, if there 
is: 
- an invalid pte, then the instructions will work on memory in the host page 
table
- a valid pte, then the instructions will work with the real storage key

Thanks to Martin with his software reference and dirty bit tracking,
the kernel does not issue any storage key instructions as now a 
software based approach will be taken, on the other hand distributions 
in the wild are currently using them.

However, for virtualized guests we still have a problem with guest pages 
mapped to zero pages and the kernel same page merging.  
With each one multiple guest pages will point to the same physical page
and share the same storage key.

Let's fix this by introducing a new function which s390 will define to
forbid new zero page mappings.  If the guest issues a storage key related 
instruction we flag the mm_struct, drop existing zero page mappings
and unmerge the guest memory.

v2 -> v3:
 - Clearing up patch description Patch 3/4
 - removing unnecessary flag in mmu_context (Paolo)

v1 -> v2: 
 - Following Dave and Paolo suggestion removing the vma flag

Dominik Dingel (4):
  s390/mm: recfactor global pgste updates
  mm: introduce mm_forbids_zeropage function
  s390/mm: prevent and break zero page mappings in case of storage keys
  s390/mm: disable KSM for storage key enabled pages

 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   8 +-
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/kvm/priv.c|  17 ++--
 arch/s390/mm/pgtable.c  | 180 ++--
 include/linux/mm.h  |   4 +
 mm/huge_memory.c|   2 +-
 mm/memory.c |   2 +-
 8 files changed, 106 insertions(+), 111 deletions(-)

-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-22 Thread Dominik Dingel
As soon as storage keys are enabled we need to stop working on zero page
mappings to prevent inconsistencies between storage keys and pgste.

Otherwise following data corruption could happen:
1) guest enables storage key
2) guest sets storage key for not mapped page X
   -> change goes to PGSTE
3) guest reads from page X
   -> as X was not dirty before, the page will be zero page backed,
  storage key from PGSTE for X will go to storage key for zero page
4) guest sets storage key for not mapped page Y (same logic as above)
5) guest reads from page Y
   -> as Y was not dirty before, the page will be zero page backed,
  storage key from PGSTE for Y will got to storage key for zero page
  overwriting storage key for X

While holding the mmap sem, we are safe against changes on entries we
already fixed, as every fault would need to take the mmap_sem (read).

Other vCPUs executing storage key instructions will get a one time interception
and be serialized also with mmap_sem.

Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/pgtable.h |  5 +
 arch/s390/mm/pgtable.c  | 13 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1e991f6a..0da98d6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -481,6 +481,11 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
 }
 
+/*
+ * In the case that a guest uses storage keys
+ * faults should no longer be backed by zero pages
+ */
+#define mm_forbids_zeropage mm_use_skey
 static inline int mm_use_skey(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab55ba8..58d7eb2 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1309,6 +1309,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
pgste_t pgste;
 
pgste = pgste_get_lock(pte);
+   /*
+* Remove all zero page mappings,
+* after establishing a policy to forbid zero page mappings
+* following faults for that page will get fresh anonymous pages
+*/
+   if (is_zero_pfn(pte_pfn(*pte))) {
+   ptep_flush_direct(walk->mm, addr, pte);
+   pte_val(*pte) = _PAGE_INVALID;
+   }
/* Clear storage key */
pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
  PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -1327,9 +1336,11 @@ void s390_enable_skey(void)
down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
+
+   mm->context.use_skey = 1;
+
walk.mm = mm;
walk_page_range(0, TASK_SIZE, &walk);
-   mm->context.use_skey = 1;
 
 out_up:
up_write(&mm->mmap_sem);
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] mm: introduce mm_forbids_zeropage function

2014-10-22 Thread Dominik Dingel
Add a new function stub to allow architectures to disable for
an mm_struct the backing of non-present, anonymous pages with
read-only empty zero pages.
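The stub is a macro with a generic fallback, so an architecture opts in simply by providing its own definition in its page table headers; a sketch of the pattern (the s390 side is what patch 3/4 of this series adds):

/* include/linux/mm.h: generic fallback, zero page backing stays allowed */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)	(0)
#endif

/* arch/s390/include/asm/pgtable.h: forbid it once storage keys are in use */
#define mm_forbids_zeropage	mm_use_skey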

Signed-off-by: Dominik Dingel 
---
 include/linux/mm.h | 4 
 mm/huge_memory.c   | 2 +-
 mm/memory.c| 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd33ae2..0a2022e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -56,6 +56,10 @@ extern int sysctl_legacy_va_layout;
 #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
 #endif
 
+#ifndef mm_forbids_zeropage
+#define mm_forbids_zeropage(X)  (0)
+#endif
+
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de98415..357a381 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -805,7 +805,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
-   if (!(flags & FAULT_FLAG_WRITE) &&
+   if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
diff --git a/mm/memory.c b/mm/memory.c
index 64f82aa..f275a9d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2640,7 +2640,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_SIGBUS;
 
/* Use the zero-page for reads */
-   if (!(flags & FAULT_FLAG_WRITE)) {
+   if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] s390/mm: disable KSM for storage key enabled pages

2014-10-22 Thread Dominik Dingel
When storage keys are enabled unmerge already merged pages and prevent
new pages from being merged.

Signed-off-by: Dominik Dingel 
Acked-by: Christian Borntraeger 
---
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/priv.c| 17 -
 arch/s390/mm/pgtable.c  | 16 +++-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0da98d6..dfb38af 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1754,7 +1754,7 @@ static inline pte_t mk_swap_pte(unsigned long type, 
unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
-extern void s390_enable_skey(void);
+extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f89c1cd..e0967fd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -156,21 +156,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static void __skey_check_enable(struct kvm_vcpu *vcpu)
+static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
+   int rc = 0;
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
-   return;
+   return rc;
 
-   s390_enable_skey();
+   rc = s390_enable_skey();
trace_kvm_s390_skey_related_inst(vcpu);
vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+   return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
 
+   if (rc)
+   return rc;
vcpu->stat.instruction_storage_key++;
 
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -692,7 +696,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
 
if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
+
+   if (rc)
+   return rc;
if (set_guest_storage_key(current->mm, useraddr,
vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 58d7eb2..82aa528 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 #include 
+#include <linux/ksm.h>
+#include <linux/mman.h>
 
 #include 
 #include 
@@ -1328,22 +1330,34 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
return 0;
 }
 
-void s390_enable_skey(void)
+int s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
+   int rc = 0;
 
down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
 
mm->context.use_skey = 1;
+   for (vma = mm->mmap; vma; vma = vma->vm_next) {
+   if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+   MADV_UNMERGEABLE, &vma->vm_flags)) {
+   mm->context.use_skey = 0;
+   rc = -ENOMEM;
+   goto out_up;
+   }
+   }
+   mm->def_flags &= ~VM_MERGEABLE;
 
walk.mm = mm;
walk_page_range(0, TASK_SIZE, &walk);
 
 out_up:
up_write(&mm->mmap_sem);
+   return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
 
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] s390/mm: recfactor global pgste updates

2014-10-22 Thread Dominik Dingel
Replace the s390 specific page table walker for the pgste updates
with a call to the common code walk_page_range function.
There are now two pte modification functions, one for the reset
of the CMMA state and another one for the initialization of the
storage keys.
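Both new callers follow the common walk_page_range() pattern: a pte_entry callback plus an mm_walk descriptor, run under mmap_sem. A condensed sketch of the CMMA reset side, assuming the shape of the full patch (the diff below is truncated before this part):

static int __s390_reset_cmma(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
{
	pgste_t pgste;

	pgste = pgste_get_lock(pte);
	pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
	pgste_set_unlock(pte, pgste);
	return 0;
}

void s390_reset_cmma(struct mm_struct *mm)
{
	struct mm_walk walk = { .pte_entry = __s390_reset_cmma };

	down_write(&mm->mmap_sem);
	walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	up_write(&mm->mmap_sem);
}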

Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   1 +
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/mm/pgtable.c  | 153 ++--
 4 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 9e18a61..120e126 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned 
long);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *);
 
-void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long,
-   bool init_skey);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
  unsigned long key, bool nq);
 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5efb2fe..1e991f6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1750,6 +1750,7 @@ extern int vmem_add_mapping(unsigned long start, unsigned 
long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern void s390_enable_skey(void);
+extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
  * No page table caches to initialise
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 81b0e11..7a33c11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -281,7 +281,7 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct 
kvm_device_attr *attr)
case KVM_S390_VM_MEM_CLR_CMMA:
mutex_lock(&kvm->lock);
idx = srcu_read_lock(&kvm->srcu);
-   page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false);
+   s390_reset_cmma(kvm->arch.gmap->mm);
srcu_read_unlock(&kvm->srcu, idx);
mutex_unlock(&kvm->lock);
ret = 0;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5404a62..ab55ba8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -885,99 +885,6 @@ static inline void page_table_free_pgste(unsigned long 
*table)
__free_page(page);
 }
 
-static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t 
*pmd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   pte_t *start_pte, *pte;
-   spinlock_t *ptl;
-   pgste_t pgste;
-
-   start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-   pte = start_pte;
-   do {
-   pgste = pgste_get_lock(pte);
-   pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
-   if (init_skey) {
-   unsigned long address;
-
-   pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
- PGSTE_GR_BIT | PGSTE_GC_BIT);
-
-   /* skip invalid and not writable pages */
-   if (pte_val(*pte) & _PAGE_INVALID ||
-   !(pte_val(*pte) & _PAGE_WRITE)) {
-   pgste_set_unlock(pte, pgste);
-   continue;
-   }
-
-   address = pte_val(*pte) & PAGE_MASK;
-   page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
-   }
-   pgste_set_unlock(pte, pgste);
-   } while (pte++, addr += PAGE_SIZE, addr != end);
-   pte_unmap_unlock(start_pte, ptl);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t 
*pud,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pmd_t *pmd;
-
-   pmd = pmd_offset(pud, addr);
-   do {
-   next = pmd_addr_end(addr, end);
-   if (pmd_none_or_clear_bad(pmd))
-   continue;
-   next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
-   } while (pmd++, addr = next, addr != end);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t 
*pgd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pud_t *pud;
-
-   pud = pud_offset(pgd, addr);
-   do {
-   next = pud_addr_end(addr, end);
-   if (pud_none_or_clear_bad(pud))
-   continue;
-   next = page_table_reset_pmd(mm, pud, 

Re: [PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-22 Thread Dominik Dingel
On Wed, 22 Oct 2014 12:09:31 +0200
Paolo Bonzini  wrote:

> On 10/22/2014 10:30 AM, Dominik Dingel wrote:
> > As use_skey is already the condition on which we call s390_enable_skey
> > we need to introduce a new flag for the mm->context on which we decide
> > if zero page mapping is allowed.
> 
> Can you explain better why "mm->context.use_skey = 1" cannot be done
> before the walk_page_range?  Where does the walk or __s390_enable_skey
> or (after the next patch) ksm_madvise rely on
> "mm->context.forbids_zeropage && !mm->context.use_skey"?

I can't, my reasoning there is wrong.
I remembered incorrectly that we use mm_use_skey in arch/s390/kvm/priv.c to
check if we need to call s390_enable_skey, but that does happen
with the interception bits.

So every vCPU which gets an interception for a storage key instruction
will call s390_enable_skey and wait there for the mmap_sem.

> The only reason I can think of, is that the next patch does not reset
> "mm->context.forbids_zeropage" to 0 if the ksm_madvise fails.  Why
> doesn't it do that---or is it a bug?

You are right, this is a bug, where we will drop to userspace with -ENOMEM.

I will fix this as well. 


> Thanks, and sorry for the flurry of questions! :)

I really appreciate your questions and remarks. Thank you!

> Paolo
> 

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-22 Thread Dominik Dingel
As soon as storage keys are enabled we need to stop working on zero page
mappings to prevent inconsistencies between storage keys and pgste.

Otherwise following data corruption could happen:
1) guest enables storage key
2) guest sets storage key for not mapped page X
   -> change goes to PGSTE
3) guest reads from page X
   -> as X was not dirty before, the page will be zero page backed,
  storage key from PGSTE for X will go to storage key for zero page
4) guest sets storage key for not mapped page Y (same logic as above)
5) guest reads from page Y
   -> as Y was not dirty before, the page will be zero page backed,
  storage key from PGSTE for Y will got to storage key for zero page
  overwriting storage key for X

While holding the mmap sem, we are safe against changes on entries we
already fixed, as every fault would need to take the mmap_sem (read).
As sske and host large pages are also mutually exclusive we do not even
need to retry the fixup_user_fault.

As use_skey is already the condition on which we call s390_enable_skey
we need to introduce a new flag for the mm->context on which we decide
if zero page mapping is allowed.

Signed-off-by: Dominik Dingel 
---
 arch/s390/include/asm/mmu.h |  2 ++
 arch/s390/include/asm/pgtable.h | 14 ++
 arch/s390/mm/pgtable.c  | 12 
 3 files changed, 28 insertions(+)

diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index a5e6562..0f38469 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -18,6 +18,8 @@ typedef struct {
unsigned int has_pgste:1;
/* The mmu context uses storage keys. */
unsigned int use_skey:1;
+   /* The mmu context forbids zeropage mappings. */
+   unsigned int forbids_zeropage:1;
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(name)\
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1e991f6a..fe3cfdf 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -481,6 +481,20 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
 }
 
+/*
+ * In the case that a guest uses storage keys
+ * faults should no longer be backed by zero pages
+ */
+#define mm_forbids_zeropage mm_forbids_zeropage
+static inline int mm_forbids_zeropage(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
+   if (mm->context.forbids_zeropage)
+   return 1;
+#endif
+   return 0;
+}
+
 static inline int mm_use_skey(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab55ba8..1e06fbc 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1309,6 +1309,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
pgste_t pgste;
 
pgste = pgste_get_lock(pte);
+   /*
+* Remove all zero page mappings,
+* after establishing a policy to forbid zero page mappings
+* following faults for that page will get fresh anonymous pages
+*/
+   if (is_zero_pfn(pte_pfn(*pte))) {
+   ptep_flush_direct(walk->mm, addr, pte);
+   pte_val(*pte) = _PAGE_INVALID;
+   }
/* Clear storage key */
pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
  PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -1327,6 +1336,9 @@ void s390_enable_skey(void)
down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
+
+   mm->context.forbids_zeropage = 1;
+
walk.mm = mm;
walk_page_range(0, TASK_SIZE, &walk);
mm->context.use_skey = 1;
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] s390/mm: disable KSM for storage key enabled pages

2014-10-22 Thread Dominik Dingel
When storage keys are enabled unmerge already merged pages and prevent
new pages from being merged.

Signed-off-by: Dominik Dingel 
Acked-by: Christian Borntraeger 
---
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/priv.c| 17 -
 arch/s390/mm/pgtable.c  | 15 ++-
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index fe3cfdf..20f3186 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1763,7 +1763,7 @@ static inline pte_t mk_swap_pte(unsigned long type, 
unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
-extern void s390_enable_skey(void);
+extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f89c1cd..e0967fd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -156,21 +156,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static void __skey_check_enable(struct kvm_vcpu *vcpu)
+static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
+   int rc = 0;
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
-   return;
+   return rc;
 
-   s390_enable_skey();
+   rc = s390_enable_skey();
trace_kvm_s390_skey_related_inst(vcpu);
vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+   return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
 
+   if (rc)
+   return rc;
vcpu->stat.instruction_storage_key++;
 
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -692,7 +696,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
 
if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
+
+   if (rc)
+   return rc;
if (set_guest_storage_key(current->mm, useraddr,
vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 1e06fbc..798ab49 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,8 @@
 #include 
 #include 
 #include 
+#include <linux/ksm.h>
+#include <linux/mman.h>
 
 #include 
 #include 
@@ -1328,16 +1330,26 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
return 0;
 }
 
-void s390_enable_skey(void)
+int s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
+   int rc = 0;
 
down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
 
mm->context.forbids_zeropage = 1;
+   for (vma = mm->mmap; vma; vma = vma->vm_next) {
+   if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+   MADV_UNMERGEABLE, &vma->vm_flags)) {
+   rc = -ENOMEM;
+   goto out_up;
+   }
+   }
+   mm->def_flags &= ~VM_MERGEABLE;
 
walk.mm = mm;
walk_page_range(0, TASK_SIZE, &walk);
@@ -1345,6 +1357,7 @@ void s390_enable_skey(void)
 
 out_up:
up_write(&mm->mmap_sem);
+   return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
 
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v2 0/4] mm: new function to forbid zeropage mappings for a process

2014-10-22 Thread Dominik Dingel
s390 has the special notion of storage keys which are some sort of page flags
associated with physical pages and live outside of direct addressable memory.
These storage keys can be queried and changed with a special set of 
instructions.
The mentioned instructions behave quite nicely under virtualization, if there 
is: 
- an invalid pte, then the instructions will work on memory in the host page 
table
- a valid pte, then the instructions will work with the real storage key

Thanks to Martin with his software reference and dirty bit tracking,
the kernel does not issue any storage key instructions as now a 
software based approach will be taken, on the other hand distributions 
in the wild are currently using them.

However, for virtualized guests we still have a problem with guest pages 
mapped to zero pages and the kernel same page merging.  
With each one multiple guest pages will point to the same physical page
and share the same storage key.

Let's fix this by introducing a new function which s390 will define to
forbid new zero page mappings.  If the guest issues a storage key related 
instruction we flag the mm_struct, drop existing zero page mappings
and unmerge the guest memory.

v1 -> v2: 
 - Following Dave and Paolo suggestion removing the vma flag

Dominik Dingel (4):
  s390/mm: recfactor global pgste updates
  mm: introduce mm_forbids_zeropage function
  s390/mm: prevent and break zero page mappings in case of storage keys
  s390/mm: disable KSM for storage key enabled pages

 arch/s390/include/asm/mmu.h |   2 +
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |  17 +++-
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/kvm/priv.c|  17 ++--
 arch/s390/mm/pgtable.c  | 180 ++--
 include/linux/mm.h  |   4 +
 mm/huge_memory.c|   2 +-
 mm/memory.c |   2 +-
 9 files changed, 117 insertions(+), 111 deletions(-)

-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] s390/mm: refactor global pgste updates

2014-10-22 Thread Dominik Dingel
Replace the s390 specific page table walker for the pgste updates
with a call to the common code walk_page_range function.
There are now two pte modification functions, one for the reset
of the CMMA state and another one for the initialization of the
storage keys.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   1 +
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/mm/pgtable.c  | 153 ++--
 4 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 9e18a61..120e126 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned 
long);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *);
 
-void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long,
-   bool init_skey);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
  unsigned long key, bool nq);
 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5efb2fe..1e991f6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1750,6 +1750,7 @@ extern int vmem_add_mapping(unsigned long start, unsigned 
long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern void s390_enable_skey(void);
+extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
  * No page table caches to initialise
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 81b0e11..7a33c11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -281,7 +281,7 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct 
kvm_device_attr *attr)
case KVM_S390_VM_MEM_CLR_CMMA:
 	mutex_lock(&kvm->lock);
 	idx = srcu_read_lock(&kvm->srcu);
-	page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false);
+	s390_reset_cmma(kvm->arch.gmap->mm);
 	srcu_read_unlock(&kvm->srcu, idx);
 	mutex_unlock(&kvm->lock);
ret = 0;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5404a62..ab55ba8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -885,99 +885,6 @@ static inline void page_table_free_pgste(unsigned long 
*table)
__free_page(page);
 }
 
-static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t 
*pmd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   pte_t *start_pte, *pte;
-   spinlock_t *ptl;
-   pgste_t pgste;
-
-   start_pte = pte_offset_map_lock(mm, pmd, addr, ptl);
-   pte = start_pte;
-   do {
-   pgste = pgste_get_lock(pte);
-   pgste_val(pgste) = ~_PGSTE_GPS_USAGE_MASK;
-   if (init_skey) {
-   unsigned long address;
-
-   pgste_val(pgste) = ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
- PGSTE_GR_BIT | PGSTE_GC_BIT);
-
-   /* skip invalid and not writable pages */
-   if (pte_val(*pte)  _PAGE_INVALID ||
-   !(pte_val(*pte)  _PAGE_WRITE)) {
-   pgste_set_unlock(pte, pgste);
-   continue;
-   }
-
-   address = pte_val(*pte)  PAGE_MASK;
-   page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
-   }
-   pgste_set_unlock(pte, pgste);
-   } while (pte++, addr += PAGE_SIZE, addr != end);
-   pte_unmap_unlock(start_pte, ptl);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t 
*pud,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pmd_t *pmd;
-
-   pmd = pmd_offset(pud, addr);
-   do {
-   next = pmd_addr_end(addr, end);
-   if (pmd_none_or_clear_bad(pmd))
-   continue;
-   next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
-   } while (pmd++, addr = next, addr != end);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t 
*pgd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pud_t *pud;
-
-   pud = pud_offset(pgd, addr);
-   do {
-   next = pud_addr_end(addr, end);
-   if (pud_none_or_clear_bad(pud))
-   continue;
-   next = page_table_reset_pmd(mm, pud, addr, next

[PATCH 4/4] s390/mm: disable KSM for storage key enabled pages

2014-10-22 Thread Dominik Dingel
When storage keys are enabled unmerge already merged pages and prevent
new pages from being merged.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
Acked-by: Christian Borntraeger borntrae...@de.ibm.com
---
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/priv.c| 17 -
 arch/s390/mm/pgtable.c  | 15 ++-
 3 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index fe3cfdf..20f3186 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1763,7 +1763,7 @@ static inline pte_t mk_swap_pte(unsigned long type, 
unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
-extern void s390_enable_skey(void);
+extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f89c1cd..e0967fd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -156,21 +156,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static void __skey_check_enable(struct kvm_vcpu *vcpu)
+static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
+   int rc = 0;
 	if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
-		return;
+		return rc;
 
-	s390_enable_skey();
+	rc = s390_enable_skey();
 	trace_kvm_s390_skey_related_inst(vcpu);
 	vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+   return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
 
+   if (rc)
+   return rc;
 	vcpu->stat.instruction_storage_key++;
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -692,7 +696,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
 
 	if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-		__skey_check_enable(vcpu);
+		int rc = __skey_check_enable(vcpu);
+
+		if (rc)
+			return rc;
 		if (set_guest_storage_key(current->mm, useraddr,
 				vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
 				vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 1e06fbc..798ab49 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,8 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1328,16 +1330,26 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
return 0;
 }
 
-void s390_enable_skey(void)
+int s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int rc = 0;
 
 	down_write(&mm->mmap_sem);
 	if (mm_use_skey(mm))
 		goto out_up;
 
 	mm->context.forbids_zeropage = 1;
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+				MADV_UNMERGEABLE, &vma->vm_flags)) {
+			rc = -ENOMEM;
+			goto out_up;
+		}
+	}
+	mm->def_flags &= ~VM_MERGEABLE;
 
 	walk.mm = mm;
 	walk_page_range(0, TASK_SIZE, &walk);
@@ -1345,6 +1357,7 @@ void s390_enable_skey(void)
 
 out_up:
 	up_write(&mm->mmap_sem);
+   return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
 
-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-22 Thread Dominik Dingel
As soon as storage keys are enabled we need to stop working on zero page
mappings to prevent inconsistencies between storage keys and pgste.

Otherwise the following data corruption could happen:
1) guest enables storage keys
2) guest sets storage key for a not mapped page X
   -> change goes to the PGSTE
3) guest reads from page X
   -> as X was not dirty before, the page will be zero page backed;
      the storage key from the PGSTE for X goes to the storage key of the zero page
4) guest sets storage key for a not mapped page Y (same logic as above)
5) guest reads from page Y
   -> as Y was not dirty before, the page will be zero page backed;
      the storage key from the PGSTE for Y goes to the storage key of the zero page,
      overwriting the storage key for X

While holding the mmap_sem, we are safe against changes on entries we
already fixed, as every fault would need to take the mmap_sem (read).
As sske and host large pages are also mutually exclusive, we do not even
need to retry the fixup_user_fault.

As use_skey is already the condition on which we call s390_enable_skey,
we need to introduce a new flag in the mm->context on which we decide
whether zero page mappings are allowed.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 arch/s390/include/asm/mmu.h |  2 ++
 arch/s390/include/asm/pgtable.h | 14 ++
 arch/s390/mm/pgtable.c  | 12 
 3 files changed, 28 insertions(+)

diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index a5e6562..0f38469 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -18,6 +18,8 @@ typedef struct {
unsigned int has_pgste:1;
/* The mmu context uses storage keys. */
unsigned int use_skey:1;
+   /* The mmu context forbids zeropage mappings. */
+   unsigned int forbids_zeropage:1;
 } mm_context_t;
 
 #define INIT_MM_CONTEXT(name)\
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1e991f6a..fe3cfdf 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -481,6 +481,20 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
 }
 
+/*
+ * In the case that a guest uses storage keys
+ * faults should no longer be backed by zero pages
+ */
+#define mm_forbids_zeropage mm_forbids_zeropage
+static inline int mm_forbids_zeropage(struct mm_struct *mm)
+{
+#ifdef CONFIG_PGSTE
	if (mm->context.forbids_zeropage)
+   return 1;
+#endif
+   return 0;
+}
+
 static inline int mm_use_skey(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab55ba8..1e06fbc 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1309,6 +1309,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
pgste_t pgste;
 
pgste = pgste_get_lock(pte);
+   /*
+* Remove all zero page mappings,
+* after establishing a policy to forbid zero page mappings
+* following faults for that page will get fresh anonymous pages
+*/
+   if (is_zero_pfn(pte_pfn(*pte))) {
+		ptep_flush_direct(walk->mm, addr, pte);
+		pte_val(*pte) = _PAGE_INVALID;
+	}
 	/* Clear storage key */
 	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
  PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -1327,6 +1336,9 @@ void s390_enable_skey(void)
 	down_write(&mm->mmap_sem);
 	if (mm_use_skey(mm))
 		goto out_up;
+
+	mm->context.forbids_zeropage = 1;
+
 	walk.mm = mm;
 	walk_page_range(0, TASK_SIZE, &walk);
 	mm->context.use_skey = 1;
-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] mm: introduce mm_forbids_zeropage function

2014-10-22 Thread Dominik Dingel
Add a new function stub to allow architectures to disable, for a given
mm_struct, the backing of non-present, anonymous pages with the
read-only empty zero page.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 include/linux/mm.h | 4 
 mm/huge_memory.c   | 2 +-
 mm/memory.c| 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd33ae2..0a2022e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -56,6 +56,10 @@ extern int sysctl_legacy_va_layout;
 #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
 #endif
 
+#ifndef mm_forbids_zeropage
+#define mm_forbids_zeropage(X)  (0)
+#endif
+
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de98415..357a381 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -805,7 +805,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma-vm_flags)))
return VM_FAULT_OOM;
-	if (!(flags & FAULT_FLAG_WRITE) &&
+	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
diff --git a/mm/memory.c b/mm/memory.c
index 64f82aa..f275a9d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2640,7 +2640,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_SIGBUS;
 
/* Use the zero-page for reads */
-	if (!(flags & FAULT_FLAG_WRITE)) {
+	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, ptl);
-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-22 Thread Dominik Dingel
On Wed, 22 Oct 2014 12:09:31 +0200
Paolo Bonzini pbonz...@redhat.com wrote:

 On 10/22/2014 10:30 AM, Dominik Dingel wrote:
  As use_skey is already the condition on which we call s390_enable_skey
  we need to introduce a new flag for the mm-context on which we decide
  if zero page mapping is allowed.
 
 Can you explain better why mm->context.use_skey = 1 cannot be done
 before the walk_page_range?  Where does the walk or __s390_enable_skey
 or (after the next patch) ksm_madvise rely on
 mm->context.forbids_zeropage && !mm->context.use_skey?

I can't, my reasoning there is wrong.
I remembered incorrectly that we use mm_use_skey in arch/s390/kvm/priv.c to
check if we need to call s390_enable_skey, but that does happen
with the interception bits.

So every vCPU which gets an interception for a storage key instruction
will call s390_enable_skey and wait there for the mmap_sem.

 The only reason I can think of, is that the next patch does not reset
 mm->context.forbids_zeropage to 0 if the ksm_madvise fails.  Why
 doesn't it do that---or is it a bug?

You are right, this is a bug, where we will drop to userspace with -ENOMEM.

I will fix this as well. 


 Thanks, and sorry for the flurry of questions! :)

I really appreciate your questions and remarks. Thank you!

 Paolo
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] s390/mm: refactor global pgste updates

2014-10-22 Thread Dominik Dingel
Replace the s390 specific page table walker for the pgste updates
with a call to the common code walk_page_range function.
There are now two pte modification functions, one for the reset
of the CMMA state and another one for the initialization of the
storage keys.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   1 +
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/mm/pgtable.c  | 153 ++--
 4 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 9e18a61..120e126 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned 
long);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *);
 
-void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long,
-   bool init_skey);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
  unsigned long key, bool nq);
 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5efb2fe..1e991f6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1750,6 +1750,7 @@ extern int vmem_add_mapping(unsigned long start, unsigned 
long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern void s390_enable_skey(void);
+extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
  * No page table caches to initialise
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 81b0e11..7a33c11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -281,7 +281,7 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct 
kvm_device_attr *attr)
case KVM_S390_VM_MEM_CLR_CMMA:
 	mutex_lock(&kvm->lock);
 	idx = srcu_read_lock(&kvm->srcu);
-	page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false);
+	s390_reset_cmma(kvm->arch.gmap->mm);
 	srcu_read_unlock(&kvm->srcu, idx);
 	mutex_unlock(&kvm->lock);
ret = 0;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5404a62..ab55ba8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -885,99 +885,6 @@ static inline void page_table_free_pgste(unsigned long 
*table)
__free_page(page);
 }
 
-static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t 
*pmd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   pte_t *start_pte, *pte;
-   spinlock_t *ptl;
-   pgste_t pgste;
-
-   start_pte = pte_offset_map_lock(mm, pmd, addr, ptl);
-   pte = start_pte;
-   do {
-   pgste = pgste_get_lock(pte);
-   pgste_val(pgste) = ~_PGSTE_GPS_USAGE_MASK;
-   if (init_skey) {
-   unsigned long address;
-
-   pgste_val(pgste) = ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
- PGSTE_GR_BIT | PGSTE_GC_BIT);
-
-   /* skip invalid and not writable pages */
-   if (pte_val(*pte)  _PAGE_INVALID ||
-   !(pte_val(*pte)  _PAGE_WRITE)) {
-   pgste_set_unlock(pte, pgste);
-   continue;
-   }
-
-   address = pte_val(*pte)  PAGE_MASK;
-   page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
-   }
-   pgste_set_unlock(pte, pgste);
-   } while (pte++, addr += PAGE_SIZE, addr != end);
-   pte_unmap_unlock(start_pte, ptl);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t 
*pud,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pmd_t *pmd;
-
-   pmd = pmd_offset(pud, addr);
-   do {
-   next = pmd_addr_end(addr, end);
-   if (pmd_none_or_clear_bad(pmd))
-   continue;
-   next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
-   } while (pmd++, addr = next, addr != end);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t 
*pgd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pud_t *pud;
-
-   pud = pud_offset(pgd, addr);
-   do {
-   next = pud_addr_end(addr, end);
-   if (pud_none_or_clear_bad(pud))
-   continue;
-   next = page_table_reset_pmd(mm, pud, addr, next

[PATCH 4/4] s390/mm: disable KSM for storage key enabled pages

2014-10-22 Thread Dominik Dingel
When storage keys are enabled unmerge already merged pages and prevent
new pages from being merged.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
Acked-by: Christian Borntraeger borntrae...@de.ibm.com
---
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/priv.c| 17 -
 arch/s390/mm/pgtable.c  | 16 +++-
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 0da98d6..dfb38af 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1754,7 +1754,7 @@ static inline pte_t mk_swap_pte(unsigned long type, 
unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
-extern void s390_enable_skey(void);
+extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f89c1cd..e0967fd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -156,21 +156,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static void __skey_check_enable(struct kvm_vcpu *vcpu)
+static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
+   int rc = 0;
 	if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
-		return;
+		return rc;
 
-	s390_enable_skey();
+	rc = s390_enable_skey();
 	trace_kvm_s390_skey_related_inst(vcpu);
 	vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+   return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
 
+   if (rc)
+   return rc;
 	vcpu->stat.instruction_storage_key++;
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -692,7 +696,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
 
 	if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-		__skey_check_enable(vcpu);
+		int rc = __skey_check_enable(vcpu);
+
+		if (rc)
+			return rc;
 		if (set_guest_storage_key(current->mm, useraddr,
 				vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
 				vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 58d7eb2..82aa528 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,8 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1328,22 +1330,34 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
return 0;
 }
 
-void s390_enable_skey(void)
+int s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int rc = 0;
 
 	down_write(&mm->mmap_sem);
 	if (mm_use_skey(mm))
 		goto out_up;
 
 	mm->context.use_skey = 1;
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+				MADV_UNMERGEABLE, &vma->vm_flags)) {
+			mm->context.use_skey = 0;
+			rc = -ENOMEM;
+			goto out_up;
+		}
+	}
+	mm->def_flags &= ~VM_MERGEABLE;
 
 	walk.mm = mm;
 	walk_page_range(0, TASK_SIZE, &walk);
 
 out_up:
 	up_write(&mm->mmap_sem);
+   return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
 
-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] mm: introduce mm_forbids_zeropage function

2014-10-22 Thread Dominik Dingel
Add a new function stub to allow architectures to disable, for a given
mm_struct, the backing of non-present, anonymous pages with the
read-only empty zero page.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 include/linux/mm.h | 4 
 mm/huge_memory.c   | 2 +-
 mm/memory.c| 2 +-
 3 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd33ae2..0a2022e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -56,6 +56,10 @@ extern int sysctl_legacy_va_layout;
 #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
 #endif
 
+#ifndef mm_forbids_zeropage
+#define mm_forbids_zeropage(X)  (0)
+#endif
+
 extern unsigned long sysctl_user_reserve_kbytes;
 extern unsigned long sysctl_admin_reserve_kbytes;
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de98415..357a381 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -805,7 +805,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma-vm_flags)))
return VM_FAULT_OOM;
-	if (!(flags & FAULT_FLAG_WRITE) &&
+	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
diff --git a/mm/memory.c b/mm/memory.c
index 64f82aa..f275a9d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2640,7 +2640,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_SIGBUS;
 
/* Use the zero-page for reads */
-	if (!(flags & FAULT_FLAG_WRITE)) {
+	if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
 						vma->vm_page_prot));
page_table = pte_offset_map_lock(mm, pmd, address, ptl);
-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-22 Thread Dominik Dingel
As soon as storage keys are enabled we need to stop working on zero page
mappings to prevent inconsistencies between storage keys and pgste.

Otherwise the following data corruption could happen:
1) guest enables storage keys
2) guest sets storage key for a not mapped page X
   -> change goes to the PGSTE
3) guest reads from page X
   -> as X was not dirty before, the page will be zero page backed;
      the storage key from the PGSTE for X goes to the storage key of the zero page
4) guest sets storage key for a not mapped page Y (same logic as above)
5) guest reads from page Y
   -> as Y was not dirty before, the page will be zero page backed;
      the storage key from the PGSTE for Y goes to the storage key of the zero page,
      overwriting the storage key for X

While holding the mmap_sem, we are safe against changes on entries we
already fixed, as every fault would need to take the mmap_sem (read).

Other vCPUs executing storage key instructions will get a one-time
interception and will also be serialized on the mmap_sem.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
---
 arch/s390/include/asm/pgtable.h |  5 +
 arch/s390/mm/pgtable.c  | 13 -
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1e991f6a..0da98d6 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -481,6 +481,11 @@ static inline int mm_has_pgste(struct mm_struct *mm)
return 0;
 }
 
+/*
+ * In the case that a guest uses storage keys
+ * faults should no longer be backed by zero pages
+ */
+#define mm_forbids_zeropage mm_use_skey
 static inline int mm_use_skey(struct mm_struct *mm)
 {
 #ifdef CONFIG_PGSTE
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab55ba8..58d7eb2 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1309,6 +1309,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
pgste_t pgste;
 
pgste = pgste_get_lock(pte);
+   /*
+* Remove all zero page mappings,
+* after establishing a policy to forbid zero page mappings
+* following faults for that page will get fresh anonymous pages
+*/
+   if (is_zero_pfn(pte_pfn(*pte))) {
+		ptep_flush_direct(walk->mm, addr, pte);
+   pte_val(*pte) = _PAGE_INVALID;
+   }
/* Clear storage key */
 	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
  PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -1327,9 +1336,11 @@ void s390_enable_skey(void)
 	down_write(&mm->mmap_sem);
 	if (mm_use_skey(mm))
 		goto out_up;
+
+	mm->context.use_skey = 1;
+
 	walk.mm = mm;
 	walk_page_range(0, TASK_SIZE, &walk);
-	mm->context.use_skey = 1;
 
 out_up:
 	up_write(&mm->mmap_sem);
-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH v3 0/4] mm: new function to forbid zeropage mappings for a process

2014-10-22 Thread Dominik Dingel
s390 has the special notion of storage keys which are some sort of page flags
associated with physical pages and live outside of direct addressable memory.
These storage keys can be queried and changed with a special set of 
instructions.
The mentioned instructions behave quite nicely under virtualization, if there 
is: 
- an invalid pte, then the instructions will work on memory in the host page 
table
- a valid pte, then the instructions will work with the real storage key

Thanks to Martin's software reference and dirty bit tracking, the kernel
itself no longer issues any storage key instructions and takes a purely
software-based approach instead; distributions in the wild, however, still
use them.

However, for virtualized guests we still have a problem with guest pages
mapped to zero pages and with kernel samepage merging (KSM): in both cases
multiple guest pages point to the same physical page and therefore share
the same storage key.

Let's fix this by introducing a new function which s390 will define to
forbid new zero page mappings.  If the guest issues a storage key related 
instruction we flag the mm_struct, drop existing zero page mappings
and unmerge the guest memory.

v2 - v3:
 - Clearing up patch description Patch 3/4
 - removing unnecessary flag in mmu_context (Paolo)

v1 - v2: 
 - Following Dave and Paolo suggestion removing the vma flag

Dominik Dingel (4):
  s390/mm: refactor global pgste updates
  mm: introduce mm_forbids_zeropage function
  s390/mm: prevent and break zero page mappings in case of storage keys
  s390/mm: disable KSM for storage key enabled pages

 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   8 +-
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/kvm/priv.c|  17 ++--
 arch/s390/mm/pgtable.c  | 180 ++--
 include/linux/mm.h  |   4 +
 mm/huge_memory.c|   2 +-
 mm/memory.c |   2 +-
 8 files changed, 106 insertions(+), 111 deletions(-)

-- 
1.8.5.5

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/4] mm: introduce mm_forbids_zeropage function

2014-10-22 Thread Dominik Dingel
On Wed, 22 Oct 2014 12:22:23 -0700
Andrew Morton a...@linux-foundation.org wrote:

 On Wed, 22 Oct 2014 13:09:28 +0200 Dominik Dingel din...@linux.vnet.ibm.com 
 wrote:
 
  Add a new function stub to allow architectures to disable for
  an mm_structthe backing of non-present, anonymous pages with
  read-only empty zero pages.
  
  ...
 
  --- a/include/linux/mm.h
  +++ b/include/linux/mm.h
  @@ -56,6 +56,10 @@ extern int sysctl_legacy_va_layout;
   #define __pa_symbol(x)  __pa(RELOC_HIDE((unsigned long)(x), 0))
   #endif
   
  +#ifndef mm_forbids_zeropage
  +#define mm_forbids_zeropage(X)  (0)
  +#endif
 
 Can we document this please?  What it does, why it does it.  We should
 also specify precisely which arch header file is responsible for
 defining mm_forbids_zeropage.
 

I will add a comment like:

/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This function should be implemented within asm/pgtable.h.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */

Okay?
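
Putting the pieces together, the documented default (patch 2/4) and the s390
override (patch 3/4 of v3) would then look roughly like this; a sketch of how
the two headers fit together, not the final committed wording:

/* include/linux/mm.h */
/*
 * To prevent common memory management code establishing
 * a zero page mapping on a read fault.
 * This function should be implemented within asm/pgtable.h.
 * s390 does this to prevent multiplexing of hardware bits
 * related to the physical page in case of virtualization.
 */
#ifndef mm_forbids_zeropage
#define mm_forbids_zeropage(X)	(0)
#endif

/* arch/s390/include/asm/pgtable.h */
#define mm_forbids_zeropage mm_use_skey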


 --
 To unsubscribe, send a message with 'unsubscribe linux-mm' in
 the body to majord...@kvack.org.  For more info on Linux MM,
 see: http://www.linux-mm.org/ .
 Don't email: a href=mailto:d...@kvack.org; em...@kvack.org /a
 

--
To unsubscribe from this list: send the line unsubscribe linux-kernel in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


Re: [PATCH 2/4] mm: introduce new VM_NOZEROPAGE flag

2014-10-21 Thread Dominik Dingel
On Tue, 21 Oct 2014 10:11:43 +0200
Paolo Bonzini  wrote:

> 
> 
> On 10/21/2014 08:11 AM, Martin Schwidefsky wrote:
> >> I agree with Dave (I thought I disagreed, but I changed my mind while
> >> writing down my thoughts).  Just define mm_forbids_zeropage in
> >> arch/s390/include/asm, and make it return mm->context.use_skey---with a
> >> comment explaining how this is only for processes that use KVM, and then
> >> only for guests that use storage keys.
> >
> > The mm_forbids_zeropage() sure will work for now, but I think a vma flag
> > is the better solution. This is analog to VM_MERGEABLE or VM_NOHUGEPAGE,
> > the best solution would be to only mark those vmas that are mapped to
> > the guest. That we have not found a way to do that yet in a sensible way
> > does not change the fact that "no-zero-page" is a per-vma property, no?
> 
> I agree it should be per-VMA.  However, right now the code is 
> complicated unnecessarily by making it a per-VMA flag.  Also, setting 
> the flag per VMA should probably be done in 
> kvm_arch_prepare_memory_region together with some kind of storage key 
> notifier.  This is not very much like Dominik's patch.  All in all, 
> mm_forbids_zeropage() provides a non-intrusive and non-controversial way 
> to fix the bug.  Later on, switching to vma_forbids_zeropage() will be 
> trivial as far as mm/ code is concerned.
> 

Thank you for all the feedback, will cook up a new version.

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/




Re: [PATCH 2/4] mm: introduce new VM_NOZEROPAGE flag

2014-10-18 Thread Dominik Dingel
On Fri, 17 Oct 2014 15:04:21 -0700
Dave Hansen  wrote:
 
> Is there ever a time where the VMAs under an mm have mixed VM_NOZEROPAGE
> status?  Reading the patches, it _looks_ like it might be an all or
> nothing thing.

Currently it is an all-or-nothing thing, but for a future change we might
want to just tag the guest memory instead of the complete user address space.

> Full disclosure: I've got an x86-specific feature I want to steal a flag
> for.  Maybe we should just define another VM_ARCH bit.
> 

So you think of something like:

#if defined(CONFIG_S390)
# define VM_NOZEROPAGE  VM_ARCH_1
#endif

#ifndef VM_NOZEROPAGE
# define VM_NOZEROPAGE  VM_NONE
#endif

right?
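
One side benefit of the VM_ARCH_1/VM_NONE fallback sketched above: the
per-VMA helper added by patch 2/4 of the original posting (further down this
thread) could then drop its CONFIG_NOZEROPAGE ifdef, since vm_flags & VM_NONE
is always 0 on architectures that do not define the flag. A rough sketch
building on the defines above, not taken from the posted patches:

static inline int vma_forbids_zeropage(struct vm_area_struct *vma)
{
	/* compiles away to 0 where VM_NOZEROPAGE is defined as VM_NONE */
	return !!(vma->vm_flags & VM_NOZEROPAGE);
}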


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/




[PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-17 Thread Dominik Dingel
As soon as storage keys are enabled we need to stop using zero page
mappings to prevent inconsistencies between storage keys and pgste.

Otherwise following data corruption could happen:
1) guest enables storage key
2) guest sets storage key for not mapped page X
   -> change goes to PGSTE
3) guest reads from page X
   -> as X was not dirty before, the page will be zero page backed,
  storage key from PGSTE for X will go to storage key for zero page
4) guest sets storage key for not mapped page Y (same logic as above)
5) guest reads from page Y
   -> as Y was not dirty before, the page will be zero page backed,
  storage key from PGSTE for Y will go to storage key for zero page
  overwriting storage key for X

While holding the mmap_sem, we are safe against changes on entries we
already fixed. As sske and host large pages are also mutually exclusive,
we do not even need to retry the fixup_user_fault.

Signed-off-by: Dominik Dingel 
Acked-by: Christian Borntraeger 
Signed-off-by: Martin Schwidefsky 
---
 arch/s390/Kconfig  |  3 +++
 arch/s390/mm/pgtable.c | 15 +++
 2 files changed, 18 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 05c78bb..4e04e63 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -1,6 +1,9 @@
 config MMU
def_bool y
 
+config NOZEROPAGE
+   def_bool y
+
 config ZONE_DMA
def_bool y
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab55ba8..6321692 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1309,6 +1309,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
pgste_t pgste;
 
pgste = pgste_get_lock(pte);
+   /*
+* Remove all zero page mappings,
+* after establishing a policy to forbid zero page mappings
+* following faults for that page will get fresh anonymous pages
+*/
+   if (is_zero_pfn(pte_pfn(*pte))) {
+   ptep_flush_direct(walk->mm, addr, pte);
+   pte_val(*pte) = _PAGE_INVALID;
+   }
/* Clear storage key */
pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
  PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -1323,10 +1332,16 @@ void s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
 
 	down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
+
+   for (vma = mm->mmap; vma; vma = vma->vm_next)
+   vma->vm_flags |= VM_NOZEROPAGE;
+   mm->def_flags |= VM_NOZEROPAGE;
+
walk.mm = mm;
 	walk_page_range(0, TASK_SIZE, &walk);
mm->context.use_skey = 1;
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 0/4] mm: new flag to forbid zero page mappings for a vma

2014-10-17 Thread Dominik Dingel
s390 has the special notion of storage keys which are some sort of page flags
associated with physical pages and live outside of direct addressable memory.
These storage keys can be queried and changed with a special set of 
instructions.
The mentioned instructions behave quite nicely under virtualization, if there 
is: 
- an invalid pte, then the instructions will work on some memory reserved in 
the host page table
- a valid pte, then the instructions will work with the real storage key

Thanks to Martin's software reference and dirty bit tracking, the kernel
itself no longer issues any storage key instructions and takes a purely
software-based approach instead; distributions in the wild, however, still
use them.

However, for virtualized guests we still have a problem with guest pages
mapped to zero pages and with kernel samepage merging (KSM): in both cases
multiple guest pages point to the same physical page and therefore share
the same storage key.

Let's fix this by introducing a new flag which will forbid new zero page
mappings.  If the guest issues a storage key related instruction we flag
all vmas, drop existing zero page mappings and unmerge the guest memory.

Dominik Dingel (4):
  s390/mm: refactor global pgste updates
  mm: introduce new VM_NOZEROPAGE flag
  s390/mm: prevent and break zero page mappings in case of storage keys
  s390/mm: disable KSM for storage key enabled pages

 arch/s390/Kconfig   |   3 +
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   3 +-
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/kvm/priv.c|  17 ++--
 arch/s390/mm/pgtable.c  | 181 ++--
 include/linux/mm.h  |  13 ++-
 mm/huge_memory.c|   2 +-
 mm/memory.c |   2 +-
 9 files changed, 112 insertions(+), 113 deletions(-)

-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 4/4] s390/mm: disable KSM for storage key enabled pages

2014-10-17 Thread Dominik Dingel
When storage keys are enabled unmerge already merged pages and prevent
new pages from being merged.

Signed-off-by: Dominik Dingel 
Acked-by: Christian Borntraeger 
Signed-off-by: Martin Schwidefsky 
---
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/priv.c| 17 -
 arch/s390/mm/pgtable.c  | 15 +--
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1e991f6a..a5362e4 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1749,7 +1749,7 @@ static inline pte_t mk_swap_pte(unsigned long type, 
unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
-extern void s390_enable_skey(void);
+extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f89c1cd..e0967fd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -156,21 +156,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static void __skey_check_enable(struct kvm_vcpu *vcpu)
+static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
+   int rc = 0;
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
-   return;
+   return rc;
 
-   s390_enable_skey();
+   rc = s390_enable_skey();
trace_kvm_s390_skey_related_inst(vcpu);
vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+   return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
 
+   if (rc)
+   return rc;
vcpu->stat.instruction_storage_key++;
 
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -692,7 +696,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
 
if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
+
+   if (rc)
+   return rc;
if (set_guest_storage_key(current->mm, useraddr,
vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 6321692..b3311c1 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,8 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1328,18 +1330,26 @@ static int __s390_enable_skey(pte_t *pte, unsigned long 
addr,
return 0;
 }
 
-void s390_enable_skey(void)
+int s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+   int rc = 0;
 
 	down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
 
-   for (vma = mm->mmap; vma; vma = vma->vm_next)
+   for (vma = mm->mmap; vma; vma = vma->vm_next) {
+   if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+				MADV_UNMERGEABLE, &vma->vm_flags)) {
+   rc = -ENOMEM;
+   goto out_up;
+   }
vma->vm_flags |= VM_NOZEROPAGE;
+   }
+   mm->def_flags &= ~VM_MERGEABLE;
mm->def_flags |= VM_NOZEROPAGE;
 
walk.mm = mm;
@@ -1348,6 +1358,7 @@ void s390_enable_skey(void)
 
 out_up:
 	up_write(&mm->mmap_sem);
+   return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
 
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 2/4] mm: introduce new VM_NOZEROPAGE flag

2014-10-17 Thread Dominik Dingel
Add a new vma flag to allow an architecture to disable the backing
of non-present, anonymous pages with the read-only empty zero page.

Signed-off-by: Dominik Dingel 
Acked-by: Christian Borntraeger 
Signed-off-by: Martin Schwidefsky 
---
 include/linux/mm.h | 13 +++--
 mm/huge_memory.c   |  2 +-
 mm/memory.c|  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd33ae2..8f09c91 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -113,7 +113,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSDOWN   0x0100  /* general info on the segment */
 #define VM_PFNMAP  0x0400  /* Page-ranges managed without "struct 
page", just pure PFN */
 #define VM_DENYWRITE   0x0800  /* ETXTBSY on write attempts.. */
-
+#define VM_NOZEROPAGE  0x1000  /* forbid new zero page mappings */
 #define VM_LOCKED  0x2000
 #define VM_IO   0x4000 /* Memory mapped I/O or similar */
 
@@ -179,7 +179,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
 /* This mask defines which mm->def_flags a process can inherit its parent */
-#define VM_INIT_DEF_MASK   VM_NOHUGEPAGE
+#define VM_INIT_DEF_MASK   (VM_NOHUGEPAGE | VM_NOZEROPAGE)
 
 /*
  * mapping from the currently active vm_flags protection bits (the
@@ -1293,6 +1293,15 @@ static inline int stack_guard_page_end(struct 
vm_area_struct *vma,
!vma_growsup(vma->vm_next, addr);
 }
 
+static inline int vma_forbids_zeropage(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_NOZEROPAGE
+   return vma->vm_flags & VM_NOZEROPAGE;
+#else
+   return 0;
+#endif
+}
+
 extern struct task_struct *task_of_stack(struct task_struct *task,
struct vm_area_struct *vma, bool in_group);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de98415..c271265 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -805,7 +805,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_OOM;
if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
-   if (!(flags & FAULT_FLAG_WRITE) &&
+   if (!(flags & FAULT_FLAG_WRITE) && !vma_forbids_zeropage(vma) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
diff --git a/mm/memory.c b/mm/memory.c
index 64f82aa..1859b2b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2640,7 +2640,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct 
vm_area_struct *vma,
return VM_FAULT_SIGBUS;
 
/* Use the zero-page for reads */
-   if (!(flags & FAULT_FLAG_WRITE)) {
+   if (!(flags & FAULT_FLAG_WRITE) && !vma_forbids_zeropage(vma)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-- 
1.8.5.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/


[PATCH 1/4] s390/mm: refactor global pgste updates

2014-10-17 Thread Dominik Dingel
Replace the s390 specific page table walker for the pgste updates
with a call to the common code walk_page_range function.
There are now two pte modification functions, one for the reset
of the CMMA state and another one for the initialization of the
storage keys.

Signed-off-by: Dominik Dingel 
Signed-off-by: Martin Schwidefsky 
---
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   1 +
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/mm/pgtable.c  | 153 ++--
 4 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 9e18a61..120e126 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned 
long);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *);
 
-void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long,
-   bool init_skey);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
  unsigned long key, bool nq);
 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5efb2fe..1e991f6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1750,6 +1750,7 @@ extern int vmem_add_mapping(unsigned long start, unsigned 
long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern void s390_enable_skey(void);
+extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
  * No page table caches to initialise
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 81b0e11..7a33c11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -281,7 +281,7 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct 
kvm_device_attr *attr)
case KVM_S390_VM_MEM_CLR_CMMA:
 	mutex_lock(&kvm->lock);
 	idx = srcu_read_lock(&kvm->srcu);
-	page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false);
+	s390_reset_cmma(kvm->arch.gmap->mm);
 	srcu_read_unlock(&kvm->srcu, idx);
 	mutex_unlock(&kvm->lock);
ret = 0;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5404a62..ab55ba8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -885,99 +885,6 @@ static inline void page_table_free_pgste(unsigned long 
*table)
__free_page(page);
 }
 
-static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t 
*pmd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   pte_t *start_pte, *pte;
-   spinlock_t *ptl;
-   pgste_t pgste;
-
-	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-   pte = start_pte;
-   do {
-   pgste = pgste_get_lock(pte);
-   pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
-   if (init_skey) {
-   unsigned long address;
-
-   pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
- PGSTE_GR_BIT | PGSTE_GC_BIT);
-
-   /* skip invalid and not writable pages */
-   if (pte_val(*pte) & _PAGE_INVALID ||
-   !(pte_val(*pte) & _PAGE_WRITE)) {
-   pgste_set_unlock(pte, pgste);
-   continue;
-   }
-
-   address = pte_val(*pte) & PAGE_MASK;
-   page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
-   }
-   pgste_set_unlock(pte, pgste);
-   } while (pte++, addr += PAGE_SIZE, addr != end);
-   pte_unmap_unlock(start_pte, ptl);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t 
*pud,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pmd_t *pmd;
-
-   pmd = pmd_offset(pud, addr);
-   do {
-   next = pmd_addr_end(addr, end);
-   if (pmd_none_or_clear_bad(pmd))
-   continue;
-   next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
-   } while (pmd++, addr = next, addr != end);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t 
*pgd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pud_t *pud;
-
-   pud = pud_offset(pgd, addr);
-   do {
-   next = pud_addr_end(addr, end);
-   if (pud_none_or_clear_bad(pud))
-   continue;
-

[PATCH 1/4] s390/mm: recfactor global pgste updates

2014-10-17 Thread Dominik Dingel
Replace the s390 specific page table walker for the pgste updates
with a call to the common code walk_page_range function.
There are now two pte modification functions, one for the reset
of the CMMA state and another one for the initialization of the
storage keys.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
---
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   1 +
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/mm/pgtable.c  | 153 ++--
 4 files changed, 56 insertions(+), 102 deletions(-)

diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index 9e18a61..120e126 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -22,8 +22,6 @@ unsigned long *page_table_alloc(struct mm_struct *, unsigned 
long);
 void page_table_free(struct mm_struct *, unsigned long *);
 void page_table_free_rcu(struct mmu_gather *, unsigned long *);
 
-void page_table_reset_pgste(struct mm_struct *, unsigned long, unsigned long,
-   bool init_skey);
 int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
  unsigned long key, bool nq);
 
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 5efb2fe..1e991f6a 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1750,6 +1750,7 @@ extern int vmem_add_mapping(unsigned long start, unsigned 
long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern void s390_enable_skey(void);
+extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
  * No page table caches to initialise
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 81b0e11..7a33c11 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -281,7 +281,7 @@ static int kvm_s390_mem_control(struct kvm *kvm, struct kvm_device_attr *attr)
	case KVM_S390_VM_MEM_CLR_CMMA:
		mutex_lock(&kvm->lock);
		idx = srcu_read_lock(&kvm->srcu);
-		page_table_reset_pgste(kvm->arch.gmap->mm, 0, TASK_SIZE, false);
+		s390_reset_cmma(kvm->arch.gmap->mm);
		srcu_read_unlock(&kvm->srcu, idx);
		mutex_unlock(&kvm->lock);
ret = 0;
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 5404a62..ab55ba8 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -885,99 +885,6 @@ static inline void page_table_free_pgste(unsigned long *table)
__free_page(page);
 }
 
-static inline unsigned long page_table_reset_pte(struct mm_struct *mm, pmd_t *pmd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   pte_t *start_pte, *pte;
-   spinlock_t *ptl;
-   pgste_t pgste;
-
-	start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-   pte = start_pte;
-   do {
-   pgste = pgste_get_lock(pte);
-		pgste_val(pgste) &= ~_PGSTE_GPS_USAGE_MASK;
-   if (init_skey) {
-   unsigned long address;
-
-			pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
- PGSTE_GR_BIT | PGSTE_GC_BIT);
-
-   /* skip invalid and not writable pages */
-			if (pte_val(*pte) & _PAGE_INVALID ||
-			    !(pte_val(*pte) & _PAGE_WRITE)) {
-   pgste_set_unlock(pte, pgste);
-   continue;
-   }
-
-			address = pte_val(*pte) & PAGE_MASK;
-   page_set_storage_key(address, PAGE_DEFAULT_KEY, 1);
-   }
-   pgste_set_unlock(pte, pgste);
-   } while (pte++, addr += PAGE_SIZE, addr != end);
-   pte_unmap_unlock(start_pte, ptl);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pmd(struct mm_struct *mm, pud_t *pud,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pmd_t *pmd;
-
-   pmd = pmd_offset(pud, addr);
-   do {
-   next = pmd_addr_end(addr, end);
-   if (pmd_none_or_clear_bad(pmd))
-   continue;
-   next = page_table_reset_pte(mm, pmd, addr, next, init_skey);
-   } while (pmd++, addr = next, addr != end);
-
-   return addr;
-}
-
-static inline unsigned long page_table_reset_pud(struct mm_struct *mm, pgd_t *pgd,
-   unsigned long addr, unsigned long end, bool init_skey)
-{
-   unsigned long next;
-   pud_t *pud;
-
-   pud = pud_offset(pgd, addr);
-   do {
-   next = pud_addr_end(addr, end);
-   if (pud_none_or_clear_bad(pud))
-   continue

[PATCH 2/4] mm: introduce new VM_NOZEROPAGE flag

2014-10-17 Thread Dominik Dingel
Add a new vma flag to allow an architecture to disable the backing
of non-present, anonymous pages with the read-only empty zero page.
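
As a rough usage sketch (the actual s390 wiring comes with patch 3/4 of this
series, and the helper name below is made up for illustration): an
architecture that selects CONFIG_NOZEROPAGE marks its vmas and mm->def_flags,
after which the generic fault paths skip the zero page via
vma_forbids_zeropage():

	/* illustrative helper only; not part of this patch */
	static void forbid_zero_pages(struct mm_struct *mm)
	{
		struct vm_area_struct *vma;

		down_write(&mm->mmap_sem);
		/* existing mappings: no new zero pages from now on */
		for (vma = mm->mmap; vma; vma = vma->vm_next)
			vma->vm_flags |= VM_NOZEROPAGE;
		/* vmas created later inherit the flag via def_flags */
		mm->def_flags |= VM_NOZEROPAGE;
		up_write(&mm->mmap_sem);
	}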

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
Acked-by: Christian Borntraeger borntrae...@de.ibm.com
Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
---
 include/linux/mm.h | 13 +++--
 mm/huge_memory.c   |  2 +-
 mm/memory.c|  2 +-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index cd33ae2..8f09c91 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -113,7 +113,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_GROWSDOWN	0x00000100	/* general info on the segment */
 #define VM_PFNMAP	0x00000400	/* Page-ranges managed without "struct page", just pure PFN */
 #define VM_DENYWRITE	0x00000800	/* ETXTBSY on write attempts.. */
-
+#define VM_NOZEROPAGE	0x00001000	/* forbid new zero page mappings */
 #define VM_LOCKED	0x00002000
 #define VM_IO		0x00004000	/* Memory mapped I/O or similar */
 
@@ -179,7 +179,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP)
 
 /* This mask defines which mm->def_flags a process can inherit its parent */
-#define VM_INIT_DEF_MASK   VM_NOHUGEPAGE
+#define VM_INIT_DEF_MASK   (VM_NOHUGEPAGE | VM_NOZEROPAGE)
 
 /*
  * mapping from the currently active vm_flags protection bits (the
@@ -1293,6 +1293,15 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma,
		!vma_growsup(vma->vm_next, addr);
 }
 
+static inline int vma_forbids_zeropage(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_NOZEROPAGE
+	return vma->vm_flags & VM_NOZEROPAGE;
+#else
+   return 0;
+#endif
+}
+
 extern struct task_struct *task_of_stack(struct task_struct *task,
struct vm_area_struct *vma, bool in_group);
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index de98415..c271265 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -805,7 +805,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
	if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
return VM_FAULT_OOM;
-	if (!(flags & FAULT_FLAG_WRITE) &&
+	if (!(flags & FAULT_FLAG_WRITE) && !vma_forbids_zeropage(vma) &&
transparent_hugepage_use_zero_page()) {
spinlock_t *ptl;
pgtable_t pgtable;
diff --git a/mm/memory.c b/mm/memory.c
index 64f82aa..1859b2b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2640,7 +2640,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_SIGBUS;
 
/* Use the zero-page for reads */
-	if (!(flags & FAULT_FLAG_WRITE)) {
+	if (!(flags & FAULT_FLAG_WRITE) && !vma_forbids_zeropage(vma)) {
		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
						vma->vm_page_prot));
		page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-- 
1.8.5.5



[PATCH 0/4] mm: new flag to forbid zero page mappings for a vma

2014-10-17 Thread Dominik Dingel
s390 has the special notion of storage keys: per-page flags that are
associated with physical pages and live outside of directly addressable
memory. These storage keys can be queried and changed with a special set of
instructions.
These instructions behave quite nicely under virtualization. If there is:
- an invalid pte, the instructions work on memory reserved in the host page
  table
- a valid pte, the instructions work with the real storage key

Thanks to Martin's software reference and dirty bit tracking, the kernel itself
no longer issues any storage key instructions, as a software-based approach is
taken instead. Distributions in the wild, however, do still use them.

However, for virtualized guests we still have a problem with guest pages mapped
to zero pages and with kernel samepage merging (KSM). With either one, multiple
guest pages will point to the same physical page and thus share the same
storage key.

Let's fix this by introducing a new flag which forbids new zero page mappings.
If the guest issues a storage key related instruction, we flag all vmas, drop
existing zero page mappings, and unmerge the guest memory.

Dominik Dingel (4):
  s390/mm: refactor global pgste updates
  mm: introduce new VM_NOZEROPAGE flag
  s390/mm: prevent and break zero page mappings in case of storage keys
  s390/mm: disable KSM for storage key enabled pages

 arch/s390/Kconfig   |   3 +
 arch/s390/include/asm/pgalloc.h |   2 -
 arch/s390/include/asm/pgtable.h |   3 +-
 arch/s390/kvm/kvm-s390.c|   2 +-
 arch/s390/kvm/priv.c|  17 ++--
 arch/s390/mm/pgtable.c  | 181 ++--
 include/linux/mm.h  |  13 ++-
 mm/huge_memory.c|   2 +-
 mm/memory.c |   2 +-
 9 files changed, 112 insertions(+), 113 deletions(-)

-- 
1.8.5.5



[PATCH 4/4] s390/mm: disable KSM for storage key enabled pages

2014-10-17 Thread Dominik Dingel
When storage keys are enabled, unmerge already merged pages and prevent
new pages from being merged.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
Acked-by: Christian Borntraeger borntrae...@de.ibm.com
Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
---
 arch/s390/include/asm/pgtable.h |  2 +-
 arch/s390/kvm/priv.c| 17 -
 arch/s390/mm/pgtable.c  | 15 +--
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 1e991f6a..a5362e4 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1749,7 +1749,7 @@ static inline pte_t mk_swap_pte(unsigned long type, unsigned long offset)
 extern int vmem_add_mapping(unsigned long start, unsigned long size);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
-extern void s390_enable_skey(void);
+extern int s390_enable_skey(void);
 extern void s390_reset_cmma(struct mm_struct *mm);
 
 /*
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index f89c1cd..e0967fd 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -156,21 +156,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
return 0;
 }
 
-static void __skey_check_enable(struct kvm_vcpu *vcpu)
+static int __skey_check_enable(struct kvm_vcpu *vcpu)
 {
+   int rc = 0;
	if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
-   return;
+   return rc;
 
-   s390_enable_skey();
+   rc = s390_enable_skey();
trace_kvm_s390_skey_related_inst(vcpu);
	vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+   return rc;
 }
 
 
 static int handle_skey(struct kvm_vcpu *vcpu)
 {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
 
+   if (rc)
+   return rc;
	vcpu->stat.instruction_storage_key++;
 
	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
@@ -692,7 +696,10 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
}
 
	if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) {
-   __skey_check_enable(vcpu);
+   int rc = __skey_check_enable(vcpu);
+
+   if (rc)
+   return rc;
		if (set_guest_storage_key(current->mm, useraddr,
					  vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
					  vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index 6321692..b3311c1 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -18,6 +18,8 @@
 #include <linux/rcupdate.h>
 #include <linux/slab.h>
 #include <linux/swapops.h>
+#include <linux/ksm.h>
+#include <linux/mman.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -1328,18 +1330,26 @@ static int __s390_enable_skey(pte_t *pte, unsigned long addr,
return 0;
 }
 
-void s390_enable_skey(void)
+int s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
+   int rc = 0;
 
	down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
 
-	for (vma = mm->mmap; vma; vma = vma->vm_next)
+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
+		if (ksm_madvise(vma, vma->vm_start, vma->vm_end,
+				MADV_UNMERGEABLE, &vma->vm_flags)) {
+			rc = -ENOMEM;
+			goto out_up;
+		}
		vma->vm_flags |= VM_NOZEROPAGE;
+	}
+	mm->def_flags &= ~VM_MERGEABLE;
	mm->def_flags |= VM_NOZEROPAGE;
 
walk.mm = mm;
@@ -1348,6 +1358,7 @@ void s390_enable_skey(void)
 
 out_up:
	up_write(&mm->mmap_sem);
+   return rc;
 }
 EXPORT_SYMBOL_GPL(s390_enable_skey);
 
-- 
1.8.5.5



[PATCH 3/4] s390/mm: prevent and break zero page mappings in case of storage keys

2014-10-17 Thread Dominik Dingel
As soon as storage keys are enabled, we need to get rid of zero page
mappings to prevent inconsistencies between storage keys and pgste.

Otherwise the following data corruption could happen:
1) guest enables storage keys
2) guest sets the storage key for a not yet mapped page X
   - the change goes to the PGSTE
3) guest reads from page X
   - as X was not dirty before, the page will be zero page backed;
     the storage key from the PGSTE for X goes to the storage key of
     the zero page
4) guest sets the storage key for a not yet mapped page Y (same logic
   as above)
5) guest reads from page Y
   - as Y was not dirty before, the page will be zero page backed;
     the storage key from the PGSTE for Y goes to the storage key of
     the zero page, overwriting the storage key for X

While holding the mmap_sem, we are safe against changes to entries we
have already fixed up. As sske and host large pages are also mutually
exclusive, we do not even need to retry the fixup_user_fault.

Signed-off-by: Dominik Dingel din...@linux.vnet.ibm.com
Acked-by: Christian Borntraeger borntrae...@de.ibm.com
Signed-off-by: Martin Schwidefsky schwidef...@de.ibm.com
---
 arch/s390/Kconfig  |  3 +++
 arch/s390/mm/pgtable.c | 15 +++
 2 files changed, 18 insertions(+)

diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 05c78bb..4e04e63 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -1,6 +1,9 @@
 config MMU
def_bool y
 
+config NOZEROPAGE
+   def_bool y
+
 config ZONE_DMA
def_bool y
 
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index ab55ba8..6321692 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -1309,6 +1309,15 @@ static int __s390_enable_skey(pte_t *pte, unsigned long addr,
pgste_t pgste;
 
pgste = pgste_get_lock(pte);
+   /*
+* Remove all zero page mappings,
+* after establishing a policy to forbid zero page mappings
+* following faults for that page will get fresh anonymous pages
+*/
+   if (is_zero_pfn(pte_pfn(*pte))) {
+		ptep_flush_direct(walk->mm, addr, pte);
+   pte_val(*pte) = _PAGE_INVALID;
+   }
/* Clear storage key */
	pgste_val(pgste) &= ~(PGSTE_ACC_BITS | PGSTE_FP_BIT |
  PGSTE_GR_BIT | PGSTE_GC_BIT);
@@ -1323,10 +1332,16 @@ void s390_enable_skey(void)
 {
struct mm_walk walk = { .pte_entry = __s390_enable_skey };
	struct mm_struct *mm = current->mm;
+   struct vm_area_struct *vma;
 
	down_write(&mm->mmap_sem);
if (mm_use_skey(mm))
goto out_up;
+
+	for (vma = mm->mmap; vma; vma = vma->vm_next)
+		vma->vm_flags |= VM_NOZEROPAGE;
+	mm->def_flags |= VM_NOZEROPAGE;
+
walk.mm = mm;
	walk_page_range(0, TASK_SIZE, &walk);
	mm->context.use_skey = 1;
-- 
1.8.5.5



  1   2   >