[patch 6/6] mm: core remove PageReserved (take 2)

2005-07-26 Thread Nick Piggin

Nick Piggin wrote:

6/6



Actually, I think Hugh gave me some feedback about the introduced
`print_invalid_pfn` function, which I ignored.

So here is patch 6 again, with print_invalid_pfn renamed to invalid_pfn,
and with a macro to avoid having to pass in the function name by hand.
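
To make the new interface concrete, here is a sketch (not one of the hunks
below -- the variable names are borrowed from the copy_one_pte context the
patch touches) of how a caller that finds a bad pfn in a !VM_RESERVED region
would now report it:

	pfn = pte_pfn(pte);
	if (unlikely(!pfn_valid(pfn))) {
		/* invalid_pfn() expands to __invalid_pfn(__FUNCTION__, ...),
		 * so the reporting site no longer spells out its own name. */
		invalid_pfn(pte, vm_flags, addr);
		/* the caller must still handle the error itself */
		goto out_set_pte;
	}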

Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.
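
For background (this is not part of the patch, and the names are made up for
illustration): the kind of mapping this covers is a driver mmap handler that
inserts device pages and marks the vma VM_RESERVED, e.g.

	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		/* hypothetical device memory; MYDRV_PHYS_BASE is made up */
		unsigned long pfn = MYDRV_PHYS_BASE >> PAGE_SHIFT;

		vma->vm_flags |= VM_RESERVED;
		return remap_pfn_range(vma, vma->vm_start, pfn,
				vma->vm_end - vma->vm_start,
				vma->vm_page_prot);
	}

With this patch the core VM keys off the VM_RESERVED flag for such regions
instead of checking PageReserved() on each page.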

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now
flagged in the page_alloc checks to help ensure we don't introduce
any refcount-based freeing of Reserved pages.
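
The page_alloc.c hunk is not among those quoted below (the mail is
truncated), but the kind of check meant is roughly the following sketch,
assuming the existing bad_page() helper -- the real hunk may differ:

	/* sketch only, not the actual hunk: a reserved page reaching the
	 * allocator's free path via refcounting now trips the bad-page check */
	if (page_count(page) != 0 || (page->flags & (1 << PG_reserved)))
		bad_page(__FUNCTION__, page);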

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated. We never handled it completely correctly anyway, and it is
difficult to handle nicely - difficult but not impossible; it could
be reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c and
from all arch/ and driver code, the Set and Clear calls and the
PG_reserved bit can be trivially removed.

The last real user of PageReserved is swsusp, which uses it to
determine whether a struct page points to valid memory. This still
needs to be addressed.

Many thanks to Hugh Dickins for input.

Signed-off-by: Nick Piggin <[EMAIL PROTECTED]>


Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void 
 
 #define VM_DONTCOPY    0x00020000  /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000  /* Cannot expand with mremap() */
-#define VM_RESERVED    0x00080000  /* Don't unmap it from swap_out */
+#define VM_RESERVED    0x00080000  /* Pages and ptes in region aren't managed with regular pagecache or rmap routines */
+
 #define VM_ACCOUNT     0x00100000  /* Is a VM accounted object */
 #define VM_HUGETLB     0x00400000  /* Huge TLB Page VM */
 #define VM_NONLINEAR   0x00800000  /* Is non-linear (remap_file_pages) */
@@ -337,7 +338,7 @@ static inline void get_page(struct page 
 
 static inline void put_page(struct page *page)
 {
-   if (!PageReserved(page) && put_page_testzero(page))
+   if (put_page_testzero(page))
__page_cache_release(page);
 }
 
@@ -711,6 +712,9 @@ void install_arg_page(struct vm_area_str
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+#define invalid_pfn(pte, vm_flags, vaddr)  \
+   __invalid_pfn(__FUNCTION__, pte, vm_flags, vaddr)
+void __invalid_pfn(const char *, pte_t, unsigned long, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
Index: linux-2.6/mm/madvise.c
===================================================================
--- linux-2.6.orig/mm/madvise.c
+++ linux-2.6/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_a
 unsigned long start, unsigned long end)
 {
*prev = vma;
-   if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+   if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma))
return -EINVAL;
 
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -333,6 +333,21 @@ out:
 }
 
 /*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error).
+ *
+ * The calling function must still handle the error.
+ */
+void __invalid_pfn(const char *errfunc, pte_t pte,
+   unsigned long vm_flags, unsigned long vaddr)
+{
+   printk(KERN_ERR "%s: pte does not point to valid memory. "
+   "process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n",
+   errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr);
+}
+
+/*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
@@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s
			spin_unlock(&mmlist_lock);
}
}
-   set_pte_at(dst_mm, addr, dst_pte, pte);
-   return;
+   goto out_set_pte;
}
 
+   /* If the region is VM_RESERVED, the mapping is not
+* mapped via rmap - duplicate the pte as is.
+*/
+   if (vm_flags & VM_RESERVED)
+   goto out_set_pte;
+
+   /* If the pte points outside of valid 

[patch 6/6] mm: core remove PageReserved

2005-07-26 Thread Nick Piggin

6/6

Remove PageReserved() calls from core code by tightening VM_RESERVED
handling in mm/ to cover PageReserved functionality.

PageReserved special casing is removed from get_page and put_page.

All setting and clearing of PageReserved is retained, and it is now
flagged in the page_alloc checks to help ensure we don't introduce
any refcount-based freeing of Reserved pages.

MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being
deprecated. We never handled it completely correctly anyway, and it is
difficult to handle nicely - difficult but not impossible; it could
be reintroduced in future if required (Hugh has a proof of concept).

Once PageReserved() calls are removed from kernel/power/swsusp.c and
from all arch/ and driver code, the Set and Clear calls and the
PG_reserved bit can be trivially removed.

The last real user of PageReserved is swsusp, which uses it to
determine whether a struct page points to valid memory. This still
needs to be addressed.

Many thanks to Hugh Dickins for input.

Signed-off-by: Nick Piggin <[EMAIL PROTECTED]>


Index: linux-2.6/include/linux/mm.h
===================================================================
--- linux-2.6.orig/include/linux/mm.h
+++ linux-2.6/include/linux/mm.h
@@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void 
 
 #define VM_DONTCOPY    0x00020000  /* Do not copy this vma on fork */
 #define VM_DONTEXPAND  0x00040000  /* Cannot expand with mremap() */
-#define VM_RESERVED    0x00080000  /* Don't unmap it from swap_out */
+#define VM_RESERVED    0x00080000  /* Pages and ptes in region aren't managed with regular pagecache or rmap routines */
+
 #define VM_ACCOUNT     0x00100000  /* Is a VM accounted object */
 #define VM_HUGETLB     0x00400000  /* Huge TLB Page VM */
 #define VM_NONLINEAR   0x00800000  /* Is non-linear (remap_file_pages) */
@@ -337,7 +338,7 @@ static inline void get_page(struct page 
 
 static inline void put_page(struct page *page)
 {
-   if (!PageReserved(page) && put_page_testzero(page))
+   if (put_page_testzero(page))
__page_cache_release(page);
 }
 
@@ -711,6 +712,7 @@ void install_arg_page(struct vm_area_str
 
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
 		int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_invalid_pfn(const char *, pte_t, unsigned long, unsigned long);
 
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
Index: linux-2.6/mm/madvise.c
===================================================================
--- linux-2.6.orig/mm/madvise.c
+++ linux-2.6/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_a
 unsigned long start, unsigned long end)
 {
*prev = vma;
-   if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+   if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma))
return -EINVAL;
 
if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
Index: linux-2.6/mm/memory.c
===================================================================
--- linux-2.6.orig/mm/memory.c
+++ linux-2.6/mm/memory.c
@@ -333,6 +333,21 @@ out:
 }
 
 /*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error).
+ *
+ * The calling function must still handle the error.
+ */
+void print_invalid_pfn(const char *errfunc, pte_t pte,
+   unsigned long vm_flags, unsigned long vaddr)
+{
+   printk(KERN_ERR "%s: pte does not point to valid memory. "
+   "process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n",
+   errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr);
+}
+
+/*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
  * covered by this vma.
@@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s
			spin_unlock(&mmlist_lock);
}
}
-   set_pte_at(dst_mm, addr, dst_pte, pte);
-   return;
+   goto out_set_pte;
}
 
+   /* If the region is VM_RESERVED, the mapping is not
+* mapped via rmap - duplicate the pte as is.
+*/
+   if (vm_flags & VM_RESERVED)
+   goto out_set_pte;
+
+   /* If the pte points outside of valid memory but
+* the region is not VM_RESERVED, we have a problem.
+*/
pfn = pte_pfn(pte);
-   /* the pte points outside of valid memory, the
-* mapping is assumed to be good, meaningful
-* and not mapped via rmap - duplicate the
-* mapping as is.
-*/
-   page = NULL;
-   if (pfn_valid(pfn))
-   page = pfn_to_page(pfn);
-
-   
