[patch 6/6] mm: core remove PageReserved (take 2)
Nick Piggin wrote: 6/6 Actually I think Hugh gave me some feedback about the introduced `print_invalid_pfn` function, which I ignored. So here is patch 6 again, with print_invalid_pfn renamed invalid_pfn, and using a macro to alleviate the requirement of passing in the function name by hand. Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearing of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and it is difficult to handle nicely - difficult but not impossible, it could be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed. Many thanks to Hugh Dickins for input. 
Signed-off-by: Nick Piggin <[EMAIL PROTECTED]> Index: linux-2.6/include/linux/mm.h === --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void #define VM_DONTCOPY0x0002 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x0004 /* Cannot expand with mremap() */ -#define VM_RESERVED0x0008 /* Don't unmap it from swap_out */ +#define VM_RESERVED0x0008 /* Pages and ptes in region aren't managed with regular pagecache or rmap routines */ + #define VM_ACCOUNT 0x0010 /* Is a VM accounted object */ #define VM_HUGETLB 0x0040 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x0080 /* Is non-linear (remap_file_pages) */ @@ -337,7 +338,7 @@ static inline void get_page(struct page static inline void put_page(struct page *page) { - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -711,6 +712,9 @@ void install_arg_page(struct vm_area_str int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +#define invalid_pfn(pte, vm_flags, vaddr) \ + __invalid_pfn(__FUNCTION__, pte, vm_flags, vaddr) +void __invalid_pfn(const char *, pte_t, unsigned long, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); Index: linux-2.6/mm/madvise.c === --- linux-2.6.orig/mm/madvise.c +++ linux-2.6/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_a unsigned long start, unsigned long end) { *prev = vma; - if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { Index: linux-2.6/mm/memory.c === --- linux-2.6.orig/mm/memory.c +++ linux-2.6/mm/memory.c @@ -333,6 +333,21 @@ out: } /* + * This function is called to print an error when a 
pte in a + * !VM_RESERVED region is found pointing to an invalid pfn (which + * is an error). + * + * The calling function must still handle the error. + */ +void __invalid_pfn(const char *errfunc, pte_t pte, + unsigned long vm_flags, unsigned long vaddr) +{ + printk(KERN_ERR "%s: pte does not point to valid memory. " + "process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n", + errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr); +} + +/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. @@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s spin_unlock(&mmlist_lock); } } - set_pte_at(dst_mm, addr, dst_pte, pte); - return; + goto out_set_pte; } + /* If the region is VM_RESERVED, the mapping is not +* mapped via rmap - duplicate the pte as is. +*/ + if (vm_flags & VM_RESERVED) + goto out_set_pte; + + /* If the pte points outside of valid
[patch 6/6] mm: core remove PageReserved
6/6 Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearning of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and is difficult to handle nicely - difficult but not impossible, it could be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed. Many thanks to Hugh Dickins for input. 
Signed-off-by: Nick Piggin <[EMAIL PROTECTED]> Index: linux-2.6/include/linux/mm.h === --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void #define VM_DONTCOPY0x0002 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x0004 /* Cannot expand with mremap() */ -#define VM_RESERVED0x0008 /* Don't unmap it from swap_out */ +#define VM_RESERVED0x0008 /* Pages and ptes in region aren't managed with regular pagecache or rmap routines */ + #define VM_ACCOUNT 0x0010 /* Is a VM accounted object */ #define VM_HUGETLB 0x0040 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x0080 /* Is non-linear (remap_file_pages) */ @@ -337,7 +338,7 @@ static inline void get_page(struct page static inline void put_page(struct page *page) { - if (!PageReserved(page) && put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -711,6 +712,7 @@ void install_arg_page(struct vm_area_str int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +void print_invalid_pfn(const char *, pte_t, unsigned long, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); Index: linux-2.6/mm/madvise.c === --- linux-2.6.orig/mm/madvise.c +++ linux-2.6/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_a unsigned long start, unsigned long end) { *prev = vma; - if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma)) + if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { Index: linux-2.6/mm/memory.c === --- linux-2.6.orig/mm/memory.c +++ linux-2.6/mm/memory.c @@ -333,6 +333,21 @@ out: } /* + * This function is called to print an error when a pte in a + * !VM_RESERVED region is found pointing to an invalid pfn (which + * is an error. 
+ * + * The calling function must still handle the error. + */ +void print_invalid_pfn(const char *errfunc, pte_t pte, + unsigned long vm_flags, unsigned long vaddr) +{ + printk(KERN_ERR "%s: pte does not point to valid memory. " + "process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n", + errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr); +} + +/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. @@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s spin_unlock(&mmlist_lock); } } - set_pte_at(dst_mm, addr, dst_pte, pte); - return; + goto out_set_pte; } + /* If the region is VM_RESERVED, the mapping is not +* mapped via rmap - duplicate the pte as is. +*/ + if (vm_flags & VM_RESERVED) + goto out_set_pte; + + /* If the pte points outside of valid memory but +* the region is not VM_RESERVED, we have a problem. +*/ pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the -* mapping is assumed to be good, meaningful -* and not mapped via rmap - duplicate the -* mapping as is. -*/ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - -
[patch 6/6] mm: core remove PageReserved
6/6 Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearning of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and is difficult to handle nicely - difficult but not impossible, it could be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed. Many thanks to Hugh Dickins for input. 
Signed-off-by: Nick Piggin [EMAIL PROTECTED] Index: linux-2.6/include/linux/mm.h === --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void #define VM_DONTCOPY0x0002 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x0004 /* Cannot expand with mremap() */ -#define VM_RESERVED0x0008 /* Don't unmap it from swap_out */ +#define VM_RESERVED0x0008 /* Pages and ptes in region aren't managed with regular pagecache or rmap routines */ + #define VM_ACCOUNT 0x0010 /* Is a VM accounted object */ #define VM_HUGETLB 0x0040 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x0080 /* Is non-linear (remap_file_pages) */ @@ -337,7 +338,7 @@ static inline void get_page(struct page static inline void put_page(struct page *page) { - if (!PageReserved(page) put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -711,6 +712,7 @@ void install_arg_page(struct vm_area_str int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +void print_invalid_pfn(const char *, pte_t, unsigned long, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); Index: linux-2.6/mm/madvise.c === --- linux-2.6.orig/mm/madvise.c +++ linux-2.6/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_a unsigned long start, unsigned long end) { *prev = vma; - if ((vma-vm_flags VM_LOCKED) || is_vm_hugetlb_page(vma)) + if ((vma-vm_flags (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma)) return -EINVAL; if (unlikely(vma-vm_flags VM_NONLINEAR)) { Index: linux-2.6/mm/memory.c === --- linux-2.6.orig/mm/memory.c +++ linux-2.6/mm/memory.c @@ -333,6 +333,21 @@ out: } /* + * This function is called to print an error when a pte in a + * !VM_RESERVED region is found pointing to an invalid pfn (which + * is an error. 
+ * + * The calling function must still handle the error. + */ +void print_invalid_pfn(const char *errfunc, pte_t pte, + unsigned long vm_flags, unsigned long vaddr) +{ + printk(KERN_ERR "%s: pte does not point to valid memory. " + "process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n", + errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr); +} + +/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. @@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s spin_unlock(&mmlist_lock); } } - set_pte_at(dst_mm, addr, dst_pte, pte); - return; + goto out_set_pte; } + /* If the region is VM_RESERVED, the mapping is not +* mapped via rmap - duplicate the pte as is. +*/ + if (vm_flags & VM_RESERVED) + goto out_set_pte; + + /* If the pte points outside of valid memory but +* the region is not VM_RESERVED, we have a problem. +*/ pfn = pte_pfn(pte); - /* the pte points outside of valid memory, the -* mapping is assumed to be good, meaningful -* and not mapped via rmap - duplicate the -* mapping as is. -*/ - page = NULL; - if (pfn_valid(pfn)) - page = pfn_to_page(pfn); - - if (!page
[patch 6/6] mm: core remove PageReserved (take 2)
Nick Piggin wrote: 6/6 Actually I think Hugh gave me some feedback about the introduced `print_invalid_pfn` function, which I ignored. So here is patch 6 again, with print_invalid_pfn renamed invalid_pfn, and using a macro to alleviate the requirement of passing in the function name by hand. Remove PageReserved() calls from core code by tightening VM_RESERVED handling in mm/ to cover PageReserved functionality. PageReserved special casing is removed from get_page and put_page. All setting and clearning of PageReserved is retained, and it is now flagged in the page_alloc checks to help ensure we don't introduce any refcount based freeing of Reserved pages. MAP_PRIVATE, PROT_WRITE of VM_RESERVED regions is tentatively being deprecated. We never completely handled it correctly anyway, and is difficult to handle nicely - difficult but not impossible, it could be reintroduced in future if required (Hugh has a proof of concept). Once PageReserved() calls are removed from kernel/power/swsusp.c, and all arch/ and driver code, the Set and Clear calls, and the PG_reserved bit can be trivially removed. Last real user of PageReserved is swsusp, which uses PageReserved to determine whether a struct page points to valid memory or not. This still needs to be addressed. Many thanks to Hugh Dickins for input. 
Signed-off-by: Nick Piggin [EMAIL PROTECTED] Index: linux-2.6/include/linux/mm.h === --- linux-2.6.orig/include/linux/mm.h +++ linux-2.6/include/linux/mm.h @@ -156,7 +156,8 @@ extern unsigned int kobjsize(const void #define VM_DONTCOPY0x0002 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x0004 /* Cannot expand with mremap() */ -#define VM_RESERVED0x0008 /* Don't unmap it from swap_out */ +#define VM_RESERVED0x0008 /* Pages and ptes in region aren't managed with regular pagecache or rmap routines */ + #define VM_ACCOUNT 0x0010 /* Is a VM accounted object */ #define VM_HUGETLB 0x0040 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x0080 /* Is non-linear (remap_file_pages) */ @@ -337,7 +338,7 @@ static inline void get_page(struct page static inline void put_page(struct page *page) { - if (!PageReserved(page) put_page_testzero(page)) + if (put_page_testzero(page)) __page_cache_release(page); } @@ -711,6 +712,9 @@ void install_arg_page(struct vm_area_str int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +#define invalid_pfn(pte, vm_flags, vaddr) \ + __invalid_pfn(__FUNCTION__, pte, vm_flags, vaddr) +void __invalid_pfn(const char *, pte_t, unsigned long, unsigned long); int __set_page_dirty_buffers(struct page *page); int __set_page_dirty_nobuffers(struct page *page); Index: linux-2.6/mm/madvise.c === --- linux-2.6.orig/mm/madvise.c +++ linux-2.6/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_a unsigned long start, unsigned long end) { *prev = vma; - if ((vma-vm_flags VM_LOCKED) || is_vm_hugetlb_page(vma)) + if ((vma-vm_flags (VM_LOCKED|VM_RESERVED)) || is_vm_hugetlb_page(vma)) return -EINVAL; if (unlikely(vma-vm_flags VM_NONLINEAR)) { Index: linux-2.6/mm/memory.c === --- linux-2.6.orig/mm/memory.c +++ linux-2.6/mm/memory.c @@ -333,6 +333,21 @@ out: } /* + * This function is called to print an error when a pte in a + * 
!VM_RESERVED region is found pointing to an invalid pfn (which + * is an error). + * + * The calling function must still handle the error. + */ +void __invalid_pfn(const char *errfunc, pte_t pte, + unsigned long vm_flags, unsigned long vaddr) +{ + printk(KERN_ERR "%s: pte does not point to valid memory. " + "process = %s, pte = %08lx, vm_flags = %lx, vaddr = %lx\n", + errfunc, current->comm, (long)pte_val(pte), vm_flags, vaddr); +} + +/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. @@ -361,25 +376,29 @@ copy_one_pte(struct mm_struct *dst_mm, s spin_unlock(&mmlist_lock); } } - set_pte_at(dst_mm, addr, dst_pte, pte); - return; + goto out_set_pte; } + /* If the region is VM_RESERVED, the mapping is not +* mapped via rmap - duplicate the pte as is. +*/ + if (vm_flags & VM_RESERVED) + goto out_set_pte; + + /* If the pte points outside of valid memory