Re: [PATCH v3 04/10] thp: do_huge_pmd_wp_page(): handle huge zero page

2012-10-02 Thread Kirill A. Shutemov
On Tue, Oct 02, 2012 at 05:35:59PM +0200, Brice Goglin wrote:
> On 02/10/2012 17:19, Kirill A. Shutemov wrote:
> > From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
> >
> > On right access to huge zero page we alloc a new page and clear it.
> >
> 
> s/right/write/ ?

Oops... thanks.

-- 
 Kirill A. Shutemov


Re: [PATCH v3 04/10] thp: do_huge_pmd_wp_page(): handle huge zero page

2012-10-02 Thread Brice Goglin
On 02/10/2012 17:19, Kirill A. Shutemov wrote:
> From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>
>
> On right access to huge zero page we alloc a new page and clear it.
>

s/right/write/ ?

Brice



[PATCH v3 04/10] thp: do_huge_pmd_wp_page(): handle huge zero page

2012-10-02 Thread Kirill A. Shutemov
From: "Kirill A. Shutemov" <kirill.shute...@linux.intel.com>

On right access to huge zero page we alloc a new page and clear it.

In the fallback path we create a new page table and set the pte around the
fault address to point at the newly allocated page. All other ptes are set to
the normal zero page.

Signed-off-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
Reviewed-by: Andrea Arcangeli <aarca...@redhat.com>
---
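For illustration only (not part of the patch): a minimal userspace sketch of
the scenario described above, assuming a THP-enabled kernel with a 2M pmd
size. It maps a pmd-aligned anonymous region, touches it read-only first so
the kernel may back it with the huge zero page once this series is applied,
then writes one byte to trigger the write-protect fault handled here. The
MADV_HUGEPAGE hint, the posix_memalign allocation and the PMD_SIZE_GUESS
macro are assumptions of the sketch, not requirements of the patch.

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>

#define PMD_SIZE_GUESS	(2UL * 1024 * 1024)	/* assumed 2M pmd size (x86-64) */

int main(void)
{
	void *buf;
	volatile char *p;
	char c;

	/* pmd-aligned anonymous memory so it can be pmd-mapped */
	if (posix_memalign(&buf, PMD_SIZE_GUESS, PMD_SIZE_GUESS))
		return 1;
	p = buf;

#ifdef MADV_HUGEPAGE
	madvise(buf, PMD_SIZE_GUESS, MADV_HUGEPAGE);	/* hint only, may be ignored */
#endif

	/* read fault: with this series the pmd can point at the huge zero page */
	c = p[0];

	/* write fault: the kernel must now allocate and clear a real page,
	 * or fall back to 4k ptes around the fault address */
	p[123] = 1;

	printf("read %d, wrote offset 123\n", c);
	free(buf);
	return 0;
}

The thp_fault_alloc / thp_fault_fallback counters in /proc/vmstat show which
path the write took.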
 include/linux/mm.h |8 
 mm/huge_memory.c   |  102 
 mm/memory.c        |    7 
 3 files changed, 95 insertions(+), 22 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 311be90..179a41c 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -514,6 +514,14 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 }
 #endif
 
+#ifndef my_zero_pfn
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+   extern unsigned long zero_pfn;
+   return zero_pfn;
+}
+#endif
+
 /*
  * Multiple processes may "see" the same page. E.g. for untouched
  * mappings of /dev/null, all processes see the same page full of
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c8b157d..f30f39d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -868,6 +868,61 @@ pgtable_t get_pmd_huge_pte(struct mm_struct *mm)
return pgtable;
 }
 
+static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
+   struct vm_area_struct *vma, unsigned long address,
+   pmd_t *pmd, unsigned long haddr)
+{
+   pgtable_t pgtable;
+   pmd_t _pmd;
+   struct page *page;
+   int i, ret = 0;
+
+   page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+   if (!page) {
+   ret |= VM_FAULT_OOM;
+   goto out;
+   }
+
+   if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+   put_page(page);
+   ret |= VM_FAULT_OOM;
+   goto out;
+   }
+
+   clear_user_highpage(page, address);
+   __SetPageUptodate(page);
+
+   spin_lock(&mm->page_table_lock);
+   pmdp_clear_flush_notify(vma, haddr, pmd);
+   /* leave pmd empty until pte is filled */
+
+   pgtable = get_pmd_huge_pte(mm);
+   pmd_populate(mm, &_pmd, pgtable);
+
+   for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+   pte_t *pte, entry;
+   if (haddr == (address & PAGE_MASK)) {
+   entry = mk_pte(page, vma->vm_page_prot);
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+   page_add_new_anon_rmap(page, vma, haddr);
+   } else {
+   entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
+   entry = pte_mkspecial(entry);
+   }
+   pte = pte_offset_map(&_pmd, haddr);
+   VM_BUG_ON(!pte_none(*pte));
+   set_pte_at(mm, haddr, pte, entry);
+   pte_unmap(pte);
+   }
+   smp_wmb(); /* make pte visible before pmd */
+   pmd_populate(mm, pmd, pgtable);
+   spin_unlock(&mm->page_table_lock);
+
+   ret |= VM_FAULT_WRITE;
+out:
+   return ret;
+}
+
 static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma,
unsigned long address,
@@ -965,17 +1020,19 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
 {
int ret = 0;
-   struct page *page, *new_page;
+   struct page *page = NULL, *new_page;
unsigned long haddr;
 
VM_BUG_ON(!vma->anon_vma);
+   haddr = address & HPAGE_PMD_MASK;
+   if (is_huge_zero_pmd(orig_pmd))
+   goto alloc;
	spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
 
page = pmd_page(orig_pmd);
VM_BUG_ON(!PageCompound(page) || !PageHead(page));
-   haddr = address & HPAGE_PMD_MASK;
if (page_mapcount(page) == 1) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
@@ -987,7 +1044,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
get_page(page);
	spin_unlock(&mm->page_table_lock);
-
+alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
@@ -997,28 +1054,39 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
if (unlikely(!new_page)) {
count_vm_event(THP_FAULT_FALLBACK);
-   ret = do_huge_pmd_wp_page_fallback(mm, vma, address,
-  pmd, orig_pmd, page, haddr);
-   if (ret & VM_FAULT_OOM)
-   split_huge_page(page);
-   put_page(page);
+   if (is_huge_zero_pmd(orig_pmd)) {
+  
