Re: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb

2005-09-08 Thread Adam Litke
Version 3 (Thu, 08 Sep 2005)
Organized the logic in hugetlb_pte_fault() by breaking the
  find_get_page()/alloc_huge_page() logic out into a separate function
Removed a few more paranoid checks
Fixed TLB flushing in a race case (thanks Yanmin Zhang)

Version 2 (Wed, 17 Aug 2005)
Removed spurious WARN_ON()
Patches added earlier in the series:
Check for p?d_none() in arch/i386/mm/hugetlbpage.c:huge_pte_offset()
Move i386 stale pte check into huge_pte_alloc()

Initial Post (Fri, 05 Aug 2005)

Below is a patch to implement demand faulting for huge pages.  The main
motivation for changing from prefaulting to demand faulting is so that
huge page memory areas can be allocated according to NUMA policy.

Thanks to the consolidated hugetlb code, switching the behavior requires
changing only one fault handler.  The bulk of the patch just moves the logic
from hugetlb_prefault() to hugetlb_pte_fault().
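
At a high level the new fault path looks like this (a simplified sketch of
the code in the patch below -- quota charging, the page cache lookup/retry,
and TLB flushing are omitted here):

int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
		unsigned long address, int write_access)
{
	pte_t *ptep;
	int ret = VM_FAULT_MINOR;

	spin_lock(&mm->page_table_lock);
	ptep = huge_pte_alloc(mm, address);
	if (!ptep)
		ret = VM_FAULT_SIGBUS;
	else if (pte_none(*ptep))
		/* look up or allocate the huge page and install the pte */
		ret = hugetlb_pte_fault(mm, vma, address, write_access);
	spin_unlock(&mm->page_table_lock);
	return ret;
}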

Diffed against 2.6.13-git6

Signed-off-by: Adam Litke <[EMAIL PROTECTED]>
---
 fs/hugetlbfs/inode.c|6 -
 include/linux/hugetlb.h |2 
 mm/hugetlb.c|  154 +---
 mm/memory.c |2 
 4 files changed, 98 insertions(+), 66 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
struct inode *inode = file->f_dentry->d_inode;
-   struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
int ret;
 
@@ -79,10 +78,7 @@ static int hugetlbfs_file_mmap(struct fi
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
 
-   ret = hugetlb_prefault(mapping, vma);
-   if (ret)
-   goto out;
-
+   ret = 0;
if (inode->i_size < len)
inode->i_size = len;
 out:
diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h
+++ current/include/linux/hugetlb.h
@@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
 struct page *alloc_huge_page(void);
 void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+   unsigned long address, int write_access);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -274,21 +274,22 @@ int copy_hugetlb_page_range(struct mm_st
 {
pte_t *src_pte, *dst_pte, entry;
struct page *ptepage;
-   unsigned long addr = vma->vm_start;
+   unsigned long addr;
unsigned long end = vma->vm_end;
 
-   while (addr < end) {
+   for (addr = vma->vm_start; addr < end; addr += HPAGE_SIZE) {
+   src_pte = huge_pte_offset(src, addr);
+   if (!src_pte || pte_none(*src_pte))
+   continue;
+   
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
-   src_pte = huge_pte_offset(src, addr);
-   BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
-   addr += HPAGE_SIZE;
}
return 0;
 
@@ -338,61 +339,6 @@ void zap_hugepage_range(struct vm_area_s
spin_unlock(&mm->page_table_lock);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-   struct mm_struct *mm = current->mm;
-   unsigned long addr;
-   int ret = 0;
-
-   WARN_ON(!is_vm_hugetlb_page(vma));
-   BUG_ON(vma->vm_start & ~HPAGE_MASK);
-   BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-   hugetlb_prefault_arch_hook(mm);
-
-   spin_lock(&mm->page_table_lock);
-   for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-   unsigned long idx;
-   pte_t *pte = huge_pte_alloc(mm, addr);
-   struct page *page;
-
-   if (!pte) {
-   ret = -ENOMEM;
-   goto out;
-   }
-
-   idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-   + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-   page = find_get_page(mapping, idx);
-   if (!page) {
-   /* charge the fs quota first */
-   if (hugetlb_get_quota(mapping)) {
-   ret = -ENOMEM;
-   goto out;
-   

RE: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb

2005-09-07 Thread Adam Litke
On Wed, 2005-09-07 at 10:33 +0800, Zhang, Yanmin wrote:
> >>-Original Message-
> >>From: [EMAIL PROTECTED]
> >>[mailto:[EMAIL PROTECTED] On Behalf Of Adam Litke
> >>Sent: Wednesday, September 07, 2005 5:59 AM
> >>To: linux-kernel@vger.kernel.org
> >>Cc: ADAM G. LITKE [imap]
> >>Subject: Re: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb
> 
> >>+retry:
> >>+   page = find_get_page(mapping, idx);
> >>+   if (!page) {
> >>+   /* charge the fs quota first */
> >>+   if (hugetlb_get_quota(mapping)) {
> >>+   ret = VM_FAULT_SIGBUS;
> >>+   goto out;
> >>+   }
> >>+   page = alloc_huge_page();
> >>+   if (!page) {
> >>+   hugetlb_put_quota(mapping);
> >>+   ret = VM_FAULT_SIGBUS;
> >>+   goto out;
> >>+   }
> >>+   if (add_to_page_cache(page, mapping, idx, GFP_ATOMIC)) {
> 
> Here you lost hugetlb_put_quota(mapping);

Whoops, thanks for catching that.

> >>+   put_page(page);
> >>+   goto retry;
> >>+   }
> >>+   unlock_page(page);
> 
> As for regular pages, the kernel usually unlocks mm->page_table_lock
> before find_get_page() and relocks it before setting the pte. Why doesn't
> the huge page fault path follow that style?

As far as I can tell, we should be able to do that for large pages as
well.  I'll give it a spin.
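
For reference, the retry block above with the missing hugetlb_put_quota()
added back would look roughly like this (an untested sketch of the fix, to
be folded into the next revision):

retry:
	page = find_get_page(mapping, idx);
	if (!page) {
		/* charge the fs quota first */
		if (hugetlb_get_quota(mapping)) {
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
		page = alloc_huge_page();
		if (!page) {
			hugetlb_put_quota(mapping);
			ret = VM_FAULT_SIGBUS;
			goto out;
		}
		if (add_to_page_cache(page, mapping, idx, GFP_ATOMIC)) {
			/* undo the quota charge and drop our new page
			 * before retrying the lookup */
			hugetlb_put_quota(mapping);
			put_page(page);
			goto retry;
		}
		unlock_page(page);
	}

Dropping mm->page_table_lock across this block, as you suggest, should work
too, but it would also mean re-checking pte_none() after re-taking the lock
in case another thread instantiated the pte in the meantime.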

-- 
Adam Litke - (agl at us.ibm.com)
IBM Linux Technology Center


RE: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb

2005-09-06 Thread Zhang, Yanmin
More comments below.

>>-Original Message-
>>From: [EMAIL PROTECTED]
>>[mailto:[EMAIL PROTECTED] On Behalf Of Adam Litke
>>Sent: Wednesday, September 07, 2005 5:59 AM
>>To: linux-kernel@vger.kernel.org
>>Cc: ADAM G. LITKE [imap]
>>Subject: Re: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb
>>
>>Below is a patch to implement demand faulting for huge pages.  The main
>>motivation for changing from prefaulting to demand faulting is so that
>>huge page memory areas can be allocated according to NUMA policy.

>>@@ -277,18 +277,20 @@ int copy_hugetlb_page_range(struct mm_st
>>  unsigned long addr = vma->vm_start;
>>  unsigned long end = vma->vm_end;
>>
>>- while (addr < end) {
>>+ for (; addr < end; addr += HPAGE_SIZE) {
>>+ src_pte = huge_pte_offset(src, addr);
>>+ if (!src_pte || pte_none(*src_pte))
>>+ continue;
>>+
>>  dst_pte = huge_pte_alloc(dst, addr);
>>  if (!dst_pte)
>>  goto nomem;
>>- src_pte = huge_pte_offset(src, addr);
>>- BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
>>+ BUG_ON(!src_pte);
Should this BUG_ON be deleted?

>>  entry = *src_pte;
>>  ptepage = pte_page(entry);
>>  get_page(ptepage);
>>  add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
>>  set_huge_pte_at(dst, addr, dst_pte, entry);
>>- addr += HPAGE_SIZE;
>>  }
>>  return 0;
>>


>>+int hugetlb_pte_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>>+ unsigned long address, int write_access)
>>+{
>>+ int ret = VM_FAULT_MINOR;
>>+ unsigned long idx;
>>+ pte_t *pte;
>>+ struct page *page;
>>+ struct address_space *mapping;
>>+
>>+ BUG_ON(vma->vm_start & ~HPAGE_MASK);
>>+ BUG_ON(vma->vm_end & ~HPAGE_MASK);
>>+ BUG_ON(!vma->vm_file);
>>+
>>+ pte = huge_pte_alloc(mm, address);
Why call huge_pte_alloc() again? hugetlb_fault() already calls it.


>>+ if (!pte) {
>>+ ret = VM_FAULT_SIGBUS;
>>+ goto out;
>>+ }
>>+ if (! pte_none(*pte))
>>+ goto flush;
>>+
>>+ mapping = vma->vm_file->f_mapping;
>>+ idx = ((address - vma->vm_start) >> HPAGE_SHIFT)
>>+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
>>+retry:
>>+ page = find_get_page(mapping, idx);
>>+ if (!page) {
>>+ /* charge the fs quota first */
>>+ if (hugetlb_get_quota(mapping)) {
>>+ ret = VM_FAULT_SIGBUS;
>>+ goto out;
>>+ }
>>+ page = alloc_huge_page();
>>+ if (!page) {
>>+ hugetlb_put_quota(mapping);
>>+ ret = VM_FAULT_SIGBUS;
>>+ goto out;
>>+ }
>>+ if (add_to_page_cache(page, mapping, idx, GFP_ATOMIC)) {
>>+ put_page(page);
>>+ goto retry;
>>+ }
>>+ unlock_page(page);
>>+ }
>>+ add_mm_counter(mm, rss, HPAGE_SIZE / PAGE_SIZE);
>>+ set_huge_pte_at(mm, address, pte, make_huge_pte(vma, page));
>>+flush:
>>+ flush_tlb_page(vma, address);
>>+out:
>>+ return ret;
>>+}
>>+
>>+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
>>+ unsigned long address, int write_access)
>>+{
>>+ pte_t *ptep;
>>+ int rc = VM_FAULT_MINOR;
>>+
>>+ spin_lock(&mm->page_table_lock);
>>+
>>+ ptep = huge_pte_alloc(mm, address & HPAGE_MASK);
The alignment is not needed. How about changing it to
ptep = huge_pte_alloc(mm, address)?

>>+ if (! ptep) {
>>+ rc = VM_FAULT_SIGBUS;
>>+ goto out;
>>+ }
>>+ if (pte_none(*ptep))
>>+ rc = hugetlb_pte_fault(mm, vma, address, write_access);
In hugetlb_pte_fault(), the TLB entry is flushed when !pte_none(*ptep), but
here it isn't. Why?

>>+out:
>>+ spin_unlock(&mm->page_table_lock);
>>+ return rc;
>>+}


RE: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb

2005-09-06 Thread Zhang, Yanmin
>>-Original Message-
>>From: [EMAIL PROTECTED]
>>[mailto:[EMAIL PROTECTED] On Behalf Of Adam Litke
>>Sent: Wednesday, September 07, 2005 5:59 AM
>>To: linux-kernel@vger.kernel.org
>>Cc: ADAM G. LITKE [imap]
>>Subject: Re: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb

>>+retry:
>>+ page = find_get_page(mapping, idx);
>>+ if (!page) {
>>+ /* charge the fs quota first */
>>+ if (hugetlb_get_quota(mapping)) {
>>+ ret = VM_FAULT_SIGBUS;
>>+ goto out;
>>+ }
>>+ page = alloc_huge_page();
>>+ if (!page) {
>>+ hugetlb_put_quota(mapping);
>>+ ret = VM_FAULT_SIGBUS;
>>+ goto out;
>>+ }
>>+ if (add_to_page_cache(page, mapping, idx, GFP_ATOMIC)) {

Here you lost hugetlb_put_quota(mapping);


>>+ put_page(page);
>>+ goto retry;
>>+ }
>>+ unlock_page(page);

As for regular pages, the kernel usually unlocks mm->page_table_lock
before find_get_page() and relocks it before setting the pte. Why doesn't
the huge page fault path follow that style?


Re: [PATCH 2/3 htlb-fault] Demand faulting for hugetlb

2005-09-06 Thread Adam Litke
Below is a patch to implement demand faulting for huge pages.  The main
motivation for changing from prefaulting to demand faulting is so that
huge page memory areas can be allocated according to NUMA policy.

Thanks to the consolidated hugetlb code, switching the behavior requires
changing only one fault handler.  The bulk of the patch just moves the logic
from hugetlb_prefault() to hugetlb_pte_fault().
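
As a purely hypothetical userspace illustration (not part of this patch;
assumes hugetlbfs mounted at /mnt/huge, 2MB huge pages, a two-node machine,
and libnuma's mbind()): with demand faulting the huge pages are not
instantiated until first touch, so a memory policy that is only applied
after mmap() can still influence placement, which is impossible when
everything is prefaulted at mmap() time.

#include <fcntl.h>
#include <numaif.h>		/* mbind(), MPOL_INTERLEAVE; link with -lnuma */
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	unsigned long hpage = 2UL * 1024 * 1024;	/* assumed huge page size */
	unsigned long len = 4 * hpage;
	unsigned long nodes = 0x3;			/* nodes 0 and 1 */
	unsigned long off;
	char *p;
	int fd = open("/mnt/huge/demo", O_CREAT | O_RDWR, 0600);

	if (fd < 0)
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/* set an interleave policy on the region before touching it */
	mbind(p, len, MPOL_INTERLEAVE, &nodes, sizeof(nodes) * 8, 0);
	for (off = 0; off < len; off += hpage)
		p[off] = 1;		/* each touch now demand-faults one huge page */
	munmap(p, len);
	close(fd);
	unlink("/mnt/huge/demo");
	return 0;
}

Whether alloc_huge_page() actually honors the policy is up to the rest of
this series; the point is only that the allocation is deferred to fault time.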

Diffed against 2.6.13-git6

Signed-off-by: Adam Litke <[EMAIL PROTECTED]>
---
 fs/hugetlbfs/inode.c|6 --
 include/linux/hugetlb.h |2 
 mm/hugetlb.c|  137 +++-
 mm/memory.c |2 
 4 files changed, 82 insertions(+), 65 deletions(-)
diff -upN reference/fs/hugetlbfs/inode.c current/fs/hugetlbfs/inode.c
--- reference/fs/hugetlbfs/inode.c
+++ current/fs/hugetlbfs/inode.c
@@ -48,7 +48,6 @@ int sysctl_hugetlb_shm_group;
 static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
struct inode *inode = file->f_dentry->d_inode;
-   struct address_space *mapping = inode->i_mapping;
loff_t len, vma_len;
int ret;
 
@@ -79,10 +78,7 @@ static int hugetlbfs_file_mmap(struct fi
if (!(vma->vm_flags & VM_WRITE) && len > inode->i_size)
goto out;
 
-   ret = hugetlb_prefault(mapping, vma);
-   if (ret)
-   goto out;
-
+   ret = 0;
if (inode->i_size < len)
inode->i_size = len;
 out:
diff -upN reference/include/linux/hugetlb.h current/include/linux/hugetlb.h
--- reference/include/linux/hugetlb.h
+++ current/include/linux/hugetlb.h
@@ -25,6 +25,8 @@ int is_hugepage_mem_enough(size_t);
 unsigned long hugetlb_total_pages(void);
 struct page *alloc_huge_page(void);
 void free_huge_page(struct page *);
+int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+   unsigned long address, int write_access);
 
 extern unsigned long max_huge_pages;
 extern const unsigned long hugetlb_zero, hugetlb_infinity;
diff -upN reference/mm/hugetlb.c current/mm/hugetlb.c
--- reference/mm/hugetlb.c
+++ current/mm/hugetlb.c
@@ -277,18 +277,20 @@ int copy_hugetlb_page_range(struct mm_st
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
 
-   while (addr < end) {
+   for (; addr < end; addr += HPAGE_SIZE) {
+   src_pte = huge_pte_offset(src, addr);
+   if (!src_pte || pte_none(*src_pte))
+   continue;
+   
dst_pte = huge_pte_alloc(dst, addr);
if (!dst_pte)
goto nomem;
-   src_pte = huge_pte_offset(src, addr);
-   BUG_ON(!src_pte || pte_none(*src_pte)); /* prefaulted */
+   BUG_ON(!src_pte);
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
add_mm_counter(dst, rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
-   addr += HPAGE_SIZE;
}
return 0;
 
@@ -338,61 +340,6 @@ void zap_hugepage_range(struct vm_area_s
spin_unlock(&mm->page_table_lock);
 }
 
-int hugetlb_prefault(struct address_space *mapping, struct vm_area_struct *vma)
-{
-   struct mm_struct *mm = current->mm;
-   unsigned long addr;
-   int ret = 0;
-
-   WARN_ON(!is_vm_hugetlb_page(vma));
-   BUG_ON(vma->vm_start & ~HPAGE_MASK);
-   BUG_ON(vma->vm_end & ~HPAGE_MASK);
-
-   hugetlb_prefault_arch_hook(mm);
-
-   spin_lock(&mm->page_table_lock);
-   for (addr = vma->vm_start; addr < vma->vm_end; addr += HPAGE_SIZE) {
-   unsigned long idx;
-   pte_t *pte = huge_pte_alloc(mm, addr);
-   struct page *page;
-
-   if (!pte) {
-   ret = -ENOMEM;
-   goto out;
-   }
-
-   idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
-   + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
-   page = find_get_page(mapping, idx);
-   if (!page) {
-   /* charge the fs quota first */
-   if (hugetlb_get_quota(mapping)) {
-   ret = -ENOMEM;
-   goto out;
-   }
-   page = alloc_huge_page();
-   if (!page) {
-   hugetlb_put_quota(mapping);
-   ret = -ENOMEM;
-   goto out;
-   }
-   ret = add_to_page_cache(page, mapping, idx, GFP_ATOMIC);
-   if (! ret) {
-   unlock_page(page);
-   } else {
-   hugetlb_put_quota(mapping);
-   free_huge_page(page);
-   goto 
