Re: [PATCH 13/18] mm: numa: Scan pages with elevated page_mapcount

2013-07-16 Thread Sam Ben

On 07/15/2013 11:20 PM, Mel Gorman wrote:

Currently automatic NUMA balancing is unable to distinguish between false
shared versus private pages except by ignoring pages with an elevated


What does "false shared" mean here?


page_mapcount entirely. This avoids shared pages bouncing between the
nodes whose task is using them but that is ignored quite a lot of data.

This patch kicks away the training wheels in preparation for adding support
for identifying shared/private pages is now in place. The ordering is so
that the impact of the shared/private detection can be easily measured. Note
that the patch does not migrate shared, file-backed pages within vmas marked
VM_EXEC as these are generally shared library pages. Migrating such pages
is not beneficial as there is an expectation they are read-shared between
caches and iTLB and iCache pressure is generally low.
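As a side note for readers, the filtering rule described above can be condensed into a
small standalone sketch. This is illustrative C only, with a made-up page_info structure
standing in for struct page and vma state; it is not the kernel code in the diff below.

#include <stdbool.h>

/*
 * Illustrative sketch of the rule in the changelog: skip NUMA migration
 * only when a page is mapped by more than one process AND is file backed
 * AND the mapping is executable (i.e. it looks like a shared library).
 */
struct page_info {
	int mapcount;		/* number of processes mapping the page */
	bool file_backed;	/* page cache page rather than anonymous */
	bool vma_exec;		/* VM_EXEC set on the mapping */
};

static bool skip_numa_migration(const struct page_info *p)
{
	return p->mapcount != 1 && p->file_backed && p->vma_exec;
}

Everything else (anonymous pages, or file pages in non-executable mappings) now stays
eligible for migration even with an elevated mapcount.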

Signed-off-by: Mel Gorman <mgor...@suse.de>
---
  include/linux/migrate.h |  7 ---
  mm/memory.c |  7 ++-
  mm/migrate.c| 17 ++---
  mm/mprotect.c   |  4 +---
  4 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc..e7e26af 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -92,11 +92,12 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
  #endif /* CONFIG_MIGRATION */
  
  #ifdef CONFIG_NUMA_BALANCING

-extern int migrate_misplaced_page(struct page *page, int node);
-extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page,
+ struct vm_area_struct *vma, int node);
  extern bool migrate_ratelimited(int node);
  #else
-static inline int migrate_misplaced_page(struct page *page, int node)
+static inline int migrate_misplaced_page(struct page *page,
+struct vm_area_struct *vma, int node)
  {
return -EAGAIN; /* can't migrate now */
  }
diff --git a/mm/memory.c b/mm/memory.c
index ab933be..62ae8a7 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3586,7 +3586,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
  
  	/* Migrate to the requested node */

-   migrated = migrate_misplaced_page(page, target_nid);
+   migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated)
current_nid = target_nid;
  
@@ -3651,9 +3651,6 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,

page = vm_normal_page(vma, addr, pteval);
if (unlikely(!page))
continue;
-   /* only check non-shared pages */
-   if (unlikely(page_mapcount(page) != 1))
-   continue;
  
  		/*

 * Note that the NUMA fault is later accounted to either
@@ -3671,7 +3668,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
  
  		/* Migrate to the requested node */

pte_unmap_unlock(pte, ptl);
-   migrated = migrate_misplaced_page(page, target_nid);
+   migrated = migrate_misplaced_page(page, vma, target_nid);
if (migrated)
curr_nid = target_nid;
task_numa_fault(last_nid, curr_nid, 1, migrated);
diff --git a/mm/migrate.c b/mm/migrate.c
index 3bbaf5d..23f8122 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1579,7 +1579,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
   * node. Caller is expected to have an elevated reference count on
   * the page that will be dropped by this function before returning.
   */
-int migrate_misplaced_page(struct page *page, int node)
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+  int node)
  {
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
@@ -1587,10 +1588,11 @@ int migrate_misplaced_page(struct page *page, int node)
LIST_HEAD(migratepages);
  
  	/*

-* Don't migrate pages that are mapped in multiple processes.
-* TODO: Handle false sharing detection instead of this hammer
+* Don't migrate file pages that are mapped in multiple processes
+* with execute permissions as they are probably shared libraries.
 */
-   if (page_mapcount(page) != 1)
+   if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+   (vma->vm_flags & VM_EXEC))
goto out;
  
  	/*

@@ -1641,13 +1643,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_cache(page);
  
  	/*

-* Don't migrate pages that are mapped in multiple processes.
-* TODO: Handle false sharing detection instead of this hammer
-*/
-   if (page_mapcount(page) != 1)
-   goto out_dropref;
-
-   /*
 * Rate-limit the amount of data 

Re: [RFC 0/4] Transparent on-demand struct page initialization embedded in the buddy allocator

2013-07-16 Thread Sam Ben

On 07/12/2013 10:03 AM, Robin Holt wrote:

We have been working on this since we returned from shutdown and have
something to discuss now.  We restricted ourselves to 2MiB initialization
to keep the patch set a little smaller and more clear.

First, I think I want to propose getting rid of the page flag.  If I knew
of a concrete way to determine that the page has not been initialized,
this patch series would look different.  If there is no definitive
way to determine that the struct page has been initialized aside from
checking the entire page struct is zero, then I think I would suggest
we change the page flag to indicate the page has been initialized.
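As a rough illustration of the "check the entire page struct is zero" test mentioned
above, a byte-wise helper might look like the sketch below. It operates on an opaque
object rather than a real struct page, and whether an all-zero struct page is a reliable
"uninitialized" marker is exactly the open question in this mail.

#include <stdbool.h>
#include <stddef.h>

/* Illustrative only: return true if every byte of the object is zero. */
static bool object_is_all_zero(const void *obj, size_t size)
{
	const unsigned char *p = obj;
	size_t i;

	for (i = 0; i < size; i++)
		if (p[i])
			return false;
	return true;
}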

The heart of the problem as I see it comes from expand().  We nearly
always see a first reference to a struct page which is in the middle
of the 2MiB region.  Due to that access, the unlikely() check that was
originally proposed really ends up referencing a different page entirely.
We actually did not introduce an unlikely and refactor the patches to
make that unlikely inside a static inline function.  Also, given the
strong warning at the head of expand(), we did not feel experienced
enough to refactor it to make things always reference the 2MiB page
first.

With this patch, we did boot a 16TiB machine.  Without the patches,
the v3.10 kernel with the same configuration took 407 seconds for
free_all_bootmem.  With the patches and operating on 2MiB pages instead
of 1GiB, it took 26 seconds so performance was improved.  I have no feel
for how the 1GiB chunk size will perform.


How can I measure how much time is spent in free_all_bootmem()?
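One crude way to get that number is to wrap the free_all_bootmem() call site with
timestamps and print the delta, roughly as sketched below. This is hypothetical
instrumentation, not part of the posted series, and it assumes sched_clock() is already
usable at that point in boot.

	unsigned long long t0, t1;
	unsigned long pages;

	t0 = sched_clock();
	pages = free_all_bootmem();
	t1 = sched_clock();
	pr_info("free_all_bootmem: freed %lu pages in %llu ms\n",
		pages, div_u64(t1 - t0, NSEC_PER_MSEC));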



I am on vacation for the next three days so I am sorry in advance for
my infrequent or non-existent responses.


Signed-off-by: Robin Holt <h...@sgi.com>
Signed-off-by: Nate Zimmer <nzim...@sgi.com>
To: "H. Peter Anvin" <h...@zytor.com>
To: Ingo Molnar <mi...@kernel.org>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Cc: Linux MM <linux...@kvack.org>
Cc: Rob Landley <r...@landley.net>
Cc: Mike Travis <tra...@sgi.com>
Cc: Daniel J Blueman <dan...@numascale-asia.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Yinghai Lu <ying...@kernel.org>
Cc: Mel Gorman <mgor...@suse.de>


Re: [RFC 2/4] Have __free_pages_memory() free in larger chunks.

2013-07-16 Thread Sam Ben

Hi Robin,
On 07/12/2013 10:03 AM, Robin Holt wrote:

Currently, when free_all_bootmem() calls __free_pages_memory(), the
number of contiguous pages that __free_pages_memory() passes to the
buddy allocator is limited to BITS_PER_LONG.  In order to be able to


I fail to understand this. Why is the original chunk size BITS_PER_LONG pages?


free only the first page of a 2MiB chunk, we need that to be increased
to PTRS_PER_PMD.
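Regarding the question above: the BITS_PER_LONG batch size appears to be inherited from
the bootmem bitmap, where one word of the bitmap covers BITS_PER_LONG pages. To make the
size change concrete, here is a small standalone program that computes the old and new
chunk sizes for a typical x86_64 configuration (BITS_PER_LONG = 64, PTRS_PER_PMD = 512,
4 KiB pages); the constants are assumptions for the example, not read from a kernel build.

#include <stdio.h>

static int ilog2_ul(unsigned long v)
{
	int order = -1;

	while (v) {
		v >>= 1;
		order++;
	}
	return order;
}

int main(void)
{
	unsigned long bits_per_long = 64, ptrs_per_pmd = 512, page_kib = 4;
	int old_order = ilog2_ul(bits_per_long);	/* 6 */
	int new_order = ilog2_ul(ptrs_per_pmd);		/* 9 */

	printf("old: order %d = %lu pages = %lu KiB\n",
	       old_order, 1UL << old_order, (1UL << old_order) * page_kib);
	printf("new: order %d = %lu pages = %lu KiB\n",
	       new_order, 1UL << new_order, (1UL << new_order) * page_kib);
	return 0;
}

So the loop hands the buddy allocator 2 MiB (order-9) blocks instead of 256 KiB (order-6)
ones, which is what lets the series touch only the first struct page of each 2 MiB chunk.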

Signed-off-by: Robin Holt <h...@sgi.com>
Signed-off-by: Nate Zimmer <nzim...@sgi.com>
To: "H. Peter Anvin" <h...@zytor.com>
To: Ingo Molnar <mi...@kernel.org>
Cc: Linux Kernel <linux-kernel@vger.kernel.org>
Cc: Linux MM <linux...@kvack.org>
Cc: Rob Landley <r...@landley.net>
Cc: Mike Travis <tra...@sgi.com>
Cc: Daniel J Blueman <dan...@numascale-asia.com>
Cc: Andrew Morton <a...@linux-foundation.org>
Cc: Greg KH <gre...@linuxfoundation.org>
Cc: Yinghai Lu <ying...@kernel.org>
Cc: Mel Gorman <mgor...@suse.de>
---
  mm/nobootmem.c | 8 
  1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index bdd3fa2..3b512ca 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -83,10 +83,10 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
  static void __init __free_pages_memory(unsigned long start, unsigned long end)
  {
unsigned long i, start_aligned, end_aligned;
-   int order = ilog2(BITS_PER_LONG);
+   int order = ilog2(max(BITS_PER_LONG, PTRS_PER_PMD));
  
-	start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);

-   end_aligned = end & ~(BITS_PER_LONG - 1);
+   start_aligned = (start + ((1UL << order) - 1)) & ~((1UL << order) - 1);
+   end_aligned = end & ~((1UL << order) - 1);
  
  	if (end_aligned <= start_aligned) {

for (i = start; i < end; i++)
@@ -98,7 +98,7 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end)
for (i = start; i < start_aligned; i++)
__free_pages_bootmem(pfn_to_page(i), 0);
  
-	for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)

+   for (i = start_aligned; i < end_aligned; i += 1 << order)
__free_pages_bootmem(pfn_to_page(i), order);
  
  	for (i = end_aligned; i < end; i++)




Re: [PATCH 0/9] mm, hugetlb: clean-up and possible bug fix

2013-07-15 Thread Sam Ben

On 07/16/2013 09:45 AM, Joonsoo Kim wrote:

On Tue, Jul 16, 2013 at 09:27:29AM +0800, Sam Ben wrote:

On 07/16/2013 09:10 AM, Joonsoo Kim wrote:

On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:

Joonsoo Kim  writes:


First 5 patches are almost trivial clean-up patches.

The others are for fixing three bugs.
Perhaps, these problems are minor, because this codes are used
for a long time, and there is no bug reporting for these problems.

These patches are based on v3.10.0 and
passed sanity check of libhugetlbfs.

does that mean you had run with libhugetlbfs test suite ?

Yes! I can't find any regression on the libhugetlbfs test suite.

Where can I get your test cases?

These are my own test cases.
I will plan to submit these test cases to libhugetlbfs test suite.


Could you point out where I can get the libhugetlbfs test suite? ;-)



Thanks.


-aneesh



Re: [PATCH 0/9] mm, hugetlb: clean-up and possible bug fix

2013-07-15 Thread Sam Ben

On 07/16/2013 09:10 AM, Joonsoo Kim wrote:

On Mon, Jul 15, 2013 at 07:40:16PM +0530, Aneesh Kumar K.V wrote:

Joonsoo Kim  writes:


First 5 patches are almost trivial clean-up patches.

The others are for fixing three bugs.
Perhaps, these problems are minor, because this codes are used
for a long time, and there is no bug reporting for these problems.

These patches are based on v3.10.0 and
passed sanity check of libhugetlbfs.

does that mean you had run with libhugetlbfs test suite ?

Yes! I can't find any regression on the libhugetlbfs test suite.


Where can I get your test cases?



  
-aneesh





Re: boot tracing

2013-07-14 Thread Sam Ben

On 07/12/2013 04:53 PM, Ingo Molnar wrote:

* Borislav Petkov <b...@alien8.de> wrote:


On Fri, Jul 12, 2013 at 10:27:56AM +0200, Ingo Molnar wrote:

Robert Richter and Boris Petkov are working on 'persistent events'
support for perf, which will eventually allow boot time profiling -
I'm not sure if the patches and the tooling support is ready enough
yet for your purposes.

Nope, not yet but we're getting there.


Robert, Boris, the following workflow would be pretty intuitive:

  - kernel developer sets boot flag: perf=boot,freq=1khz,size=16MB

What does perf=boot mean? I assume boot tracing.

In this case it would mean boot profiling - i.e. a cycles hardware-PMU
event collecting into a perf trace buffer as usual.

Essentially a 'perf record -a' work-alike, just one that gets activated as
early as practical, and which would allow the profiling of memory
initialization.

Now, one extra complication here is that to be able to profile buddy
allocator this persistent event would have to work before the buddy
allocator is active :-/ So this sort of profiling would have to use
memblock_alloc().


Could perf=boot be used to sample the performance of the memblock subsystem?
I think the perf subsystem is initialized too late to monitor this.




Just wanted to highlight this usecase, we might eventually want to support
it.

[ Note that this is different from boot tracing of one or more trace
   events - but it's a conceptually pretty close cousin. ]
  
Thanks,


Ingo



Re: [RFC][PATCH] mm: madvise: MADV_POPULATE for quick pre-faulting

2013-07-13 Thread Sam Ben

On 07/02/2013 10:37 AM, Zheng Liu wrote:

On Mon, Jul 01, 2013 at 09:16:46AM -0700, Dave Hansen wrote:

On 06/28/2013 07:20 PM, Zheng Liu wrote:

IOW, a process needing to do a bunch of MAP_POPULATEs isn't
parallelizable, but one using this mechanism would be.

I look at the code, and it seems that we will handle MAP_POPULATE flag
after we release mmap_sem locking in vm_mmap_pgoff():

	down_write(&mm->mmap_sem);
	ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
			    &populate);
	up_write(&mm->mmap_sem);
	if (populate)
		mm_populate(ret, populate);

Am I missing something?

I went and did my same test using mmap(MAP_POPULATE)/munmap() pair
versus using MADV_POPULATE in 160 threads in parallel.

MADV_POPULATE was about 10x faster in the threaded configuration.

With MADV_POPULATE, the biggest cost is shipping the mmap_sem cacheline
around so that we can write the reader count update in to it.  With
mmap(), there is a lot of _contention_ on that lock which is much, much
more expensive than simply bouncing a cacheline around.

Thanks for your explanation.

FWIW, it would be great if we can let MAP_POPULATE flag support shared
mappings because in our product system there has a lot of applications
that uses mmap(2) and then pre-faults this mapping.  Currently these
applications need to pre-fault the mapping manually.


How do you pre-fault the mapping manually in your production system? By
walking through the file and touching each page?
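For reference, "manual pre-faulting" in the sense asked about above usually looks
something like the standalone sketch below: map the file, then read one byte per page so
every page is faulted in up front. The file name and the checksum trick are just for the
example.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "data.bin";
	int fd = open(path, O_RDONLY);
	struct stat st;
	long pagesz = sysconf(_SC_PAGESIZE);

	if (fd < 0 || fstat(fd, &st) < 0) {
		perror(path);
		return 1;
	}

	char *p = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Walk the mapping, reading one byte per page to force each fault. */
	unsigned long sum = 0;
	for (off_t off = 0; off < st.st_size; off += pagesz)
		sum += (unsigned char)p[off];

	printf("pre-faulted %ld pages (checksum %lu)\n",
	       (long)((st.st_size + pagesz - 1) / pagesz), sum);
	munmap(p, st.st_size);
	close(fd);
	return 0;
}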




Regards,
 - Zheng


Re: RFC: named anonymous vmas

2013-07-13 Thread Sam Ben

Hi Christoph,
On 06/24/2013 07:48 PM, Christoph Hellwig wrote:

On Sat, Jun 22, 2013 at 12:47:29PM -0700, Alex Elsayed wrote:

Couldn't this be done by having a root-only tmpfs, and having a userspace
component that creates per-app directories with restrictive permissions on
startup/app install? Then each app creates files in its own directory, and
can pass the fds around.

Honestly having a device that allows passing fds around that can be
mmaped sounds a lot simpler.  I have to admit that I expect /dev/zero
to do this, but looking at the code it creates new file structures
at ->mmap time which would defeat this.


Could you point out where this is done?





Re: RFC: named anonymous vmas

2013-07-13 Thread Sam Ben

Hi Colin,
On 06/22/2013 07:42 AM, Colin Cross wrote:

One of the features of ashmem (drivers/staging/android/ashmem.c) that
hasn't gotten much discussion about moving out of staging is named
anonymous memory.

In Android, ashmem is used for three different features, and most
users of it only care about one feature at a time.  One is volatile
ranges, which John Stultz has been implementing.  The second is
anonymous shareable memory without having a world-writable tmpfs that
untrusted apps could fill with files.  The third and most heavily used


How should I understand "anonymous shareable memory without having a
world-writable tmpfs that untrusted apps could fill with files"?



feature within the Android codebase is named anonymous memory, where a
region of anonymous memory can have a name associated with it that
will show up in /proc/pid/maps.  The Dalvik VM likes to use this
feature extensively, even for memory that will never be shared and
could easily be allocated using an anonymous mmap, and even malloc has
used it in the past.  It provides an easy way to collate memory used
for different purposes across multiple processes, which Android uses
for its "dumpsys meminfo" and "librank" tools to determine how much
memory is used for java heaps, JIT caches, native mallocs, etc.

I'd like to add this feature for anonymous mmap memory.  I propose
adding an madvise2(unsigned long start, size_t len_in, int behavior,
void *ptr, size_t size) syscall and a new MADV_NAME behavior, which
treats ptr as a string of length size.  The string would be copied
somewhere reusable in the kernel, or reused if it already exists, and
the kernel address of the string would get stashed in a new field in
struct vm_area_struct.  Adjacent vmas would only get merged if the
name pointer matched, and naming part of a mapping would split the
mapping.  show_map_vma would print the name only if none of the other
existing names rules match.

Any comments as I start implementing it?  Is there any reason to allow
naming a file-backed mapping and showing it alongside the file name in
/proc/pid/maps?
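To make the proposed interface concrete, here is a purely hypothetical usage sketch.
Neither madvise2() nor MADV_NAME exists in mainline, so the syscall number and flag value
below are placeholders; only the prototype follows the proposal quoted above.

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

#define __NR_madvise2	(-1)	/* placeholder: a real number would be assigned */
#define MADV_NAME	100	/* placeholder flag from the proposal */

static long madvise2(void *addr, size_t len, int behavior,
		     void *ptr, size_t size)
{
	return syscall(__NR_madvise2, addr, len, behavior, ptr, size);
}

int main(void)
{
	size_t len = 16 * 4096;
	void *heap = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	const char name[] = "dalvik-heap";

	/* Name the anonymous region so it would show up in /proc/pid/maps. */
	madvise2(heap, len, MADV_NAME, (void *)name, sizeof(name));
	printf("check /proc/%d/maps for the named region\n", getpid());
	return 0;
}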




Re: [PATCH v3 1/2] sched: smart wake-affine foundation

2013-07-09 Thread Sam Ben

On 07/08/2013 10:36 AM, Michael Wang wrote:

Hi, Sam

On 07/07/2013 09:31 AM, Sam Ben wrote:

On 07/04/2013 12:55 PM, Michael Wang wrote:

wake-affine stuff is always trying to pull wakee close to waker, by
theory,
this will bring benefit if waker's cpu cached hot data for wakee, or the
extreme ping-pong case.

What's the meaning of the ping-pong case?

PeterZ explained it well in here:

https://lkml.org/lkml/2013/3/7/332

And you could try to compare:
taskset 1 perf bench sched pipe
with
perf bench sched pipe


Why is sched pipe special?



to confirm it ;-)

Regards,
Michael Wang


And testing show it could benefit hackbench 15% at most.

However, the whole stuff is somewhat blindly and time-consuming, some
workload therefore suffer.

And testing show it could damage pgbench 50% at most.

Thus, wake-affine stuff should be more smart, and realise when to stop
it's thankless effort.

This patch introduced 'nr_wakee_switch', which will be increased each
time the task switch it's wakee.

So a high 'nr_wakee_switch' means the task has more than one wakee, and
bigger the number, higher the wakeup frequency.

Now when making the decision on whether to pull or not, pay attention on
the wakee with a high 'nr_wakee_switch', pull such task may benefit
wakee,
but also imply that waker will face cruel competition later, it could be
very cruel or very fast depends on the story behind 'nr_wakee_switch',
whatever, waker therefore suffer.

Furthermore, if waker also has a high 'nr_wakee_switch', imply that
multiple
tasks rely on it, then waker's higher latency will damage all of them,
pull
wakee seems to be a bad deal.

Thus, when 'waker->nr_wakee_switch / wakee->nr_wakee_switch' become
higher
and higher, the deal seems to be worse and worse.

The patch therefore help wake-affine stuff to stop it's work when:

 wakee->nr_wakee_switch > factor &&
 waker->nr_wakee_switch > (factor * wakee->nr_wakee_switch)

The factor here is the node-size of current-cpu, so bigger node will lead
to more pull since the trial become more severe.

After applied the patch, pgbench show 40% improvement at most.

Test:
 Tested with 12 cpu X86 server and tip 3.10.0-rc7.

	pgbench                      base              smart

	| db_size | clients |  tps  |   |  tps  |
	+---------+---------+-------+   +-------+
 | 22 MB   |   1 | 10598 |   | 10796 |
 | 22 MB   |   2 | 21257 |   | 21336 |
 | 22 MB   |   4 | 41386 |   | 41622 |
 | 22 MB   |   8 | 51253 |   | 57932 |
 | 22 MB   |  12 | 48570 |   | 54000 |
 | 22 MB   |  16 | 46748 |   | 55982 | +19.75%
 | 22 MB   |  24 | 44346 |   | 55847 | +25.93%
 | 22 MB   |  32 | 43460 |   | 54614 | +25.66%
 | 7484 MB |   1 |  8951 |   |  9193 |
 | 7484 MB |   2 | 19233 |   | 19240 |
 | 7484 MB |   4 | 37239 |   | 37302 |
 | 7484 MB |   8 | 46087 |   | 50018 |
 | 7484 MB |  12 | 42054 |   | 48763 |
 | 7484 MB |  16 | 40765 |   | 51633 | +26.66%
 | 7484 MB |  24 | 37651 |   | 52377 | +39.11%
 | 7484 MB |  32 | 37056 |   | 51108 | +37.92%
 | 15 GB   |   1 |  8845 |   |  9104 |
 | 15 GB   |   2 | 19094 |   | 19162 |
 | 15 GB   |   4 | 36979 |   | 36983 |
 | 15 GB   |   8 | 46087 |   | 49977 |
 | 15 GB   |  12 | 41901 |   | 48591 |
 | 15 GB   |  16 | 40147 |   | 50651 | +26.16%
 | 15 GB   |  24 | 37250 |   | 52365 | +40.58%
 | 15 GB   |  32 | 36470 |   | 50015 | +37.14%

CC: Ingo Molnar <mi...@kernel.org>
CC: Peter Zijlstra <pet...@infradead.org>
CC: Mike Galbraith <efa...@gmx.de>
Signed-off-by: Michael Wang <wang...@linux.vnet.ibm.com>
---
   include/linux/sched.h |3 +++
   kernel/sched/fair.c   |   47 +++
   2 files changed, 50 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 178a8d9..1c996c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1041,6 +1041,9 @@ struct task_struct {
   #ifdef CONFIG_SMP
   struct llist_node wake_entry;
   int on_cpu;
+struct task_struct *last_wakee;
+unsigned long nr_wakee_switch;
+unsigned long last_switch_decay;
   #endif
   int on_rq;
   diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c61a614..a4ddbf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2971,6 +2971,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
   return 0;
   }
   +static void record_wakee(struct task_struct *p)
+{
+/*
+ * Rough decay(wiping) for cost saving, don't worry
+ * about the boundary, really active task won't care
+ * the loose.
+ */
+if (jiffies > current->last_switch_decay + HZ) {
+current->nr_wakee_switch = 0;
+current->last_switch_decay = jiffies;
+}
+
+if (current->last_wakee != p) {
+current->last_wakee = p;
+current->nr_wakee_switch++;
+}
+}
 static void task_waking_f
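For readers who want to play with the stop condition quoted above outside the kernel,
here is a tiny standalone simulation. The factor (node size) and the waker/wakee switch
counts are made-up numbers for illustration; only the inequality mirrors the changelog.

#include <stdbool.h>
#include <stdio.h>

struct task {
	unsigned long nr_wakee_switch;
};

/* Skip the affine pull when both waker and wakee switch wakees "too widely". */
static bool wake_affine_too_wide(const struct task *waker,
				 const struct task *wakee,
				 unsigned int factor)
{
	return wakee->nr_wakee_switch > factor &&
	       waker->nr_wakee_switch > factor * wakee->nr_wakee_switch;
}

int main(void)
{
	struct task server = { .nr_wakee_switch = 200 };	/* e.g. pgbench server */
	struct task client = { .nr_wakee_switch = 15 };		/* one of many clients */
	unsigned int factor = 12;				/* cpus in this node */

	printf("skip affine pull: %s\n",
	       wake_affine_too_wide(&server, &client, factor) ? "yes" : "no");
	return 0;
}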

Re: [PATCH v3 1/2] sched: smart wake-affine foundation

2013-07-06 Thread Sam Ben

On 07/04/2013 12:55 PM, Michael Wang wrote:

wake-affine stuff is always trying to pull wakee close to waker, by theory,
this will bring benefit if waker's cpu cached hot data for wakee, or the
extreme ping-pong case.


What's the meaning of the ping-pong case?



And testing show it could benefit hackbench 15% at most.

However, the whole stuff is somewhat blindly and time-consuming, some
workload therefore suffer.

And testing show it could damage pgbench 50% at most.

Thus, wake-affine stuff should be more smart, and realise when to stop
it's thankless effort.

This patch introduced 'nr_wakee_switch', which will be increased each
time the task switch it's wakee.

So a high 'nr_wakee_switch' means the task has more than one wakee, and
bigger the number, higher the wakeup frequency.

Now when making the decision on whether to pull or not, pay attention on
the wakee with a high 'nr_wakee_switch', pull such task may benefit wakee,
but also imply that waker will face cruel competition later, it could be
very cruel or very fast depends on the story behind 'nr_wakee_switch',
whatever, waker therefore suffer.

Furthermore, if waker also has a high 'nr_wakee_switch', imply that multiple
tasks rely on it, then waker's higher latency will damage all of them, pull
wakee seems to be a bad deal.

Thus, when 'waker->nr_wakee_switch / wakee->nr_wakee_switch' become higher
and higher, the deal seems to be worse and worse.

The patch therefore help wake-affine stuff to stop it's work when:

wakee->nr_wakee_switch > factor &&
waker->nr_wakee_switch > (factor * wakee->nr_wakee_switch)

The factor here is the node-size of current-cpu, so bigger node will lead
to more pull since the trial become more severe.

After applied the patch, pgbench show 40% improvement at most.

Test:
Tested with 12 cpu X86 server and tip 3.10.0-rc7.

pgbench                      base              smart

| db_size | clients |  tps  |   |  tps  |
+---------+---------+-------+   +-------+
| 22 MB   |   1 | 10598 |   | 10796 |
| 22 MB   |   2 | 21257 |   | 21336 |
| 22 MB   |   4 | 41386 |   | 41622 |
| 22 MB   |   8 | 51253 |   | 57932 |
| 22 MB   |  12 | 48570 |   | 54000 |
| 22 MB   |  16 | 46748 |   | 55982 | +19.75%
| 22 MB   |  24 | 44346 |   | 55847 | +25.93%
| 22 MB   |  32 | 43460 |   | 54614 | +25.66%
| 7484 MB |   1 |  8951 |   |  9193 |
| 7484 MB |   2 | 19233 |   | 19240 |
| 7484 MB |   4 | 37239 |   | 37302 |
| 7484 MB |   8 | 46087 |   | 50018 |
| 7484 MB |  12 | 42054 |   | 48763 |
| 7484 MB |  16 | 40765 |   | 51633 | +26.66%
| 7484 MB |  24 | 37651 |   | 52377 | +39.11%
| 7484 MB |  32 | 37056 |   | 51108 | +37.92%
| 15 GB   |   1 |  8845 |   |  9104 |
| 15 GB   |   2 | 19094 |   | 19162 |
| 15 GB   |   4 | 36979 |   | 36983 |
| 15 GB   |   8 | 46087 |   | 49977 |
| 15 GB   |  12 | 41901 |   | 48591 |
| 15 GB   |  16 | 40147 |   | 50651 | +26.16%
| 15 GB   |  24 | 37250 |   | 52365 | +40.58%
| 15 GB   |  32 | 36470 |   | 50015 | +37.14%

CC: Ingo Molnar <mi...@kernel.org>
CC: Peter Zijlstra <pet...@infradead.org>
CC: Mike Galbraith <efa...@gmx.de>
Signed-off-by: Michael Wang <wang...@linux.vnet.ibm.com>
---
  include/linux/sched.h |3 +++
  kernel/sched/fair.c   |   47 +++
  2 files changed, 50 insertions(+), 0 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 178a8d9..1c996c7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1041,6 +1041,9 @@ struct task_struct {
  #ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
+   struct task_struct *last_wakee;
+   unsigned long nr_wakee_switch;
+   unsigned long last_switch_decay;
  #endif
int on_rq;
  
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index c61a614..a4ddbf5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2971,6 +2971,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
return 0;
  }
  
+static void record_wakee(struct task_struct *p)

+{
+   /*
+* Rough decay(wiping) for cost saving, don't worry
+* about the boundary, really active task won't care
+* the loose.
+*/
+   if (jiffies > current->last_switch_decay + HZ) {
+   current->nr_wakee_switch = 0;
+   current->last_switch_decay = jiffies;
+   }
+
+   if (current->last_wakee != p) {
+   current->last_wakee = p;
+   current->nr_wakee_switch++;
+   }
+}
  
  static void task_waking_fair(struct task_struct *p)

  {
@@ -2991,6 +3008,7 @@ static void task_waking_fair(struct task_struct *p)
  #endif
  
  	se->vruntime -= min_vruntime;

+   record_wakee(p);
  }
  
  #ifdef CONFIG_FAIR_GROUP_SCHED

@@ -3109,6 +3127,28 @@ static 


Re: [PATCH 2/3] mm: Ensure that mark_page_accessed moves pages to the active list

2013-04-30 Thread Sam Ben

Hi Mel,
On 04/30/2013 12:31 AM, Mel Gorman wrote:

If a page is on a pagevec then it is !PageLRU and mark_page_accessed()
may fail to move a page to the active list as expected. Now that the
LRU is selected at LRU drain time, mark pages PageActive if they are
on a pagevec so it gets moved to the correct list at LRU drain time.
Using a debugging patch it was found that, for a simple git checkout
based workload, pages were never added to the active file list in


Could you show us the details of your workload?


practice but with this patch applied they are.

                       before      after
LRU Add Active File         0     757121
LRU Add Active Anon   2678833    2633924
LRU Add Inactive File 8821711    8085543
LRU Add Inactive Anon     183        200
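
To make the failure mode concrete, here is a sketch of the pre-patch sequence for a freshly added file page; the call names are real mm/ entry points (lru_cache_add_lru() appears in the diff below), but the sequence itself is an illustration, not a trace taken from the changelog:

  lru_cache_add_lru(page, LRU_INACTIVE_FILE)  /* page parked on a per-cpu pagevec, !PageLRU */
  mark_page_accessed(page)                    /* only sets PageReferenced */
  mark_page_accessed(page)                    /* PageReferenced but !PageLRU, so no activation */
  lru_add_drain()                             /* page reaches the inactive list anyway */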

The question to consider is if this is universally safe. If the page
was isolated for reclaim and there is a parallel mark_page_accessed()
then vmscan.c will get upset when it finds an isolated PageActive page.
Similarly a potential race exists between a per-cpu drain on a pagevec
list and an activation on a remote CPU.

lru_add_drain_cpu
  __pagevec_lru_add
    lru = page_lru(page);
                                 mark_page_accessed
                                   if (PageLRU(page))
                                     activate_page
                                   else
                                     SetPageActive
    SetPageLRU(page);
    add_page_to_lru_list(page, lruvec, lru);

A PageActive page is now added to the inactive list.

While this looks strange, I think it is sufficiently harmless that additional
barriers to address the case are not justified. Unfortunately, while I never
witnessed it myself, these parallel updates potentially trigger defensive
DEBUG_VM checks on PageActive, and hence those checks are removed by this patch.
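
The race that would trip those checks, a page already isolated for reclaim while a remote CPU runs mark_page_accessed(), can be sketched the same way; the interleaving is an illustration of the description above, not an observed trace:

shrink_page_list          (page already isolated, !PageLRU)
                                  mark_page_accessed
                                    if (PageLRU(page))   /* false */
                                    else
                                      SetPageActive(page)
  VM_BUG_ON(PageActive(page))     /* the DEBUG_VM check removed below */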

Signed-off-by: Mel Gorman 
---
  mm/swap.c   | 18 --
  mm/vmscan.c |  3 ---
  2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/mm/swap.c b/mm/swap.c
index 80fbc37..2a10d08 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -437,8 +437,17 @@ void activate_page(struct page *page)
 void mark_page_accessed(struct page *page)
 {
 	if (!PageActive(page) && !PageUnevictable(page) &&
-			PageReferenced(page) && PageLRU(page)) {
-		activate_page(page);
+			PageReferenced(page)) {
+
+		/*
+		 * If the page is on the LRU, promote immediately. Otherwise,
+		 * assume the page is on a pagevec, mark it active and it'll
+		 * be moved to the active LRU on the next drain
+		 */
+		if (PageLRU(page))
+			activate_page(page);
+		else
+			SetPageActive(page);
 		ClearPageReferenced(page);
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
@@ -478,11 +487,8 @@ EXPORT_SYMBOL(__lru_cache_add);
  */
 void lru_cache_add_lru(struct page *page, enum lru_list lru)
 {
-	if (PageActive(page)) {
+	if (PageActive(page))
 		VM_BUG_ON(PageUnevictable(page));
-	} else if (PageUnevictable(page)) {
-		VM_BUG_ON(PageActive(page));
-	}
 
 	VM_BUG_ON(PageLRU(page));
 	__lru_cache_add(page, lru);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 88c5fed..751b897 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -704,7 +704,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (!trylock_page(page))
 			goto keep;
 
-		VM_BUG_ON(PageActive(page));
 		VM_BUG_ON(page_zone(page) != zone);
 
 		sc->nr_scanned++;
@@ -935,7 +934,6 @@ activate_locked:
 		/* Not a candidate for swapping, so reclaim swap space. */
 		if (PageSwapCache(page) && vm_swap_full())
 			try_to_free_swap(page);
-		VM_BUG_ON(PageActive(page));
 		SetPageActive(page);
 		pgactivate++;
 keep_locked:
@@ -3488,7 +3486,6 @@ void check_move_unevictable_pages(struct page **pages, int nr_pages)
 		if (page_evictable(page)) {
 			enum lru_list lru = page_lru_base_type(page);
 
-			VM_BUG_ON(PageActive(page));
 			ClearPageUnevictable(page);
 			del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
 			add_page_to_lru_list(page, lruvec, lru);


