[PATCH] scripts/checkpatch.pl: fix false warning of externs checking.

2017-10-10 Thread Jiang Biao
When adding a function declaration in a .c file without an extern
keyword decoration, checkpatch.pl complains with a false *externs
should be avoided in .c files* warning. This patch fixes the false
warning.
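
For illustration, a hypothetical snippet that triggers the warning
described above (the file and function names are made up):

    /* foo.c - hypothetical example */
    int foo(void);          /* declaration without the extern keyword */

    int bar(void)
    {
            return foo();
    }

With the current check, the optional "(?:extern\s+)?" group lets the
plain declaration of foo() match as well, so it is reported as an
extern in a .c file; making the extern keyword mandatory in the regex
restricts the warning to real externs.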

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 scripts/checkpatch.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index dd2c262..170daf1 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -5927,7 +5927,7 @@ sub process {
 
 # check for new externs in .c files.
if ($realfile =~ /\.c$/ && defined $stat &&
-   $stat =~ /^.\s*(?:extern\s+)?$Type\s+($Ident)(\s*)\(/s)
+   $stat =~ /^.\s*(?:extern\s+)$Type\s+($Ident)(\s*)\(/s)
{
my $function_name = $1;
my $paren_space = $2;
-- 
2.7.4



[PATCH] checkpatch: statement should end at a close brace at the outermost level.

2017-10-10 Thread Jiang Biao
Statement should end at a close brace at the outermost level in
ctx_statement_block.

To reproduce the bug:
1. Add two external function declarations at line 505 of
kernel/stop_machine.c, such as,
int foo1(void);
int foo2(void);
2. Format a patch for that, and use checkpatch.pl to check it.
3. The first declaration (foo1()) is not warned about, because the
statement does not end at the '}' before it. (See the sketch below.)
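
A minimal sketch of the code shape involved (hypothetical, mirroring
the reproduction steps above):

    static int foo0(void)
    {
            /* ... */
    }               /* ctx_statement_block did not stop at this '}',  */
    int foo1(void); /* so foo1() was folded into the previous         */
                    /* statement and escaped the externs check,       */
    int foo2(void); /* while foo2() was reported as expected          */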

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 scripts/checkpatch.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl
index dd2c262..f220cfc 100755
--- a/scripts/checkpatch.pl
+++ b/scripts/checkpatch.pl
@@ -1330,7 +1330,7 @@ sub ctx_statement_block {
 
# Statement ends at the ';' or a close '}' at the
# outermost level.
-   if ($level == 0 && $c eq ';') {
+   if ($level == 0 && ($c eq ';' || $c eq '}')) {
last;
}
 
-- 
2.7.4



[PATCH] mm/vmscan: make do_shrink_slab more robust.

2017-11-26 Thread Jiang Biao
When running the LTP stress test for 7*24 hours, the kernel
occasionally emits the following warning repeatedly,

mb_cache_shrink_scan+0x0/0x3f0 negative objects to delete
nr=-9222526086287711848
mb_cache_shrink_scan+0x0/0x3f0 negative objects to delete
nr=-9222420761333860545
mb_cache_shrink_scan+0x0/0x3f0 negative objects to delete
nr=-987677544280360
...

Tracing shows that the freeable count (returned by
mb_cache_shrink_scan()) is -1, which causes the continuous
accumulation and overflow of total_scan.

This patch makes do_shrink_slab() more robust when
shrinker->count_objects() returns a negative freeable count, as
sketched below.
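
A rough sketch of how the negative value propagates, simplified from
the surrounding do_shrink_slab() logic (not the exact kernel code):

    freeable = shrinker->count_objects(shrinker, shrinkctl); /* -1 */

    delta = (4 * nr_scanned) / shrinker->seeks;
    delta *= freeable;      /* delta becomes a bogus negative value  */
    total_scan += delta;    /* total_scan accumulates garbage ...    */
    /* ... and later trips the "negative objects to delete" pr_err().
     * Bailing out early with "if (freeable <= 0) return 0;" avoids
     * feeding the bogus value into the scan accounting at all. */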

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb2f031..3ea28f0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -323,7 +323,7 @@ static unsigned long do_shrink_slab(struct shrink_control 
*shrinkctl,
long scanned = 0, next_deferred;
 
freeable = shrinker->count_objects(shrinker, shrinkctl);
-   if (freeable == 0)
+   if (freeable <= 0)
return 0;
 
/*
-- 
2.7.4



[PATCH] fs/mbcache: make count_objects more robust.

2017-11-26 Thread Jiang Biao
When running the LTP stress test for 7*24 hours, vmscan occasionally
emits the following warning repeatedly,

mb_cache_scan+0x0/0x3f0 negative objects to delete
nr=-9232265467809300450
...

Tracing shows that the freeable count (returned by mb_cache_count())
is -1, which causes the continuous accumulation and overflow of
total_scan.

This patch makes sure mb_cache_count() does not return a negative
value, which makes the mbcache shrinker more robust.

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 fs/mbcache.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/mbcache.c b/fs/mbcache.c
index d818fd2..b8b8b9c 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -269,6 +269,9 @@ static unsigned long mb_cache_count(struct shrinker *shrink,
struct mb_cache *cache = container_of(shrink, struct mb_cache,
  c_shrink);
 
+   /* Unlikely, but not impossible */
+   if (unlikely(cache->c_entry_count < 0))
+   return 0;
return cache->c_entry_count;
 }
 
-- 
2.7.4



[PATCH] mm/vmscan: try to optimize branch procedures.

2017-11-27 Thread Jiang Biao
1. Use unlikely() to help branch prediction. The
*total_scan < 0* branch is rarely taken, so mark it unlikely.

2. Optimize the *next_deferred >= scanned* condition.
The *next_deferred >= scanned* condition can be relaxed to
*next_deferred > scanned*, because when *next_deferred == scanned*,
next_deferred ends up as 0, which is covered by the else branch.

3. Merge the two branch blocks into one. The *next_deferred > 0*
branch can be merged into *next_deferred > scanned* to simplify the
code (see the equivalence sketch below).
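
A simplified equivalence check for the corner case where
next_deferred == scanned (sketch only, not kernel code):

    /*
     * old code: next_deferred -= scanned;    -> next_deferred == 0
     *           if (next_deferred > 0)       -> false, else branch
     * new code: if (next_deferred > scanned) -> false, else branch
     *
     * Both paths end up in
     * atomic_long_read(&shrinker->nr_deferred[nid]), so the merged
     * form preserves the old behaviour.
     */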

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 mm/vmscan.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb2f031..5f5d4ab 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -338,7 +338,7 @@ static unsigned long do_shrink_slab(struct shrink_control 
*shrinkctl,
delta *= freeable;
do_div(delta, nr_eligible + 1);
total_scan += delta;
-   if (total_scan < 0) {
+   if (unlikely(total_scan < 0)) {
pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
   shrinker->scan_objects, total_scan);
total_scan = freeable;
@@ -407,18 +407,16 @@ static unsigned long do_shrink_slab(struct shrink_control 
*shrinkctl,
cond_resched();
}
 
-   if (next_deferred >= scanned)
-   next_deferred -= scanned;
-   else
-   next_deferred = 0;
/*
 * move the unused scan count back into the shrinker in a
 * manner that handles concurrent updates. If we exhausted the
 * scan, there is no need to do an update.
 */
-   if (next_deferred > 0)
+   if (next_deferred > scanned) {
+   next_deferred -= scanned;
new_nr = atomic_long_add_return(next_deferred,
&shrinker->nr_deferred[nid]);
+   }
else
new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
 
-- 
2.7.4



[PATCH] mm/vmscan: change return type of is_page_cache_freeable from int to bool

2017-11-27 Thread Jiang Biao
Using bool as the return type of is_page_cache_freeable() is more
appropriate.

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb2f031..5fe63ed 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -530,7 +530,7 @@ void drop_slab(void)
drop_slab_node(nid);
 }
 
-static inline int is_page_cache_freeable(struct page *page)
+static inline bool is_page_cache_freeable(struct page *page)
 {
/*
 * A freeable page cache page is referenced only by the caller
-- 
2.7.4



[PATCH v2] mm: fix page_freeze_refs and page_unfreeze_refs in comments.

2018-07-26 Thread Jiang Biao
page_freeze_refs/page_unfreeze_refs have already been replaced by
page_ref_freeze/page_ref_unfreeze, but the comments have not been
updated accordingly.

Signed-off-by: Jiang Biao 
---
v1: fix comments in vmscan.
v2: fix two other places and fix typos.

 mm/ksm.c| 4 ++--
 mm/memory-failure.c | 2 +-
 mm/vmscan.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index a6d43cf..4c39cb67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -703,7 +703,7 @@ static struct page *get_ksm_page(struct stable_node 
*stable_node, bool lock_it)
 * We cannot do anything with the page while its refcount is 0.
 * Usually 0 means free, or tail of a higher-order page: in which
 * case this node is no longer referenced, and should be freed;
-* however, it might mean that the page is under page_freeze_refs().
+* however, it might mean that the page is under page_ref_freeze().
 * The __remove_mapping() case is easy, again the node is now stale;
 * but if page is swapcache in migrate_page_move_mapping(), it might
 * still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ static struct page *get_ksm_page(struct stable_node 
*stable_node, bool lock_it)
 * work here too.  We have chosen the !PageSwapCache test to
 * optimize the common case, when the page is or is about to
 * be freed: PageSwapCache is cleared (under spin_lock_irq)
-* in the freeze_refs section of __remove_mapping(); but Anon
+* in the ref_freeze section of __remove_mapping(); but Anon
 * page->mapping reset to NULL later, in free_pages_prepare().
 */
if (!PageSwapCache(page))
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9..c83a174 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1167,7 +1167,7 @@ int memory_failure(unsigned long pfn, int flags)
 *R/W the page; let's pray that the page has been
 *used and will be freed some time later.
 * In fact it's dangerous to directly bump up page count from 0,
-* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
 */
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..02d0c20 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -744,7 +744,7 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
-   /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+   /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
-- 
2.7.4



[PATCH] mm/vmscan: fix page_freeze_refs in comment.

2018-07-25 Thread Jiang Biao
page_freeze_refs has already been replaced by page_ref_freeze, but
the comment has not been updated.

Signed-off-by: Jiang Biao 
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..d29e207 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -744,7 +744,7 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
-   /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+   /* note: atomic_cmpxchg in page_refs_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
-- 
2.7.4



[PATCH v4 1/2] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-19 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL; we should check the
return value to avoid a NULL pointer dereference, and add a warning
for the failed allocation where NULL is returned.
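
The WARN_ON_ONCE()-inside-the-condition form both logs a one-time
backtrace and acts as the guard; a minimal sketch of the pattern
(generic names, for illustration only):

    unsigned long page = __get_free_page(gfp);

    /* WARN_ON_ONCE() evaluates to its condition, so the failed
     * allocation is warned about once and the bail-out happens on
     * the same line: */
    if (WARN_ON_ONCE(!page))
            return NULL;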

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..8679c64 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)

if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON_ONCE(!new_p4d_page))
return NULL;

set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,9 +195,13 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   p4d_t *p4d;
pud_t *pud;

+   p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
@@ -354,6 +358,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;

user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return;
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
--
2.7.4



[PATCH v4 2/2] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-19 Thread Jiang Biao
Check the return value of pti_user_pagetable_walk_pmd() to avoid a
NULL pointer dereference, and add a warning for the failed allocation.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 8679c64..5cd7b82 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -205,7 +205,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
-   if (!new_pud_page)
+   if (WARN_ON_ONCE(!new_pud_page))
return NULL;

set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -219,7 +219,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
-   if (!new_pmd_page)
+   if (WARN_ON_ONCE(!new_pmd_page))
return NULL;

set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -241,9 +241,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   pmd_t *pmd;
pte_t *pte;

+   pmd = pti_user_pagetable_walk_pmd(address);
+   if (!pmd)
+   return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
--
2.7.4



[PATCH v3 1/2] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-17 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL; we should check the
return value to avoid a NULL pointer dereference, and add a warning
for the failed allocation where NULL is returned.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..5c33a16 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON(!new_p4d_page))
return NULL;
 
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,9 +195,12 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
 
+   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
@@ -354,6 +357,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return; 
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
-- 
2.7.4



[PATCH v3 2/2] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-17 Thread Jiang Biao
Check the return value of pti_user_pagetable_walk_pmd() to avoid a
NULL pointer dereference, and add a warning for the failed allocation.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 5c33a16..1342f73 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -204,7 +204,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
-   if (!new_pud_page)
+   if (WARN_ON(!new_pud_page))
return NULL;
 
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -218,7 +218,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
-   if (!new_pmd_page)
+   if (WARN_ON(!new_pmd_page))
return NULL;
 
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -240,9 +240,12 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
 
+   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   if (!pmd)
+   return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
-- 
2.7.4



[PATCH] x86/speculation: remove SPECTRE_V2_IBRS in enum spectre_v2_mitigation

2018-07-17 Thread Jiang Biao
SPECTRE_V2_IBRS in enum spectre_v2_mitigation is never used; remove
it to avoid confusion.

Signed-off-by: Jiang Biao 
---
 arch/x86/include/asm/nospec-branch.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h 
b/arch/x86/include/asm/nospec-branch.h
index f6f6c63..c99082e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -214,7 +214,6 @@ enum spectre_v2_mitigation {
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
-   SPECTRE_V2_IBRS,
 };
 
 /* The Speculative Store Bypass disable variants */
-- 
2.7.4



[PATCH 4/5] x86/pti: warn for unknown pti boot options

2018-07-15 Thread Jiang Biao
When an unknown pti boot option other than on/off/auto is used, auto
is selected silently, which is sometimes confusing. Add a warning for
unknown pti boot options, like we do in
spectre_v2_select_mitigation(), as illustrated below.
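
For illustration, the intended behaviour on the kernel command line
after this change (the bogus value is hypothetical; the message text
is the one added in the hunk below):

    pti=off    -> force disabled
    pti=on     -> force enabled
    pti=auto   -> automatic selection, as before
    pti=bogus  -> "unknown option (bogus). Switching to AUTO select"
                  is printed, then automatic selection is used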

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a76b2cc..a368656 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -92,14 +92,14 @@ void __init pti_check_boottime_disable(void)
pti_mode = PTI_FORCE_OFF;
pti_print_if_insecure("disabled on command line.");
return;
-   }
-   if (ret == 2 && !strncmp(arg, "on", 2)) {
+   } else if (ret == 2 && !strncmp(arg, "on", 2)) {
pti_mode = PTI_FORCE_ON;
pti_print_if_secure("force enabled on command line.");
goto enable;
-   }
-   if (ret == 4 && !strncmp(arg, "auto", 4)) {
-   pti_mode = PTI_AUTO;
+   } else if (ret == 4 && !strncmp(arg, "auto", 4)) {
+   goto autosel;
+   } else {
+   pr_err("unknown option (%s). Switching to AUTO 
select\n", arg);
goto autosel;
}
}
-- 
2.7.4



[PATCH 5/5] x86/pti: constify address parameters

2018-07-15 Thread Jiang Biao
Addresses passed to pti_user_pagetable_walk_*() are not supposed to
change at runtime; make them const to avoid future slipups.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a368656..ac2cbdd 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -164,7 +164,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
  *
  * Returns a pointer to a P4D on success, or NULL on failure.
  */
-static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+static p4d_t *pti_user_pagetable_walk_p4d(const unsigned long address)
 {
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
@@ -192,7 +192,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
  *
  * Returns a pointer to a PMD on success, or NULL on failure.
  */
-static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+static pmd_t *pti_user_pagetable_walk_pmd(const unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pud_t *pud;
@@ -236,7 +236,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
  *
  * Returns a pointer to a PTE on success, or NULL on failure.
  */
-static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static __init pte_t *pti_user_pagetable_walk_pte(const unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pte_t *pte;
-- 
2.7.4



[PATCH 2/5] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-15 Thread Jiang Biao
pti_user_pagetable_walk_pmd() may return NULL; we should check the
return value in pti_user_pagetable_walk_pte() to avoid a NULL pointer
dereference, like it is checked in pti_clone_pmds().

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index be9e5bc..bb6f608 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -239,8 +239,10 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
+   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   if (WARN_ON(!pmd))
+   return NULL;
 
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
-- 
2.7.4



[PATCH 1/5] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-15 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL; we should check the
return value to avoid a NULL pointer dereference.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..be9e5bc 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -195,8 +195,10 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
+   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   if (WARN_ON(!p4d))
+   return NULL;
 
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
@@ -354,6 +356,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (WARN_ON(!user_p4d))
+   return;
+   
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
-- 
2.7.4



[PATCH 3/5] x86/pti: make pti_set_kernel_image_nonglobal static

2018-07-15 Thread Jiang Biao
pti_set_kernel_image_nonglobal() is only used in pti.c; make it
static.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bb6f608..a76b2cc 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -473,7 +473,7 @@ void pti_clone_kernel_text(void)
  * the other set_memory.h functions.  Just extern it.
  */
 extern int set_memory_nonglobal(unsigned long addr, int numpages);
-void pti_set_kernel_image_nonglobal(void)
+static void pti_set_kernel_image_nonglobal(void)
 {
/*
 * The identity map is created with PMDs, regardless of the
-- 
2.7.4



[PATCH v2 2/2] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-17 Thread Jiang Biao
Check the return value of pti_user_pagetable_walk_pmd() to avoid a
NULL pointer dereference, and add a warning for the failed allocation.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 7c402e9..3dba6a7 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -203,7 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
-   if (!new_pud_page)
+   if (WARN_ON(!new_pud_page))
return NULL;
 
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
-   if (!new_pmd_page)
+   if (WARN_ON(!new_pmd_page))
return NULL;
 
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -239,8 +239,10 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
+   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   if (!pmd)
+   return NULL;
 
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
-- 
2.7.4



[PATCH v2 1/2] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-17 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL; we should check the
return value to avoid a NULL pointer dereference, and add a warning
for the failed allocation where NULL is returned.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..7c402e9 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON(!new_p4d_page))
return NULL;
 
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,8 +195,10 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
+   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
 
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
@@ -354,6 +356,8 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return;
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
-- 
2.7.4



[PATCH 0/5] x86/pti: small fixes and cleanups for pti

2018-07-15 Thread Jiang Biao
This is a short series of fixes and cleanups found while reading the
code; there are no functional changes.

Jiang Biao (5):
  x86/pti: check the return value of pti_user_pagetable_walk_p4d
  x86/pti: check the return value of pti_user_pagetable_walk_pmd
  x86/pti: make pti_set_kernel_image_nonglobal static
  x86/pti: warn for unknown pti boot options
  x86/pti: constify address parameters

 arch/x86/mm/pti.c | 29 ++---
 1 file changed, 18 insertions(+), 11 deletions(-)

-- 
2.7.4



[PATCH] blkcg: do not hold blkcg lock when deactivating policy.

2018-04-17 Thread Jiang Biao
As described in the comment of blkcg_activate_policy(),
*Update of each blkg is protected by both queue and blkcg locks so
that holding either lock and testing blkcg_policy_enabled() is
always enough for dereferencing policy data.*
With the queue lock held, there is therefore no need to also hold the
blkcg lock in blkcg_deactivate_policy(). A similar case is
blkcg_activate_policy(), which stopped holding the blkcg lock in
commit 4c55f4f9ad3001ac1fefdd8d8ca7641d18558e23.

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
Signed-off-by: Wen Yang <wen.yan...@zte.com.cn>
CC: Tejun Heo <t...@kernel.org>
CC: Jens Axboe <ax...@kernel.dk>
---
 block/blk-cgroup.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index c2033a2..2b7f8d0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1367,17 +1367,12 @@ void blkcg_deactivate_policy(struct request_queue *q,
__clear_bit(pol->plid, q->blkcg_pols);
 
list_for_each_entry(blkg, &q->blkg_list, q_node) {
-   /* grab blkcg lock too while removing @pd from @blkg */
-   spin_lock(&blkg->blkcg->lock);
-
if (blkg->pd[pol->plid]) {
if (pol->pd_offline_fn)
pol->pd_offline_fn(blkg->pd[pol->plid]);
pol->pd_free_fn(blkg->pd[pol->plid]);
blkg->pd[pol->plid] = NULL;
}
-
-   spin_unlock(&blkg->blkcg->lock);
}
 
spin_unlock_irq(q->queue_lock);
-- 
2.7.4



[PATCH 1/2] blkcg: small fix on comment in blkcg_init_queue

2018-04-18 Thread Jiang Biao
The comment before blkg_create() in blkcg_init_queue() was moved
from blkcg_activate_policy() by commit ec13b1d6f0a0457312e615, but
it no longer suits the new context.

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
Signed-off-by: Wen Yang <wen.yan...@zte.com.cn>
CC: Tejun Heo <t...@kernel.org>
CC: Jens Axboe <ax...@kernel.dk>
---
 block/blk-cgroup.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 2b7f8d0..07e3359 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1134,11 +1134,7 @@ int blkcg_init_queue(struct request_queue *q)
 
preloaded = !radix_tree_preload(GFP_KERNEL);
 
-   /*
-* Make sure the root blkg exists and count the existing blkgs.  As
-* @q is bypassing at this point, blkg_lookup_create() can't be
-* used.  Open code insertion.
-*/
+   /* Make sure the root blkg exists. */
rcu_read_lock();
spin_lock_irq(q->queue_lock);
blkg = blkg_create(&blkcg_root, q, new_blkg);
-- 
2.7.4



[PATCH 2/2] blkcg: init root blkcg_gq under lock

2018-04-18 Thread Jiang Biao
The initialization of q->root_blkg is currently done outside of the
queue lock and RCU, so the blkg may be destroyed before the
initialization, which may cause dangling/NULL references. On the
other hand, the destruction of a blkg is protected by the queue lock
or RCU. Put the initialization inside the queue lock and RCU read
section to make it safer, as sketched below.
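
A rough sketch of the window being closed (simplified; assumes a
concurrent destroyer such as blkg_destroy_all() running on another
CPU):

    CPU A: blkcg_init_queue()                CPU B
    -------------------------------------    --------------------------
    blkg = blkg_create(&blkcg_root, q, ...);
    spin_unlock_irq(q->queue_lock);
    rcu_read_unlock();
                                             /* takes queue_lock and
                                              * destroys the blkg just
                                              * created */
    q->root_blkg = blkg;     /* dangling */
    q->root_rl.blkg = blkg;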

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
Signed-off-by: Wen Yang <wen.yan...@zte.com.cn>
CC: Tejun Heo <t...@kernel.org>
CC: Jens Axboe <ax...@kernel.dk>
---
 block/blk-cgroup.c | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 07e3359..ec86837 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1138,18 +1138,16 @@ int blkcg_init_queue(struct request_queue *q)
rcu_read_lock();
spin_lock_irq(q->queue_lock);
blkg = blkg_create(&blkcg_root, q, new_blkg);
+   if (IS_ERR(blkg))
+   goto err_unlock;
+   q->root_blkg = blkg;
+   q->root_rl.blkg = blkg;
spin_unlock_irq(q->queue_lock);
rcu_read_unlock();
 
if (preloaded)
radix_tree_preload_end();
 
-   if (IS_ERR(blkg))
-   return PTR_ERR(blkg);
-
-   q->root_blkg = blkg;
-   q->root_rl.blkg = blkg;
-
ret = blk_throtl_init(q);
if (ret) {
spin_lock_irq(q->queue_lock);
@@ -1157,6 +1155,13 @@ int blkcg_init_queue(struct request_queue *q)
spin_unlock_irq(q->queue_lock);
}
return ret;
+
+err_unlock:
+   spin_unlock_irq(q->queue_lock);
+   rcu_read_unlock();
+   if (preloaded)
+   radix_tree_preload_end();
+   return PTR_ERR(blkg);
 }
 
 /**
-- 
2.7.4



[PATCH] fs/ubifs: constify struct ubifs_lprops in scan_for_leb_for_idx

2018-02-27 Thread Jiang Biao
Constify struct ubifs_lprops in scan_for_leb_for_idx to be
consistent with other references.

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 fs/ubifs/find.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 2dcf3d4..9571616 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -632,7 +632,7 @@ static int scan_for_idx_cb(struct ubifs_info *c,
  */
 static const struct ubifs_lprops *scan_for_leb_for_idx(struct ubifs_info *c)
 {
-   struct ubifs_lprops *lprops;
+   const struct ubifs_lprops *lprops;
struct scan_data data;
int err;
 
-- 
2.7.4



[PATCH] fs/ubifs: remove useless parameter of lpt_heap_replace

2018-02-27 Thread Jiang Biao
The parameter *old_lprops* is never used in lpt_heap_replace();
remove it to avoid a compile warning.

Signed-off-by: Jiang Biao <jiang.bi...@zte.com.cn>
---
 fs/ubifs/lprops.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index 6c3a1ab..f5a4684 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -244,7 +244,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c,
 /**
  * lpt_heap_replace - replace lprops in a category heap.
  * @c: UBIFS file-system description object
- * @old_lprops: LEB properties to replace
  * @new_lprops: LEB properties with which to replace
  * @cat: LEB category
  *
@@ -254,7 +253,6 @@ static void remove_from_lpt_heap(struct ubifs_info *c,
  * lprops.  This function does that.
  */
 static void lpt_heap_replace(struct ubifs_info *c,
-struct ubifs_lprops *old_lprops,
 struct ubifs_lprops *new_lprops, int cat)
 {
struct ubifs_lpt_heap *heap;
@@ -362,7 +360,7 @@ void ubifs_replace_cat(struct ubifs_info *c, struct 
ubifs_lprops *old_lprops,
case LPROPS_DIRTY:
case LPROPS_DIRTY_IDX:
case LPROPS_FREE:
-   lpt_heap_replace(c, old_lprops, new_lprops, cat);
+   lpt_heap_replace(c, new_lprops, cat);
break;
case LPROPS_UNCAT:
case LPROPS_EMPTY:
-- 
2.7.4



[PATCH] sched/fair: Trivial fix for comment of check_preempt_tick

2021-01-16 Thread Jiang Biao
From: Jiang Biao 

check_preempt_tick() is not just about newly woken task preemption;
that is what check_preempt_wakeup() is for.

The current comment of check_preempt_tick() is therefore a little
confusing.

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04a3ce20da67..c0374c1152e0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4360,7 +4360,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int flags)
 }
 
 /*
- * Preempt the current task with a newly woken task if needed:
+ * Preempt the current task with the leftmost task if needed:
  */
 static void
 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
-- 
2.21.0



Re: [PATCH] sched/fair: add protection for delta of wait time

2021-01-18 Thread Jiang Biao
Hi, Vincent

On Mon, 18 Jan 2021 at 23:32, Vincent Guittot
 wrote:
>
> On Mon, 18 Jan 2021 at 15:11, Jiang Biao  wrote:
> >
> > Hi, Vincent
> >
> > On Mon, 18 Jan 2021 at 15:56, Vincent Guittot
> >  wrote:
> > >
> > > On Sun, 17 Jan 2021 at 13:31, Jiang Biao  wrote:
> > > >
> > > > From: Jiang Biao 
> > > >
> > > > delta in update_stats_wait_end() might be negative, which would
> > > > make following statistics go wrong.
> > >
> > > Could you describe the use case that generates a negative delta ?
> > >
> > > rq_clock is always increasing so this should not lead to a negative
> > > value even if update_stats_wait_end/start are not called in the right
> > > order,
> > Yes, indeed.
> >
> > > This situation could happen after a migration if we forgot to call
> > > update_stats_wait_start
> > The migration case was what I worried about, but no regular use case
> > comes into my mind. :)
>
> IIUC, you haven't faced the problem and it's only based on studying the code.
Not yet. :) I just found that there are protections for
sleep_time/block_time, but no protection for wait_time.

Thinking about it more, sleep_time/block_time do need to be protected
for the migration case, because update_stats_enqueue_sleeper() could
be called right after migration with the src cpu's
sleep_start/block_start. But wait_time is not that case.

The following case might be too extreme to actually happen. :)

Thanks a lot for your patience.

Regards,
Jiang

>
> > As an extreme case, would it be a problem if we disable/re-enable
> > sched_schedstats during migration?
> >
> > static inline void
> > update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > u64 wait_start, prev_wait_start;
> >
> > if (!schedstat_enabled()) // disable during migration
> > return; // return here, and skip updating wait_start
> > ...
> > }
> >
> > static inline void
> > update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
> > {
> > struct task_struct *p;
> > u64 delta;
> >
> > if (!schedstat_enabled())  // re-enable again
> > return;
> >
> > /*
> >  * When the sched_schedstat changes from 0 to 1, some sched se
> >  * maybe already in the runqueue, the se->statistics.wait_start
> >  * will be 0.So it will let the delta wrong. We need to avoid this
> >  * scenario.
> >  */
> > if (unlikely(!schedstat_val(se->statistics.wait_start)))
> > return;
> >  //stale wait_start which might be bigger than rq_clock would
> > be used. -)
> > delta = rq_clock(rq_of(cfs_rq)) -
> > schedstat_val(se->statistics.wait_start);
> > ...
> >
> > Thanks a lot.
> > Regards,
> > Jiang


Re: [PATCH] sched/fair: add protection for delta of wait time

2021-01-18 Thread Jiang Biao
Hi, Vincent

On Mon, 18 Jan 2021 at 15:56, Vincent Guittot
 wrote:
>
> On Sun, 17 Jan 2021 at 13:31, Jiang Biao  wrote:
> >
> > From: Jiang Biao 
> >
> > delta in update_stats_wait_end() might be negative, which would
> > make following statistics go wrong.
>
> Could you describe the use case that generates a negative delta ?
>
> rq_clock is always increasing so this should not lead to a negative
> value even if update_stats_wait_end/start are not called in the right
> order,
Yes, indeed.

> This situation could happen after a migration if we forgot to call
> update_stats_wait_start
The migration case was what I worried about, but no regular use case
comes into my mind. :)
As an extreme case, would it be a problem if we disable/re-enable
sched_schedstats during migration?

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
u64 wait_start, prev_wait_start;

if (!schedstat_enabled()) // disable during migration
return; // return here, and skip updating wait_start
...
}

static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta;

if (!schedstat_enabled())  // re-enable again
return;

/*
 * When the sched_schedstat changes from 0 to 1, some sched se
 * maybe already in the runqueue, the se->statistics.wait_start
 * will be 0.So it will let the delta wrong. We need to avoid this
 * scenario.
 */
if (unlikely(!schedstat_val(se->statistics.wait_start)))
return;
 // a stale wait_start, which might be bigger than rq_clock,
// would be used :-)
delta = rq_clock(rq_of(cfs_rq)) -
schedstat_val(se->statistics.wait_start);
...

Thanks a lot.
Regards,
Jiang

}
>
> >
> > Add protection for delta of wait time, like what have been done in
> > update_stats_enqueue_sleeper() for deltas of sleep/block time.
> >
> > Signed-off-by: Jiang Biao 
> > ---
> >  kernel/sched/fair.c | 3 +++
> >  1 file changed, 3 insertions(+)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index c0374c1152e0..ac950ac950bc 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -917,6 +917,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct 
> > sched_entity *se)
> >
> > delta = rq_clock(rq_of(cfs_rq)) - 
> > schedstat_val(se->statistics.wait_start);
> >
> > +   if ((s64)delta < 0)
> > +   delta = 0;
> > +
> > if (entity_is_task(se)) {
> > p = task_of(se);
> > if (task_on_rq_migrating(p)) {
> > --
> > 2.21.0
> >


[PATCH] Documentation: Fix typos found in cgroup-v2.rst

2021-01-07 Thread Jiang Biao
From: Jiang Biao 

Fix typos found in Documentation/admin-guide/cgroup-v2.rst.

Signed-off-by: Jiang Biao 
---
 Documentation/admin-guide/cgroup-v2.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst 
b/Documentation/admin-guide/cgroup-v2.rst
index 63521cd36ce5..e0f6ff7cfa93 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1558,7 +1558,7 @@ IO Interface Files
  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252 
dbytes=50331648 dios=3021
 
   io.cost.qos
-   A read-write nested-keyed file with exists only on the root
+   A read-write nested-keyed file which exists only on the root
cgroup.
 
This file configures the Quality of Service of the IO cost
@@ -1613,7 +1613,7 @@ IO Interface Files
automatic mode can be restored by setting "ctrl" to "auto".
 
   io.cost.model
-   A read-write nested-keyed file with exists only on the root
+   A read-write nested-keyed file which exists only on the root
cgroup.
 
This file configures the cost model of the IO cost model based
-- 
2.21.0



[PATCH] sched/fair: add protection for delta of wait time

2021-01-17 Thread Jiang Biao
From: Jiang Biao 

delta in update_stats_wait_end() might be negative, which would
make the following statistics go wrong.

Add protection for the delta of the wait time, like what has been
done in update_stats_enqueue_sleeper() for the deltas of the
sleep/block time.

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c0374c1152e0..ac950ac950bc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -917,6 +917,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct 
sched_entity *se)
 
delta = rq_clock(rq_of(cfs_rq)) - 
schedstat_val(se->statistics.wait_start);
 
+   if ((s64)delta < 0)
+   delta = 0;
+
if (entity_is_task(se)) {
p = task_of(se);
if (task_on_rq_migrating(p)) {
-- 
2.21.0



[PATCH RFC] sched/fair: simplify the work when reweighting entity

2020-08-04 Thread Jiang Biao
If a se is on_rq when reweighting the entity, all we need to do is
update the load of the cfs_rq; the other dequeue/enqueue work is
redundant, such as,
* account_numa_dequeue/account_numa_enqueue
* list_del/list_add from/into cfs_tasks
* nr_running--/nr_running++

Just simplify the work; this could be helpful for the hot path. (See
the abridged sketch of the full accounting below.)
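
For context, an abridged sketch of what the full enqueue-side
accounting does (assumption: based on account_entity_enqueue() in
this kernel, trimmed for illustration):

    static void account_entity_enqueue(struct cfs_rq *cfs_rq,
                                       struct sched_entity *se)
    {
            /* the only part needed by reweight_entity() */
            update_load_add(&cfs_rq->load, se->load.weight);

            if (entity_is_task(se)) {   /* never true here: the task */
                    account_numa_enqueue(rq_of(cfs_rq), task_of(se));
                    list_add(&se->group_node,
                             &rq_of(cfs_rq)->cfs_tasks);
            }                           /* is dequeued beforehand    */

            cfs_rq->nr_running++;       /* redundant, paired with the
                                         * dequeue side              */
    }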

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..18a8fc7bd0de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3086,7 +3086,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se,
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
-   account_entity_dequeue(cfs_rq, se);
+   update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
 
@@ -3102,7 +3102,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se,
 
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
-   account_entity_enqueue(cfs_rq, se);
+   update_load_add(&cfs_rq->load, se->load.weight);
 
 }
 
-- 
2.21.0



[PATCH] sched/fair: Optimize dequeue_task_fair()

2020-08-11 Thread Jiang Biao
Similar optimization as what has been done in commit,
7d148be69e3a(sched/fair: Optimize enqueue_task_fair())

dequeue_task_fair() jumps to the dequeue_throttle label when
cfs_rq_of(se) is throttled, which means that se can't be NULL there.
We can move the label after the if (!se) statement and remove the
if (!se) test, as se is always NULL when reaching this point on the
fall-through path.

Besides, keeping the same pattern as enqueue_task_fair() makes it
more readable.

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..cbbeafdfa8b7 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5618,10 +5618,10 @@ static void dequeue_task_fair(struct rq *rq, struct 
task_struct *p, int flags)
 
}
 
-dequeue_throttle:
-   if (!se)
-   sub_nr_running(rq, 1);
+   /* At this point se is NULL and we are at root level*/
+   sub_nr_running(rq, 1);
 
+dequeue_throttle:
/* balance early to pull high priority tasks */
if (unlikely(!was_sched_idle && sched_idle_rq(rq)))
rq->next_balance = jiffies;
-- 
2.21.0



[PATCH v3] sched/fair: simplify the work when reweighting entity

2020-08-11 Thread Jiang Biao
From: Jiang Biao 

The code in reweight_entity() can be simplified.

For a sched entity on the rq, the entity accounting can be replaced by
cfs_rq instantaneous load updates currently called from within the
entity accounting.

Even though an entity on the rq can't represent a task in
reweight_entity() (a task is always dequeued before calling this
function) and so the numa task accounting and the rq->cfs_tasks list
management of the entity accounting are never called, the redundant
cfs_rq->nr_running decrement/increment will be avoided.

Signed-off-by: Jiang Biao 
---
v3<-v2: Amend commit log taking Dietmar's advice. Thx.
v2<-v1: Amend the commit log

 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..18a8fc7bd0de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3086,7 +3086,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se,
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
-   account_entity_dequeue(cfs_rq, se);
+   update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
 
@@ -3102,7 +3102,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se,
 
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
-   account_entity_enqueue(cfs_rq, se);
+   update_load_add(&cfs_rq->load, se->load.weight);
 
 }
 
-- 
2.21.0



Re: [PATCH] driver/pci: reduce the single block time in pci_read_config

2020-09-16 Thread Jiang Biao
Hi,

On Thu, 17 Sep 2020 at 00:56, Bjorn Helgaas  wrote:
>
> On Sun, Sep 13, 2020 at 12:27:09PM +0800, Jiang Biao wrote:
> > On Thu, 10 Sep 2020 at 09:59, Bjorn Helgaas  wrote:
> > > On Thu, Sep 10, 2020 at 09:54:02AM +0800, Jiang Biao wrote:
> > > > On Thu, 10 Sep 2020 at 09:25, Bjorn Helgaas  wrote:
> > > > > On Mon, Aug 24, 2020 at 01:20:25PM +0800, Jiang Biao wrote:
> > > > > > From: Jiang Biao 
> > > > > >
> > > > > > pci_read_config() could block several ms in kernel space, mainly
> > > > > > caused by the while loop to call pci_user_read_config_dword().
> > > > > > Singel pci_user_read_config_dword() loop could consume 130us+,
> > > > > >   |pci_user_read_config_dword() {
> > > > > >   |  _raw_spin_lock_irq() {
> > > > > > ! 136.698 us  |native_queued_spin_lock_slowpath();
> > > > > > ! 137.582 us  |  }
> > > > > >   |  pci_read() {
> > > > > >   |raw_pci_read() {
> > > > > >   |  pci_conf1_read() {
> > > > > >   0.230 us|_raw_spin_lock_irqsave();
> > > > > >   0.035 us|_raw_spin_unlock_irqrestore();
> > > > > >   8.476 us|  }
> > > > > >   8.790 us|}
> > > > > >   9.091 us|  }
> > > > > > ! 147.263 us  |}
> > > > > > and dozens of the loop could consume ms+.
> > > > > >
> > > > > > If we execute some lspci commands concurrently, ms+ scheduling
> > > > > > latency could be detected.
> > > > > >
> > > > > > Add scheduling chance in the loop to improve the latency.
> > > > >
> > > > > Thanks for the patch, this makes a lot of sense.
> > > > >
> > > > > Shouldn't we do the same in pci_write_config()?
> > > >
> > > > Yes, IMHO, that could be helpful too.
> > >
> > > If it's feasible, it would be nice to actually verify that it makes a
> > > difference.  I know config writes should be faster than reads, but
> > > they're certainly not as fast as a CPU can pump out data, so there
> > > must be *some* mechanism that slows the CPU down.
> > >
> > We failed to build a test case to produce the latency by setpci command,
> > AFAIU, setpci could be much less frequently realistically used than lspci.
> > So, the latency from pci_write_config() path could not be verified for now,
> > could we apply this patch alone to erase the verified latency introduced
> > by pci_read_config() path? :)
>
> Thanks for trying!  I'll apply the patch as-is.  I'd like to include a
Thanks. :)

> note in the commit log about the user-visible effect of this.  I
> looked through recent similar commits:
>
>   928da37a229f ("RDMA/umem: Add a schedule point in ib_umem_get()")
>   47aaabdedf36 ("fanotify: Avoid softlockups when reading many events")
>   9f47eb5461aa ("fs/btrfs: Add cond_resched() for 
> try_release_extent_mapping() stalls")
>   0a3b3c253a1e ("mm/mmap.c: Add cond_resched() for exit_mmap() CPU stalls")
>   b7e3debdd040 ("mm/memory_hotplug.c: fix false softlockup during pfn range 
> removal")
>   d35bd764e689 ("dm writecache: add cond_resched to loop in 
> persistent_memory_claim()")
>   da97f2d56bbd ("mm: call cond_resched() from deferred_init_memmap()")
>   ab8b65be1831 ("KVM: PPC: Book3S: Fix some RCU-list locks")
>   48c963e31bc6 ("KVM: arm/arm64: Release kvm->mmu_lock in loop to prevent 
> starvation")
>   e84fe99b68ce ("mm/page_alloc: fix watchdog soft lockups during 
> set_zone_contiguous()")
>   4005f5c3c9d0 ("wireguard: send/receive: cond_resched() when processing 
> worker ringbuffers")
>   3fd44c86711f ("io_uring: use cond_resched() in io_ring_ctx_wait_and_kill()")
>   7979457b1d3a ("net: bridge: vlan: Add a schedule point during VLAN 
> processing")
>   2ed6edd33a21 ("perf: Add cond_resched() to task_function_call()")
>   1edaa447d958 ("dm writecache: add cond_resched to avoid CPU hangs")
>   ce9a4186f9ac ("macvlan: add cond_resched() during multicast processing")
>   7be1b9b8e9d1 ("drm/mm: Break long searches in fragmented address spaces")
>   bb699a793110 ("drm/i915/gem: Break up long lists of object reclaim")
>   9424ef56e13a ("ext4: add cond_resched() to __ext4_find_entry()")
>
> and many of them mention softlockups, RCU CPU stall warnings, or
> watchdogs triggering.  Are you seeing one of those, or are you
No softlockup or RCU stall warnings.

> measuring latency some other way?
We tested the scheduling latency with the cyclictest benchmark; the max
latency could be more than 5ms without this patch. The ftrace log
shows pci_read_config() is the main cause, and the 5ms+ latency
disappeared with this patch applied.

Thanks a lot.
Regards,
Jiang


Re: [PATCH] driver/pci: reduce the single block time in pci_read_config

2020-08-27 Thread Jiang Biao
kindly ping :)

On Mon, 24 Aug 2020 at 13:20, Jiang Biao  wrote:
>
> From: Jiang Biao 
>
> pci_read_config() could block several ms in kernel space, mainly
> caused by the while loop to call pci_user_read_config_dword().
> Singel pci_user_read_config_dword() loop could consume 130us+,
>   |pci_user_read_config_dword() {
>   |  _raw_spin_lock_irq() {
> ! 136.698 us  |native_queued_spin_lock_slowpath();
> ! 137.582 us  |  }
>   |  pci_read() {
>   |raw_pci_read() {
>   |  pci_conf1_read() {
>   0.230 us|_raw_spin_lock_irqsave();
>   0.035 us|_raw_spin_unlock_irqrestore();
>   8.476 us|  }
>   8.790 us|}
>   9.091 us|  }
> ! 147.263 us  |}
> and dozens of the loop could consume ms+.
>
> If we execute some lspci commands concurrently, ms+ scheduling
> latency could be detected.
>
> Add scheduling chance in the loop to improve the latency.
>
> Reported-by: Bin Lai 
> Signed-off-by: Jiang Biao 
> ---
>  drivers/pci/pci-sysfs.c | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
> index 6d78df9..3b9f63d 100644
> --- a/drivers/pci/pci-sysfs.c
> +++ b/drivers/pci/pci-sysfs.c
> @@ -708,6 +708,7 @@ static ssize_t pci_read_config(struct file *filp, struct 
> kobject *kobj,
> data[off - init_off + 3] = (val >> 24) & 0xff;
> off += 4;
> size -= 4;
> +   cond_resched();
> }
>
> if (size >= 2) {
> --
> 1.8.3.1
>


Re: [PATCH] driver/pci: reduce the single block time in pci_read_config

2020-09-09 Thread Jiang Biao
Hi,

On Thu, 10 Sep 2020 at 09:25, Bjorn Helgaas  wrote:
>
> On Mon, Aug 24, 2020 at 01:20:25PM +0800, Jiang Biao wrote:
> > From: Jiang Biao 
> >
> > pci_read_config() could block several ms in kernel space, mainly
> > caused by the while loop to call pci_user_read_config_dword().
> > Singel pci_user_read_config_dword() loop could consume 130us+,
> >   |pci_user_read_config_dword() {
> >   |  _raw_spin_lock_irq() {
> > ! 136.698 us  |native_queued_spin_lock_slowpath();
> > ! 137.582 us  |  }
> >   |  pci_read() {
> >   |raw_pci_read() {
> >   |  pci_conf1_read() {
> >   0.230 us|_raw_spin_lock_irqsave();
> >   0.035 us|_raw_spin_unlock_irqrestore();
> >   8.476 us|  }
> >   8.790 us|}
> >   9.091 us|  }
> > ! 147.263 us  |}
> > and dozens of the loop could consume ms+.
> >
> > If we execute some lspci commands concurrently, ms+ scheduling
> > latency could be detected.
> >
> > Add scheduling chance in the loop to improve the latency.
>
> Thanks for the patch, this makes a lot of sense.
>
> Shouldn't we do the same in pci_write_config()?
Yes, IMHO, that could be helpful too.
I'll send v2 to include that. :)
Thanks a lot for your comment.

Regards,
Jiang


Re: [PATCH] driver/pci: reduce the single block time in pci_read_config

2020-09-09 Thread Jiang Biao
Hi,

On Thu, 10 Sep 2020 at 09:59, Bjorn Helgaas  wrote:
>
> On Thu, Sep 10, 2020 at 09:54:02AM +0800, Jiang Biao wrote:
> > Hi,
> >
> > On Thu, 10 Sep 2020 at 09:25, Bjorn Helgaas  wrote:
> > >
> > > On Mon, Aug 24, 2020 at 01:20:25PM +0800, Jiang Biao wrote:
> > > > From: Jiang Biao 
> > > >
> > > > pci_read_config() could block several ms in kernel space, mainly
> > > > caused by the while loop to call pci_user_read_config_dword().
> > > > Singel pci_user_read_config_dword() loop could consume 130us+,
> > > >   |pci_user_read_config_dword() {
> > > >   |  _raw_spin_lock_irq() {
> > > > ! 136.698 us  |native_queued_spin_lock_slowpath();
> > > > ! 137.582 us  |  }
> > > >   |  pci_read() {
> > > >   |raw_pci_read() {
> > > >   |  pci_conf1_read() {
> > > >   0.230 us|_raw_spin_lock_irqsave();
> > > >   0.035 us|_raw_spin_unlock_irqrestore();
> > > >   8.476 us|  }
> > > >   8.790 us|}
> > > >   9.091 us|  }
> > > > ! 147.263 us  |}
> > > > and dozens of the loop could consume ms+.
> > > >
> > > > If we execute some lspci commands concurrently, ms+ scheduling
> > > > latency could be detected.
> > > >
> > > > Add scheduling chance in the loop to improve the latency.
> > >
> > > Thanks for the patch, this makes a lot of sense.
> > >
> > > Shouldn't we do the same in pci_write_config()?
> > Yes, IMHO, that could be helpful too.
>
> If it's feasible, it would be nice to actually verify that it makes a
> difference.  I know config writes should be faster than reads, but
> they're certainly not as fast as a CPU can pump out data, so there
> must be *some* mechanism that slows the CPU down.
We did catch 5ms+ latency caused by pci_read_config() triggered by
concurrent lspci runs, and the latency disappeared after this patch.
For the pci_write_config() path, we have not actually met the case. :)
I'll try to verify that, and will send another patch if it's verified.

Thx.
Regards,
Jiang


Re: [PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-09-01 Thread Jiang Biao
Hi, Vincent

Sorry for the late reply.:)

On Fri, 28 Aug 2020 at 20:55, Vincent Guittot
 wrote:
>
> On Sun, 23 Aug 2020 at 09:33, Jiang Biao  wrote:
> >
> > Hi, Vincent and Peter
> >
> > On Thu, 20 Aug 2020 at 22:09, Vincent Guittot
> >  wrote:
> > >
> > > On Thu, 20 Aug 2020 at 15:44,  wrote:
> > > >
> > > > > That's been said, not compensating the vruntime for a sched_idle task
> > > > > makes sense for me. Even if that will only help for others task in the
> > > > > same cfs_rq
> > > >
> > > > Yeah, but it is worth the extra pointer chasing and branches?
> > >
> > > For that I let Jiang provides figures to show the worthful
> > Using the following configuration for rt-app,
> > {
> > "tasks" : {
> > "task_other" : {
> > "instance" : 1, //only 1 instance to be easy to 
> > observe
> > "cpus" : [2],
> > "loop" : 2000,
> > "policy" : "SCHED_OTHER",
> > "run" : -1,  //make normal task 100% running
> > "priority" : 0,
> > "sleep" : 0
> > },
> > "task_idle" : {
> > "instance" : 1,
> > "cpus" : [2],
> > "loop" : 2000,
> > "policy" : "SCHED_IDLE",
> > "run" : 1, //only run 1us to avoid
> > blocking(always waiting for running), making check_preempt_wakeup
> > work(S->R switching)
> > "timer" : { "ref" : "unique2" , "period" :
> > 16000, "mode" : "absolute" }
> > }
> > },
> > "global" : {
> > "calibration" : "CPU0",
> > "default_policy" : "SCHED_OTHER",
> > "duration" : -1
> > }
> > }
> > without the patch,
> >   <...>-39771 [002] d.h. 42478.11: sched_wakeup:
> > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> ><...>-39771 [002] d... 42478.190437: sched_switch:
> > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > next_comm=task_idle-1 next_pid=39772 next_prio=120
> ><...>-39771 [002] d.h. 42478.193771: sched_wakeup:
> > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> ><...>-39771 [002] d... 42478.206438: sched_switch:
> > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > next_comm=task_idle-1 next_pid=39772 next_prio=120
> ><...>-39771 [002] d.h. 42478.209771: sched_wakeup:
> > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> ><...>-39771 [002] d... 42478.222438: sched_switch:
> > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > next_comm=task_idle-1 next_pid=39772 next_prio=120
> ><...>-39771 [002] d.h. 42478.225771: sched_wakeup:
> > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> ><...>-39771 [002] d... 42478.238438: sched_switch:
> > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > next_comm=task_idle-1 next_pid=39772 next_prio=120
> > *task_idle*  preempts every 12ms because of the compensation.
> >
> > with the patch,
> >task_other-0-27670 [002] d.h. 136785.278059: sched_wakeup:
> > comm=task_idle-1 pid=27671 prio=120 target_cpu=002
> > task_other-0-27670 [002] d... 136785.293623: sched_switch:
> > prev_comm=task_other-0 prev_pid=27670 prev_prio=120 prev_state=R ==>
> > next_comm=task_idle-1 next_pid=27671 next_prio=120
> > task_other-0-27670 [002] d.h. 136785.294059: sched_wakeup:
> > comm=task_idle-1 pid=27671 prio=120 target_cpu=002
> > task_other-0-27670 [002] d... 136785.317624: sched_switch:
> > prev_comm=task_other-0 prev_pid=27670 prev_prio=120 prev_state=R ==>
> > next_comm=task_idle-1 next_pid=27671 next_prio=120
> > task_other-0-27670 [002] d.h. 136785.326059: sched_wakeup:
> > comm=task_idle-1 pid=27671 prio=120 target_cpu=002
> > task_other-0-27670 [002] d... 136785.341622: sched_switch:
> > prev_comm=task_

Re: [PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-09-01 Thread Jiang Biao
Hi,

On Tue, 1 Sep 2020 at 21:04, Vincent Guittot  wrote:
>
> On Tue, 1 Sep 2020 at 12:15, Jiang Biao  wrote:
> >
> > Hi, Vincent
> >
> > Sorry for the late reply.:)
> >
> > On Fri, 28 Aug 2020 at 20:55, Vincent Guittot
> >  wrote:
> > >
> > > On Sun, 23 Aug 2020 at 09:33, Jiang Biao  wrote:
> > > >
> > > > Hi, Vincent and Peter
> > > >
> > > > On Thu, 20 Aug 2020 at 22:09, Vincent Guittot
> > > >  wrote:
> > > > >
> > > > > On Thu, 20 Aug 2020 at 15:44,  wrote:
> > > > > >
> > > > > > > That's been said, not compensating the vruntime for a sched_idle 
> > > > > > > task
> > > > > > > makes sense for me. Even if that will only help for others task 
> > > > > > > in the
> > > > > > > same cfs_rq
> > > > > >
> > > > > > Yeah, but it is worth the extra pointer chasing and branches?
> > > > >
> > > > > For that I let Jiang provides figures to show the worthful
> > > > Using the following configuration for rt-app,
> > > > {
> > > > "tasks" : {
> > > > "task_other" : {
> > > > "instance" : 1, //only 1 instance to be easy to 
> > > > observe
> > > > "cpus" : [2],
> > > > "loop" : 2000,
> > > > "policy" : "SCHED_OTHER",
> > > > "run" : -1,  //make normal task 100% running
> > > > "priority" : 0,
> > > > "sleep" : 0
> > > > },
> > > > "task_idle" : {
> > > > "instance" : 1,
> > > > "cpus" : [2],
> > > > "loop" : 2000,
> > > > "policy" : "SCHED_IDLE",
> > > > "run" : 1, //only run 1us to avoid
> > > > blocking(always waiting for running), making check_preempt_wakeup
> > > > work(S->R switching)
> > > > "timer" : { "ref" : "unique2" , "period" :
> > > > 16000, "mode" : "absolute" }
> > > > }
> > > > },
> > > > "global" : {
> > > > "calibration" : "CPU0",
> > > > "default_policy" : "SCHED_OTHER",
> > > > "duration" : -1
> > > > }
> > > > }
> > > > without the patch,
> > > >   <...>-39771 [002] d.h. 42478.11: sched_wakeup:
> > > > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> > > ><...>-39771 [002] d... 42478.190437: sched_switch:
> > > > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > > > next_comm=task_idle-1 next_pid=39772 next_prio=120
> > > ><...>-39771 [002] d.h. 42478.193771: sched_wakeup:
> > > > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> > > ><...>-39771 [002] d... 42478.206438: sched_switch:
> > > > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > > > next_comm=task_idle-1 next_pid=39772 next_prio=120
> > > ><...>-39771 [002] d.h. 42478.209771: sched_wakeup:
> > > > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> > > ><...>-39771 [002] d... 42478.222438: sched_switch:
> > > > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > > > next_comm=task_idle-1 next_pid=39772 next_prio=120
> > > ><...>-39771 [002] d.h. 42478.225771: sched_wakeup:
> > > > comm=task_idle-1 pid=39772 prio=120 target_cpu=002
> > > ><...>-39771 [002] d... 42478.238438: sched_switch:
> > > > prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
> > > > next_comm=task_idle-1 next_pid=39772 next_prio=120
> > > > *task_idle*  preempts every 12ms because of the compensation.
> > >

[PATCH] sched/fair: reduce preemption with IDLE tasks runnable

2020-07-31 Thread Jiang Biao
From: Jiang Biao 

No need to preempt when there is only one runnable CFS task with
other IDLE tasks on the runqueue. That only CFS task would always
be picked in that case.

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..8fb80636b010 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4527,7 +4527,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity 
*curr, int queued)
return;
 #endif
 
-   if (cfs_rq->nr_running > 1)
+   if (cfs_rq->nr_running > cfs_rq->idle_h_nr_running + 1)
check_preempt_tick(cfs_rq, curr);
 }
 
-- 
2.21.0



Re: [PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-08-23 Thread Jiang Biao
Hi, Vincent and Peter

On Thu, 20 Aug 2020 at 22:09, Vincent Guittot
 wrote:
>
> On Thu, 20 Aug 2020 at 15:44,  wrote:
> >
> > > That's been said, not compensating the vruntime for a sched_idle task
> > > makes sense for me. Even if that will only help for others task in the
> > > same cfs_rq
> >
> > Yeah, but it is worth the extra pointer chasing and branches?
>
> For that I let Jiang provides figures to show the worthful
Using the following configuration for rt-app,
{
"tasks" : {
"task_other" : {
"instance" : 1, //only 1 instance to be easy to observe
"cpus" : [2],
"loop" : 2000,
"policy" : "SCHED_OTHER",
"run" : -1,  //make normal task 100% running
"priority" : 0,
"sleep" : 0
},
"task_idle" : {
"instance" : 1,
"cpus" : [2],
"loop" : 2000,
"policy" : "SCHED_IDLE",
"run" : 1, //only run 1us to avoid
blocking(always waiting for running), making check_preempt_wakeup
work(S->R switching)
"timer" : { "ref" : "unique2" , "period" :
16000, "mode" : "absolute" }
}
},
"global" : {
"calibration" : "CPU0",
"default_policy" : "SCHED_OTHER",
"duration" : -1
}
}
without the patch,
  <...>-39771 [002] d.h. 42478.11: sched_wakeup:
comm=task_idle-1 pid=39772 prio=120 target_cpu=002
   <...>-39771 [002] d... 42478.190437: sched_switch:
prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=39772 next_prio=120
   <...>-39771 [002] d.h. 42478.193771: sched_wakeup:
comm=task_idle-1 pid=39772 prio=120 target_cpu=002
   <...>-39771 [002] d... 42478.206438: sched_switch:
prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=39772 next_prio=120
   <...>-39771 [002] d.h. 42478.209771: sched_wakeup:
comm=task_idle-1 pid=39772 prio=120 target_cpu=002
   <...>-39771 [002] d... 42478.222438: sched_switch:
prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=39772 next_prio=120
   <...>-39771 [002] d.h. 42478.225771: sched_wakeup:
comm=task_idle-1 pid=39772 prio=120 target_cpu=002
   <...>-39771 [002] d... 42478.238438: sched_switch:
prev_comm=task_other-0 prev_pid=39771 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=39772 next_prio=120
*task_idle*  preempts every 12ms because of the compensation.

with the patch,
   task_other-0-27670 [002] d.h. 136785.278059: sched_wakeup:
comm=task_idle-1 pid=27671 prio=120 target_cpu=002
task_other-0-27670 [002] d... 136785.293623: sched_switch:
prev_comm=task_other-0 prev_pid=27670 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=27671 next_prio=120
task_other-0-27670 [002] d.h. 136785.294059: sched_wakeup:
comm=task_idle-1 pid=27671 prio=120 target_cpu=002
task_other-0-27670 [002] d... 136785.317624: sched_switch:
prev_comm=task_other-0 prev_pid=27670 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=27671 next_prio=120
task_other-0-27670 [002] d.h. 136785.326059: sched_wakeup:
comm=task_idle-1 pid=27671 prio=120 target_cpu=002
task_other-0-27670 [002] d... 136785.341622: sched_switch:
prev_comm=task_other-0 prev_pid=27670 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=27671 next_prio=120
task_other-0-27670 [002] d.h. 136785.342059: sched_wakeup:
comm=task_idle-1 pid=27671 prio=120 target_cpu=002
task_other-0-27670 [002] d... 136785.365623: sched_switch:
prev_comm=task_other-0 prev_pid=27670 prev_prio=120 prev_state=R ==>
next_comm=task_idle-1 next_pid=27671 next_prio=120
*task_idle* preempts every 24 or 16 ms.

This patch could reduce the preemption frequency of task_idle, and
reduce the interference from the SCHED_IDLE task.

Thx.
Regards,
Jiang


[PATCH] driver/pci: reduce the single block time in pci_read_config

2020-08-23 Thread Jiang Biao
From: Jiang Biao 

pci_read_config() could block several ms in kernel space, mainly
caused by the while loop to call pci_user_read_config_dword().
Single pci_user_read_config_dword() loop could consume 130us+,
  |pci_user_read_config_dword() {
  |  _raw_spin_lock_irq() {
! 136.698 us  |native_queued_spin_lock_slowpath();
! 137.582 us  |  }
  |  pci_read() {
  |raw_pci_read() {
  |  pci_conf1_read() {
  0.230 us|_raw_spin_lock_irqsave();
  0.035 us|_raw_spin_unlock_irqrestore();
  8.476 us|  }
  8.790 us|}
  9.091 us|  }
! 147.263 us  |}
and dozens of the loop could consume ms+.

If we execute some lspci commands concurrently, ms+ scheduling
latency could be detected.

Add scheduling chance in the loop to improve the latency.

Reported-by: Bin Lai 
Signed-off-by: Jiang Biao 
---
 drivers/pci/pci-sysfs.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index 6d78df9..3b9f63d 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -708,6 +708,7 @@ static ssize_t pci_read_config(struct file *filp, struct 
kobject *kobj,
data[off - init_off + 3] = (val >> 24) & 0xff;
off += 4;
size -= 4;
+   cond_resched();
}
 
if (size >= 2) {
-- 
1.8.3.1



Re: [RFC PATCH v1 1/1] sched/fair: select idle cpu from idle cpumask in sched domain

2020-09-12 Thread Jiang Biao
Hi, Aubrey

On Fri, 11 Sep 2020 at 23:48, Aubrey Li  wrote:
>
> Added idle cpumask to track idle cpus in sched domain. When a CPU
> enters idle, its corresponding bit in the idle cpumask will be set,
> and when the CPU exits idle, its bit will be cleared.
>
> When a task wakes up to select an idle cpu, scanning idle cpumask
> has low cost than scanning all the cpus in last level cache domain,
> especially when the system is heavily loaded.
>
> Signed-off-by: Aubrey Li 
> ---
>  include/linux/sched/topology.h | 13 +
>  kernel/sched/fair.c|  4 +++-
>  kernel/sched/topology.c|  2 +-
>  3 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
> index fb11091129b3..43a641d26154 100644
> --- a/include/linux/sched/topology.h
> +++ b/include/linux/sched/topology.h
> @@ -65,8 +65,21 @@ struct sched_domain_shared {
> atomic_t ref;
> atomic_t nr_busy_cpus;
> int has_idle_cores;
> +   /*
> +* Span of all idle CPUs in this domain.
> +*
> +* NOTE: this field is variable length. (Allocated dynamically
> +* by attaching extra space to the end of the structure,
> +* depending on how many CPUs the kernel has booted up with)
> +*/
> +   unsigned long   idle_cpus_span[];
>  };
>
> +static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared *sds)
> +{
> +   return to_cpumask(sds->idle_cpus_span);
> +}
> +
>  struct sched_domain {
> /* These fields must be setup */
> struct sched_domain __rcu *parent;  /* top domain must be null 
> terminated */
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 6b3b59cc51d6..3b6f8a3589be 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6136,7 +6136,7 @@ static int select_idle_cpu(struct task_struct *p, 
> struct sched_domain *sd, int t
>
> time = cpu_clock(this);
>
> -   cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> +   cpumask_and(cpus, sds_idle_cpus(sd->shared), p->cpus_ptr);
Is the sds_idle_cpus() always empty if nohz=off?
Do we need to initialize the idle_cpus_span with sched_domain_span(sd)?
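For example, something like the line below once the shared data is attached
to the domain (untested, and the placement is only meant to illustrate the
idea):

  cpumask_copy(sds_idle_cpus(sd->shared), sched_domain_span(sd));

Otherwise the mask stays empty until CPUs go through the nohz busy/idle
transitions below.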

>
> for_each_cpu_wrap(cpu, cpus, target) {
> if (!--nr)
> @@ -10182,6 +10182,7 @@ static void set_cpu_sd_state_busy(int cpu)
> sd->nohz_idle = 0;
>
> atomic_inc(&sd->shared->nr_busy_cpus);
> +   cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
>  unlock:
> rcu_read_unlock();
>  }
> @@ -10212,6 +10213,7 @@ static void set_cpu_sd_state_idle(int cpu)
> sd->nohz_idle = 1;
>
> atomic_dec(&sd->shared->nr_busy_cpus);
> +   cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
This only works when entering/exiting tickless mode? :)
Why not update idle_cpus_span during tick_nohz_idle_enter()/exit()?
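Something along these lines is what I have in mind, called from
tick_nohz_idle_enter()/tick_nohz_idle_exit() (update_idle_cpumask() is a
hypothetical helper, untested, only to illustrate the idea):

  /* hypothetical helper, not in the tree; sketch only */
  static void update_idle_cpumask(int cpu, bool idle)
  {
          struct sched_domain_shared *sds;

          rcu_read_lock();
          sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
          if (sds) {
                  /* mirror the CPU's idle state into the LLC-wide mask */
                  if (idle)
                          cpumask_set_cpu(cpu, sds_idle_cpus(sds));
                  else
                          cpumask_clear_cpu(cpu, sds_idle_cpus(sds));
          }
          rcu_read_unlock();
  }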

Thx.
Regards,
Jiang

>  unlock:
> rcu_read_unlock();
>  }
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 9079d865a935..92d0aeef86bf 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1769,7 +1769,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
>
> *per_cpu_ptr(sdd->sd, j) = sd;
>
> -   sds = kzalloc_node(sizeof(struct sched_domain_shared),
> +   sds = kzalloc_node(sizeof(struct sched_domain_shared) 
> + cpumask_size(),
> GFP_KERNEL, cpu_to_node(j));
> if (!sds)
> return -ENOMEM;
> --
> 2.25.1
>


Re: [PATCH] driver/pci: reduce the single block time in pci_read_config

2020-09-12 Thread Jiang Biao
Hi, Bjorn

On Thu, 10 Sep 2020 at 09:59, Bjorn Helgaas  wrote:
>
> On Thu, Sep 10, 2020 at 09:54:02AM +0800, Jiang Biao wrote:
> > Hi,
> >
> > On Thu, 10 Sep 2020 at 09:25, Bjorn Helgaas  wrote:
> > >
> > > On Mon, Aug 24, 2020 at 01:20:25PM +0800, Jiang Biao wrote:
> > > > From: Jiang Biao 
> > > >
> > > > pci_read_config() could block several ms in kernel space, mainly
> > > > caused by the while loop to call pci_user_read_config_dword().
> > > > Singel pci_user_read_config_dword() loop could consume 130us+,
> > > >   |pci_user_read_config_dword() {
> > > >   |  _raw_spin_lock_irq() {
> > > > ! 136.698 us  |native_queued_spin_lock_slowpath();
> > > > ! 137.582 us  |  }
> > > >   |  pci_read() {
> > > >   |raw_pci_read() {
> > > >   |  pci_conf1_read() {
> > > >   0.230 us|_raw_spin_lock_irqsave();
> > > >   0.035 us|_raw_spin_unlock_irqrestore();
> > > >   8.476 us|  }
> > > >   8.790 us|}
> > > >   9.091 us|  }
> > > > ! 147.263 us  |}
> > > > and dozens of the loop could consume ms+.
> > > >
> > > > If we execute some lspci commands concurrently, ms+ scheduling
> > > > latency could be detected.
> > > >
> > > > Add scheduling chance in the loop to improve the latency.
> > >
> > > Thanks for the patch, this makes a lot of sense.
> > >
> > > Shouldn't we do the same in pci_write_config()?
> > Yes, IMHO, that could be helpful too.
>
> If it's feasible, it would be nice to actually verify that it makes a
> difference.  I know config writes should be faster than reads, but
> they're certainly not as fast as a CPU can pump out data, so there
> must be *some* mechanism that slows the CPU down.
>
> Bjorn
We failed to build a test case to produce the latency by setpci command,
AFAIU, setpci could be much less frequently realistically used than lspci.
So, the latency from pci_write_config() path could not be verified for now,
could we apply this patch alone to erase the verified latency introduced
by pci_read_config() path? :)

Thanks a lot.
Regards,
Jiang


Re: [RFC PATCH v1 1/1] sched/fair: select idle cpu from idle cpumask in sched domain

2020-09-15 Thread Jiang Biao
Hi, Vincent

On Mon, 14 Sep 2020 at 20:26, Vincent Guittot
 wrote:
>
> On Sun, 13 Sep 2020 at 05:59, Jiang Biao  wrote:
> >
> > Hi, Aubrey
> >
> > On Fri, 11 Sep 2020 at 23:48, Aubrey Li  wrote:
> > >
> > > Added idle cpumask to track idle cpus in sched domain. When a CPU
> > > enters idle, its corresponding bit in the idle cpumask will be set,
> > > and when the CPU exits idle, its bit will be cleared.
> > >
> > > When a task wakes up to select an idle cpu, scanning idle cpumask
> > > has low cost than scanning all the cpus in last level cache domain,
> > > especially when the system is heavily loaded.
> > >
> > > Signed-off-by: Aubrey Li 
> > > ---
> > >  include/linux/sched/topology.h | 13 +
> > >  kernel/sched/fair.c|  4 +++-
> > >  kernel/sched/topology.c|  2 +-
> > >  3 files changed, 17 insertions(+), 2 deletions(-)
> > >
> > > diff --git a/include/linux/sched/topology.h 
> > > b/include/linux/sched/topology.h
> > > index fb11091129b3..43a641d26154 100644
> > > --- a/include/linux/sched/topology.h
> > > +++ b/include/linux/sched/topology.h
> > > @@ -65,8 +65,21 @@ struct sched_domain_shared {
> > > atomic_t ref;
> > > atomic_t nr_busy_cpus;
> > > int has_idle_cores;
> > > +   /*
> > > +* Span of all idle CPUs in this domain.
> > > +*
> > > +* NOTE: this field is variable length. (Allocated dynamically
> > > +* by attaching extra space to the end of the structure,
> > > +* depending on how many CPUs the kernel has booted up with)
> > > +*/
> > > +   unsigned long   idle_cpus_span[];
> > >  };
> > >
> > > +static inline struct cpumask *sds_idle_cpus(struct sched_domain_shared 
> > > *sds)
> > > +{
> > > +   return to_cpumask(sds->idle_cpus_span);
> > > +}
> > > +
> > >  struct sched_domain {
> > > /* These fields must be setup */
> > > struct sched_domain __rcu *parent;  /* top domain must be 
> > > null terminated */
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 6b3b59cc51d6..3b6f8a3589be 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -6136,7 +6136,7 @@ static int select_idle_cpu(struct task_struct *p, 
> > > struct sched_domain *sd, int t
> > >
> > > time = cpu_clock(this);
> > >
> > > -   cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
> > > +   cpumask_and(cpus, sds_idle_cpus(sd->shared), p->cpus_ptr);
> > Is the sds_idle_cpus() always empty if nohz=off?
>
> Good point
>
> > Do we need to initialize the idle_cpus_span with sched_domain_span(sd)?
> >
> > >
> > > for_each_cpu_wrap(cpu, cpus, target) {
> > > if (!--nr)
> > > @@ -10182,6 +10182,7 @@ static void set_cpu_sd_state_busy(int cpu)
> > > sd->nohz_idle = 0;
> > >
> > > atomic_inc(&sd->shared->nr_busy_cpus);
> > > +   cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
> > >  unlock:
> > > rcu_read_unlock();
> > >  }
> > > @@ -10212,6 +10213,7 @@ static void set_cpu_sd_state_idle(int cpu)
> > > sd->nohz_idle = 1;
> > >
> > > atomic_dec(&sd->shared->nr_busy_cpus);
> > > +   cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
> > This only works when entering/exiting tickless mode? :)
> > Why not update idle_cpus_span during tick_nohz_idle_enter()/exit()?
>
> set_cpu_sd_state_busy is only called during a tick in order to limit
> the rate of the update to once per tick per cpu at most and prevents
> any kind of storm of update if short running tasks wake/sleep all the
> time. We don't want to update a cpumask at each and every enter/leave
> idle.
>
Agree. But set_cpu_sd_state_busy() does not seem to be reached when
nohz=off, which means it will not work for that case? :)

Thx.
Regards,
Jiang


Re: [PATCH 4/4] sched/fair: reduce busy load balance interval

2020-09-15 Thread Jiang Biao
Hi, Vincent

On Mon, 14 Sep 2020 at 18:07, Vincent Guittot
 wrote:
>
> The busy_factor, which increases load balance interval when a cpu is busy,
> is set to 32 by default. This value generates some huge LB interval on
> large system like the THX2 made of 2 node x 28 cores x 4 threads.
> For such system, the interval increases from 112ms to 3584ms at MC level.
> And from 228ms to 7168ms at NUMA level.
Agreed that the interval is too big for that case.
But would it be too small for an AMD environment (like Rome) with 8 CPUs
at MC level (a CCX), if we reduce busy_factor?
For that case, the interval would be reduced from 256ms to 128ms.
Or should we define a MIN_INTERVAL for the MC level to avoid too small an interval?
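(If I read sd_init() right, those numbers are roughly min_interval (= sd_weight,
in ms) * busy_factor: 8 * 32 = 256ms today vs 8 * 16 = 128ms with this patch.)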

Thx.
Regards,
Jiang

>
> Even on smaller system, a lower busy factor has shown improvement on the
> fair distribution of the running time so let reduce it for all.
>
> Signed-off-by: Vincent Guittot 
> ---
>  kernel/sched/topology.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
> index 1a84b778755d..a8477c9e8569 100644
> --- a/kernel/sched/topology.c
> +++ b/kernel/sched/topology.c
> @@ -1336,7 +1336,7 @@ sd_init(struct sched_domain_topology_level *tl,
> *sd = (struct sched_domain){
> .min_interval   = sd_weight,
> .max_interval   = 2*sd_weight,
> -   .busy_factor= 32,
> +   .busy_factor= 16,
> .imbalance_pct  = 117,
>
> .cache_nice_tries   = 0,
> --
> 2.17.1
>


Re: [PATCH 4/4] sched/fair: reduce busy load balance interval

2020-09-15 Thread Jiang Biao
Hi, Vincent

On Tue, 15 Sep 2020 at 17:28, Vincent Guittot
 wrote:
>
> On Tue, 15 Sep 2020 at 11:11, Jiang Biao  wrote:
> >
> > Hi, Vincent
> >
> > On Mon, 14 Sep 2020 at 18:07, Vincent Guittot
> >  wrote:
> > >
> > > The busy_factor, which increases load balance interval when a cpu is busy,
> > > is set to 32 by default. This value generates some huge LB interval on
> > > large system like the THX2 made of 2 node x 28 cores x 4 threads.
> > > For such system, the interval increases from 112ms to 3584ms at MC level.
> > > And from 228ms to 7168ms at NUMA level.
> > Agreed that the interval is too big for that case.
> > But would it be too small for an AMD environment(like ROME) with 8cpu
> > at MC level(CCX), if we reduce busy_factor?
>
> Are you sure that this is too small ? As mentioned in the commit
> message below, I tested it on small system (2x4 cores Arm64) and i
> have seen some improvements
Not so sure. :)
A small interval means more frequent balancing and more cost spent on
it, especially for pinned VM cases.
For our case, we have AMD Rome servers made of 2 nodes x 48 cores x
2 threads, with 8 cores at MC level (within a CCX). The 256ms interval seems a
little too big for us, compared to an Intel Cascade Lake CPU with 48 cores at MC
level, whose balance interval is 1536ms. 128ms seems a little
wasteful. :)
I guess more balance cost may hurt the throughput of sysbench-like
benchmarks.. Just a guess.

>
> > For that case, the interval could be reduced from 256ms to 128ms.
> > Or should we define an MIN_INTERVAL for MC level to avoid too small 
> > interval?
>
> What would be a too small interval ?
That's hard to say. :)
My guess only applies to large server systems.

Thanks.
Regards,
Jiang


Re: [PATCH 4/4] sched/fair: reduce busy load balance interval

2020-09-15 Thread Jiang Biao
Hi,

On Tue, 15 Sep 2020 at 20:43, Vincent Guittot
 wrote:
>
> On Tue, 15 Sep 2020 at 13:36, Jiang Biao  wrote:
> >
> > Hi, Vincent
> >
> > On Tue, 15 Sep 2020 at 17:28, Vincent Guittot
> >  wrote:
> > >
> > > On Tue, 15 Sep 2020 at 11:11, Jiang Biao  wrote:
> > > >
> > > > Hi, Vincent
> > > >
> > > > On Mon, 14 Sep 2020 at 18:07, Vincent Guittot
> > > >  wrote:
> > > > >
> > > > > The busy_factor, which increases load balance interval when a cpu is 
> > > > > busy,
> > > > > is set to 32 by default. This value generates some huge LB interval on
> > > > > large system like the THX2 made of 2 node x 28 cores x 4 threads.
> > > > > For such system, the interval increases from 112ms to 3584ms at MC 
> > > > > level.
> > > > > And from 228ms to 7168ms at NUMA level.
> > > > Agreed that the interval is too big for that case.
> > > > But would it be too small for an AMD environment(like ROME) with 8cpu
> > > > at MC level(CCX), if we reduce busy_factor?
> > >
> > > Are you sure that this is too small ? As mentioned in the commit
> > > message below, I tested it on small system (2x4 cores Arm64) and i
> > > have seen some improvements
> > Not so sure. :)
> > Small interval means more frequent balances and more cost consumed for
> > balancing, especially for pinned vm cases.
>
> If you are running only pinned threads, the interval can increase
> above 512ms which means 8sec after applying the busy factor
Yep. :)

>
> > For our case, we have AMD ROME servers made of 2node x 48cores x
> > 2thread, and 8c at MC level(within a CCX). The 256ms interval seems a
> > little too big for us, compared to Intel Cascadlake CPU with 48c at MC
>
> so IIUC your topology is :
> 2 nodes at NUMA
> 6 CCX at DIE level
> 8 cores per CCX at MC
> 2 threads per core at SMT
Yes.

>
> > level, whose balance interval is 1536ms. 128ms seems a little more
> > waste. :)
>
> the 256ms/128ms interval only looks at 8 cores whereas the 1536
> intervall looks for the whole 48 cores
Yes. The real problem for us is that the CPU number difference between the MC
and DIE levels is too big (8 vs. 96): 3072ms for the DIE level is too big (reducing
busy_factor is good enough), while 128ms for the MC level seems a little wasteful
(if busy_factor is reduced).
And no objection to this patch. It still looks OK for us.

Thx.
Regards,
Jiang


Re: [PATCH] sched/fair: Fix wrong cpu selecting from isolated domain

2020-08-25 Thread Jiang Biao
On Mon, 24 Aug 2020 at 20:31, Xunlei Pang  wrote:
>
> We've met problems that occasionally tasks with full cpumask
> (e.g. by putting it into a cpuset or setting to full affinity)
> were migrated to our isolated cpus in production environment.
>
> After some analysis, we found that it is due to the current
> select_idle_smt() not considering the sched_domain mask.
>
> Fix it by checking the valid domain mask in select_idle_smt().
>
> Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve select_idle_siblings())
> Reported-by: Wetp Zhang 
> Signed-off-by: Xunlei Pang 
> ---
>  kernel/sched/fair.c | 9 +
>  1 file changed, 5 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 1a68a05..fa942c4 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -6075,7 +6075,7 @@ static int select_idle_core(struct task_struct *p, 
> struct sched_domain *sd, int
>  /*
>   * Scan the local SMT mask for idle CPUs.
>   */
> -static int select_idle_smt(struct task_struct *p, int target)
> +static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, 
> int target)
>  {
> int cpu;
>
> @@ -6083,7 +6083,8 @@ static int select_idle_smt(struct task_struct *p, int 
> target)
> return -1;
>
> for_each_cpu(cpu, cpu_smt_mask(target)) {
> -   if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> +   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> +   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
Maybe the following change could be better, :)
for_each_cpu_and(cpu, cpu_smt_mask(target), sched_domain_span(sd))
which keeps a similar style to select_idle_core()/select_idle_cpu(), and could reduce the loop count.
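Roughly what I mean (a sketch from memory against the hunk above, untested;
the idle checks are just the existing ones in select_idle_smt()):

  static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
  {
          int cpu;

          if (!static_branch_likely(&sched_smt_present))
                  return -1;

          /* only walk SMT siblings that are inside this sched domain */
          for_each_cpu_and(cpu, cpu_smt_mask(target), sched_domain_span(sd)) {
                  if (!cpumask_test_cpu(cpu, p->cpus_ptr))
                          continue;
                  if (available_idle_cpu(cpu) || sched_idle_cpu(cpu))
                          return cpu;
          }

          return -1;
  }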

Just an option.
Reviewed-by: Jiang Biao 


Re: [PATCH] sched/fair: Fix wrong cpu selecting from isolated domain

2020-08-25 Thread Jiang Biao
On Tue, 25 Aug 2020 at 17:28, xunlei  wrote:
>
> On 2020/8/25 下午2:37, Jiang Biao wrote:
> > On Mon, 24 Aug 2020 at 20:31, Xunlei Pang  wrote:
> >>
> >> We've met problems that occasionally tasks with full cpumask
> >> (e.g. by putting it into a cpuset or setting to full affinity)
> >> were migrated to our isolated cpus in production environment.
> >>
> >> After some analysis, we found that it is due to the current
> >> select_idle_smt() not considering the sched_domain mask.
> >>
> >> Fix it by checking the valid domain mask in select_idle_smt().
> >>
> >> Fixes: 10e2f1acd010 ("sched/core: Rewrite and improve 
> >> select_idle_siblings())
> >> Reported-by: Wetp Zhang 
> >> Signed-off-by: Xunlei Pang 
> >> ---
> >>  kernel/sched/fair.c | 9 +
> >>  1 file changed, 5 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> >> index 1a68a05..fa942c4 100644
> >> --- a/kernel/sched/fair.c
> >> +++ b/kernel/sched/fair.c
> >> @@ -6075,7 +6075,7 @@ static int select_idle_core(struct task_struct *p, 
> >> struct sched_domain *sd, int
> >>  /*
> >>   * Scan the local SMT mask for idle CPUs.
> >>   */
> >> -static int select_idle_smt(struct task_struct *p, int target)
> >> +static int select_idle_smt(struct task_struct *p, struct sched_domain 
> >> *sd, int target)
> >>  {
> >> int cpu;
> >>
> >> @@ -6083,7 +6083,8 @@ static int select_idle_smt(struct task_struct *p, 
> >> int target)
> >> return -1;
> >>
> >> for_each_cpu(cpu, cpu_smt_mask(target)) {
> >> -   if (!cpumask_test_cpu(cpu, p->cpus_ptr))
> >> +   if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
> >> +   !cpumask_test_cpu(cpu, sched_domain_span(sd)))
> > Maybe the following change could be better, :)
> > for_each_cpu_and(cpu, cpu_smt_mask(target), sched_domain_span(sd))
> > keep a similar style with select_idle_core/cpu, and could reduce loops.
> >
>
> I thought that, but given that smt mask is usually small, the original
> code may run a bit faster?
Not sure. :)
It's OK for me.

Regards,
Jiang


[PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-08-20 Thread Jiang Biao
From: Jiang Biao 

Vruntime compensation has been down in place_entity() to
boot the waking procedure for fair tasks. There is no need to
do that for SCHED_IDLE task actually.

Not compensating vruntime for SCHED_IDLE task could make
SCHED_IDLE task more harmless for normal tasks.

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1a68a0536add..adff77676a0a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4115,7 +4115,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity 
*se, int initial)
vruntime += sched_vslice(cfs_rq, se);
 
/* sleeps up to a single latency don't count. */
-   if (!initial) {
+   if (!initial && likely(!task_has_idle_policy(task_of(se {
unsigned long thresh = sysctl_sched_latency;
 
/*
-- 
2.21.0



Re: [PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-08-20 Thread Jiang Biao
On Thu, 20 Aug 2020 at 20:51, Vincent Guittot
 wrote:
>
> On Thu, 20 Aug 2020 at 14:00, Jiang Biao  wrote:
> >
> > From: Jiang Biao 
> >
> > Vruntime compensation has been down in place_entity() to
> > boot the waking procedure for fair tasks. There is no need to
>
> s/boot/boost/ ?
Yes. -:)
>
> > do that for SCHED_IDLE task actually.
> >
> > Not compensating vruntime for SCHED_IDLE task could make
> > SCHED_IDLE task more harmless for normal tasks.
> >
> > Signed-off-by: Jiang Biao 
> > ---
> >  kernel/sched/fair.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > index 1a68a0536add..adff77676a0a 100644
> > --- a/kernel/sched/fair.c
> > +++ b/kernel/sched/fair.c
> > @@ -4115,7 +4115,7 @@ place_entity(struct cfs_rq *cfs_rq, struct 
> > sched_entity *se, int initial)
> > vruntime += sched_vslice(cfs_rq, se);
> >
> > /* sleeps up to a single latency don't count. */
> > -   if (!initial) {
> > +   if (!initial && likely(!task_has_idle_policy(task_of(se)))) {
>
> What if se is not a task ?
Is it ok to limit the case to task se, like?
+   if (!initial && likely(!entity_is_task(se) ||
!task_has_idle_policy(task_of(se)))) {

Thx.
Regards,
Jiang
>
> > unsigned long thresh = sysctl_sched_latency;
> >
> > /*
> > --
> > 2.21.0
> >


Re: [PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-08-20 Thread Jiang Biao
On Thu, 20 Aug 2020 at 20:58,  wrote:
>
> On Thu, Aug 20, 2020 at 02:51:06PM +0200, Vincent Guittot wrote:
> > On Thu, 20 Aug 2020 at 14:00, Jiang Biao  wrote:
> > >
> > > From: Jiang Biao 
> > >
> > > Vruntime compensation has been down in place_entity() to
> > > boot the waking procedure for fair tasks. There is no need to
> >
> > s/boot/boost/ ?
> >
> > > do that for SCHED_IDLE task actually.
> > >
> > > Not compensating vruntime for SCHED_IDLE task could make
> > > SCHED_IDLE task more harmless for normal tasks.
>
> This is rather week. It would be much better if there's some actual data
> to support this claim.
I'll try to have a test and get some data. :)

>
> > > Signed-off-by: Jiang Biao 
> > > ---
> > >  kernel/sched/fair.c | 2 +-
> > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > >
> > > diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> > > index 1a68a0536add..adff77676a0a 100644
> > > --- a/kernel/sched/fair.c
> > > +++ b/kernel/sched/fair.c
> > > @@ -4115,7 +4115,7 @@ place_entity(struct cfs_rq *cfs_rq, struct 
> > > sched_entity *se, int initial)
> > > vruntime += sched_vslice(cfs_rq, se);
> > >
> > > /* sleeps up to a single latency don't count. */
> > > -   if (!initial) {
> > > +   if (!initial && likely(!task_has_idle_policy(task_of(se)))) {
> >
> > What if se is not a task ?
>
> Then we very much need it, because it might have fair tasks inside. I
> suppose you could do something complicated with idle_h_nr_running, but
> is all that really worth the effort?
Would it be better to limit to task se case? like
 +   if (!initial && likely(!entity_is_task(se) ||
!task_has_idle_policy(task_of(se)))) {



>
> > > unsigned long thresh = sysctl_sched_latency;
> > >
> > > /*
> > > --
> > > 2.21.0
> > >


Re: [PATCH] sched/fair: avoid vruntime compensation for SCHED_IDLE task

2020-08-20 Thread Jiang Biao
On Thu, 20 Aug 2020 at 22:09, Vincent Guittot
 wrote:
>
> On Thu, 20 Aug 2020 at 15:44,  wrote:
> >
> > > That's been said, not compensating the vruntime for a sched_idle task
> > > makes sense for me. Even if that will only help for others task in the
> > > same cfs_rq
> >
> > Yeah, but it is worth the extra pointer chasing and branches?
>
> For that I let Jiang provides figures to show the worthful
I'll try to have some tests and show the wakeup latency difference
after applying the patch later.

Thanks for your time.
Regards
Jiang
>
> >
> > Then again, I suppose we started all that with the idle_h_nr_running
> > nonsense :/


Re: [PATCH] sched/fair: reduce preemption with IDLE tasks runnable (Internet mail)

2020-08-20 Thread Jiang Biao
On Thu, 20 Aug 2020 at 20:46, Vincent Guittot
 wrote:
>
> On Thu, 20 Aug 2020 at 13:28, benbjiang(蒋彪)  wrote:
> >
> >
> >
> > > On Aug 20, 2020, at 3:35 PM, Vincent Guittot  
> > > wrote:
> > >
> > > On Thu, 20 Aug 2020 at 02:13, benbjiang(蒋彪)  wrote:
> > >>
> > >>
> > >>
> > >>> On Aug 19, 2020, at 10:55 PM, Vincent Guittot 
> > >>>  wrote:
> > >>>
> > >>> On Wed, 19 Aug 2020 at 16:27, benbjiang(蒋彪)  
> > >>> wrote:
> > >>>>
> > >>>>
> > >>>>
> > >>>>> On Aug 19, 2020, at 7:55 PM, Dietmar Eggemann 
> > >>>>>  wrote:
> > >>>>>
> > >>>>> On 19/08/2020 13:05, Vincent Guittot wrote:
> > >>>>>> On Wed, 19 Aug 2020 at 12:46, Dietmar Eggemann 
> > >>>>>>  wrote:
> > >>>>>>>
> > >>>>>>> On 17/08/2020 14:05, benbjiang(蒋彪) wrote:
> > >>>>>>>>
> > >>>>>>>>
> > >>>>>>>>> On Aug 17, 2020, at 4:57 PM, Dietmar Eggemann 
> > >>>>>>>>>  wrote:
> > >>>>>>>>>
> > >>>>>>>>> On 14/08/2020 01:55, benbjiang(蒋彪) wrote:
> > >>>>>>>>>> Hi,
> > >>>>>>>>>>
> > >>>>>>>>>>> On Aug 13, 2020, at 2:39 AM, Dietmar Eggemann 
> > >>>>>>>>>>>  wrote:
> > >>>>>>>>>>>
> > >>>>>>>>>>> On 12/08/2020 05:19, benbjiang(蒋彪) wrote:
> > >>>>>>>>>>>> Hi,
> > >>>>>>>>>>>>
> > >>>>>>>>>>>>> On Aug 11, 2020, at 11:54 PM, Dietmar Eggemann 
> > >>>>>>>>>>>>>  wrote:
> > >>>>>>>>>>>>>
> > >>>>>>>>>>>>> On 11/08/2020 02:41, benbjiang(蒋彪) wrote:
> > >>>>>>>>>>>>>> Hi,
> > >>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>> On Aug 10, 2020, at 9:24 PM, Dietmar Eggemann 
> > >>>>>>>>>>>>>>>  wrote:
> > >>>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>> On 06/08/2020 17:52, benbjiang(蒋彪) wrote:
> > >>>>>>>>>>>>>>>> Hi,
> > >>>>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>>>> On Aug 6, 2020, at 9:29 PM, Dietmar Eggemann 
> > >>>>>>>>>>>>>>>>>  wrote:
> > >>>>>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>>>> On 03/08/2020 13:26, benbjiang(蒋彪) wrote:
> > >>>>>>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>>>>>> On Aug 3, 2020, at 4:16 PM, Dietmar Eggemann 
> > >>>>>>>>>>>>>>>>>>>  wrote:
> > >>>>>>>>>>>>>>>>>>>
> > >>>>>>>>>>>>>>>>>>> On 01/08/2020 04:32, Jiang Biao wrote:
> > >>>>>>>>>>>>>>>>>>>> From: Jiang Biao 
> > >>>>>>>
> > >>>>>>> [...]
> > >>>>>>>
> > >>>>>>>>> Are you sure about this?
> > >>>>>>>> Yes. :)
> > >>>>>>>>>
> > >>>>>>>>> The math is telling me for the:
> > >>>>>>>>>
> > >>>>>>>>> idle task:  (3 / (1024 + 1024 + 3))^(-1) * 4ms = 2735ms
> > >>>>>>>>>
> > >>>>>>>>> normal task: (1024 / (1024 + 1024 + 3))^(-1) * 4ms =8ms
> > >>>>>>>>>
> > >>>>>>>>> (4ms - 250

Re: [PATCH] sched/fair: reduce preemption with IDLE tasks runnable (Internet mail)

2020-08-20 Thread Jiang Biao
On Thu, 20 Aug 2020 at 22:36, Vincent Guittot
 wrote:
>
> On Thu, 20 Aug 2020 at 16:28, Jiang Biao  wrote:
> >
> > On Thu, 20 Aug 2020 at 20:46, Vincent Guittot
> >  wrote:
> > >
> > > On Thu, 20 Aug 2020 at 13:28, benbjiang(蒋彪)  wrote:
> > > >
> > > >
> > > >
> > > > > On Aug 20, 2020, at 3:35 PM, Vincent Guittot 
> > > > >  wrote:
> > > > >
> > > > > On Thu, 20 Aug 2020 at 02:13, benbjiang(蒋彪)  
> > > > > wrote:
> > > > >>
> > > > >>
> > > > >>
> > > > >>> On Aug 19, 2020, at 10:55 PM, Vincent Guittot 
> > > > >>>  wrote:
> > > > >>>
> > > > >>> On Wed, 19 Aug 2020 at 16:27, benbjiang(蒋彪)  
> > > > >>> wrote:
> > > > >>>>
> > > > >>>>
> > > > >>>>
> > > > >>>>> On Aug 19, 2020, at 7:55 PM, Dietmar Eggemann 
> > > > >>>>>  wrote:
> > > > >>>>>
> > > > >>>>> On 19/08/2020 13:05, Vincent Guittot wrote:
> > > > >>>>>> On Wed, 19 Aug 2020 at 12:46, Dietmar Eggemann 
> > > > >>>>>>  wrote:
> > > > >>>>>>>
> > > > >>>>>>> On 17/08/2020 14:05, benbjiang(蒋彪) wrote:
> > > > >>>>>>>>
> > > > >>>>>>>>
> > > > >>>>>>>>> On Aug 17, 2020, at 4:57 PM, Dietmar Eggemann 
> > > > >>>>>>>>>  wrote:
> > > > >>>>>>>>>
> > > > >>>>>>>>> On 14/08/2020 01:55, benbjiang(蒋彪) wrote:
> > > > >>>>>>>>>> Hi,
> > > > >>>>>>>>>>
> > > > >>>>>>>>>>> On Aug 13, 2020, at 2:39 AM, Dietmar Eggemann 
> > > > >>>>>>>>>>>  wrote:
> > > > >>>>>>>>>>>
> > > > >>>>>>>>>>> On 12/08/2020 05:19, benbjiang(蒋彪) wrote:
> > > > >>>>>>>>>>>> Hi,
> > > > >>>>>>>>>>>>
> > > > >>>>>>>>>>>>> On Aug 11, 2020, at 11:54 PM, Dietmar Eggemann 
> > > > >>>>>>>>>>>>>  wrote:
> > > > >>>>>>>>>>>>>
> > > > >>>>>>>>>>>>> On 11/08/2020 02:41, benbjiang(蒋彪) wrote:
> > > > >>>>>>>>>>>>>> Hi,
> > > > >>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>> On Aug 10, 2020, at 9:24 PM, Dietmar Eggemann 
> > > > >>>>>>>>>>>>>>>  wrote:
> > > > >>>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>> On 06/08/2020 17:52, benbjiang(蒋彪) wrote:
> > > > >>>>>>>>>>>>>>>> Hi,
> > > > >>>>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>>>> On Aug 6, 2020, at 9:29 PM, Dietmar Eggemann 
> > > > >>>>>>>>>>>>>>>>>  wrote:
> > > > >>>>>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>>>> On 03/08/2020 13:26, benbjiang(蒋彪) wrote:
> > > > >>>>>>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>>>>>> On Aug 3, 2020, at 4:16 PM, Dietmar Eggemann 
> > > > >>>>>>>>>>>>>>>>>>>  wrote:
> > > > >>>>>>>>>>>>>>>>>>>
> > > > >>>>>>>>>>>>>>>>>>> On 01/08/2020 04:32, Jiang Biao wrote:
> > > > >>>>>>>>>>>>>>>>>

[PATCH RFC v2] sched/fair: simplify the work when reweighting entity

2020-08-06 Thread Jiang Biao
From: Jiang Biao 

If a se is on_rq when reweighting the entity, all we need to do is
update the load of the cfs_rq; the other dequeue/enqueue work is
redundant, such as,
* nr_running--/nr_running++

Even though the following dequeue/enqueue paths would never be reached,
* account_numa_dequeue/account_numa_enqueue
* list_del/list_add from/into cfs_tasks
they could be a little confusing.

Simplifying the logic helps reduce a little overhead on the hot path,
and makes it cleaner and more readable.

Signed-off-by: Jiang Biao 
---
v2<-v1: 
- Amend the commit log.

 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..18a8fc7bd0de 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3086,7 +3086,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se,
/* commit outstanding execution time */
if (cfs_rq->curr == se)
update_curr(cfs_rq);
-   account_entity_dequeue(cfs_rq, se);
+   update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
 
@@ -3102,7 +3102,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct 
sched_entity *se,
 
enqueue_load_avg(cfs_rq, se);
if (se->on_rq)
-   account_entity_enqueue(cfs_rq, se);
+   update_load_add(&cfs_rq->load, se->load.weight);
 
 }
 
-- 
2.21.0



Re: [PATCH] sched: pull tasks when CPU is about to run SCHED_IDLE tasks

2021-02-03 Thread Jiang Biao
Hi,

On Wed, 3 Feb 2021 at 19:17, chin  wrote:
>
>
>
>
> At 2021-02-02 23:54:15, "Vincent Guittot"  wrote:
> >On Tue, 2 Feb 2021 at 08:56, chin  wrote:
> >>
> >>
> >>
> >>
> >> At 2021-01-13 16:30:14, "Vincent Guittot"  
> >> wrote:
> >> >On Wed, 13 Jan 2021 at 04:14, chin  wrote:
> >> >>
> >> >>
> >> >>
> >> >>
> >> >> At 2021-01-12 16:18:51, "Vincent Guittot"  
> >> >> wrote:
> >> >> >On Tue, 12 Jan 2021 at 07:59, chin  wrote:
> >> >> >>
> >> >> >>
> >> >> >>
> >> >> >>
> >> >> >> At 2021-01-11 19:04:19, "Vincent Guittot" 
> >> >> >>  wrote:
> >> >> >> >On Mon, 11 Jan 2021 at 09:27, chin  wrote:
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> At 2020-12-23 19:30:26, "Vincent Guittot" 
> >> >> >> >>  wrote:
> >> >> >> >> >On Wed, 23 Dec 2020 at 09:32,  wrote:
> >> >> >> >> >>
> >> >> >> >> >> From: Chen Xiaoguang 
> >> >> >> >> >>
> >> >> >> >> >> Before a CPU switches from running SCHED_NORMAL task to
> >> >> >> >> >> SCHED_IDLE task, trying to pull SCHED_NORMAL tasks from other
> >> >> >> >> >
> >> >> >> >> >Could you explain more in detail why you only care about this 
> >> >> >> >> >use case
> >> >> >> >>
> >> >> >> >> >in particular and not the general case?
> >> >> >> >>
> >> >> >> >>
> >> >> >> >> We want to run online tasks using SCHED_NORMAL policy and offline 
> >> >> >> >> tasks
> >> >> >> >> using SCHED_IDLE policy. The online tasks and the offline tasks 
> >> >> >> >> run in
> >> >> >> >> the same computer in order to use the computer efficiently.
> >> >> >> >> The online tasks are in sleep in most times but should responce 
> >> >> >> >> soon once
> >> >> >> >> wake up. The offline tasks are in low priority and will run only 
> >> >> >> >> when no online
> >> >> >> >> tasks.
> >> >> >> >>
> >> >> >> >> The online tasks are more important than the offline tasks and 
> >> >> >> >> are latency
> >> >> >> >> sensitive we should make sure the online tasks preempt the 
> >> >> >> >> offline tasks
> >> >> >> >> as soon as possilbe while there are online tasks waiting to run.
> >> >> >> >> So in our situation we hope the SCHED_NORMAL to run if has any.
> >> >> >> >>
> >> >> >> >> Let's assume we have 2 CPUs,
> >> >> >> >> In CPU1 we got 2 SCHED_NORMAL tasks.
> >> >> >> >> in CPU2 we got 1 SCHED_NORMAL task and 2 SCHED_IDLE tasks.
> >> >> >> >>
> >> >> >> >>  CPU1  CPU2
> >> >> >> >> curr   rq1curr  rq2
> >> >> >> >>   +--+ | +--+   +--+ | ++ ++
> >> >> >> >> t0|NORMAL| | |NORMAL|   |NORMAL| | |IDLE| |IDLE|
> >> >> >> >>   +--+ | +--+   +--+ | ++ ++
> >> >> >> >>
> >> >> >> >>  NORMAL exits or blocked
> >> >> >> >>   +--+ | +--+| ++ ++
> >> >> >> >> t1|NORMAL| | |NORMAL|| |IDLE| |IDLE|
> >> >> >> >>   +--+ | +--+| ++ ++
> >> >> >> >>
> >> >> >> >>  pick_next_task_fair
> >> >> >> >>   +--+ | +--+ ++ | ++
> >> >> >> >> t2|NORMAL| | |NORMAL| |IDLE| | |IDLE|
> >> >> >> >>   +--+ | +--+ ++ | ++
> >> >> >> >>
> >> >> >> >>  SCHED_IDLE running
> >> >> >> >> t3+--+ | +--+++  | ++
> >> >> >> >>   |NORMAL| | |NORMAL||IDLE|  | |IDLE|
> >> >> >> >>   +--+ | +--+++  | ++
> >> >> >> >>
> >> >> >> >>  run_rebalance_domains
> >> >> >> >>   +--+ |+--+ | ++ ++
> >> >> >> >> t4|NORMAL| ||NORMAL| | |IDLE| |IDLE|
> >> >> >> >>   +--+ |+--+ | ++ ++
> >> >> >> >>
> >> >> >> >> As we can see
> >> >> >> >> t1: NORMAL task in CPU2 exits or blocked
> >> >> >> >> t2: CPU2 pick_next_task_fair would pick a SCHED_IDLE to run while
> >> >> >> >> another SCHED_NORMAL in rq1 is waiting.
> >> >> >> >> t3: SCHED_IDLE run in CPU2 while a SCHED_NORMAL wait in CPU1.
> >> >> >> >> t4: after a short time, periodic load_balance triggerd and pull
> >> >> >> >> SCHED_NORMAL in rq1 to rq2, and SCHED_NORMAL likely preempts 
> >> >> >> >> SCHED_IDLE.
> >> >> >> >>
> >> >> >> >> In this scenario, SCHED_IDLE is running while SCHED_NORMAL is 
> >> >> >> >> waiting to run.
> >> >> >> >> The latency of this SCHED_NORMAL will be high which is not 
> >> >> >> >> acceptble.
> >> >> >> >>
> >> >> >> >> Do a load_balance before running the SCHED_IDLE may fix this 
> >> >> >> >> problem.
> >> >> >> >>
> >> >> >> >> This patch works as below:
> >> >> >> >>
> >> >> >> >>  CPU1  CPU2
> >> >> >> >> curr   rq1curr  rq2
> >> >> >> >>   +--+ | +--+   +--+ | ++ ++
> >> >> >> >> t0|NORMAL| | |NORMAL|   |NORMAL| | |IDLE| |IDLE|
> >> >> >> >>   +--+ | +--+   +--+ | ++ ++
> >> >> >> 

Re: [PATCH] sched: pull tasks when CPU is about to run SCHED_IDLE tasks

2021-02-04 Thread Jiang Biao
Hi, Vincent

On Thu, 4 Feb 2021 at 16:04, Vincent Guittot  wrote:
>
> On Tue, 2 Feb 2021 at 08:56, chin  wrote:
> >
> >
> >
> >
> > At 2021-01-13 16:30:14, "Vincent Guittot"  
> > wrote:
> > >On Wed, 13 Jan 2021 at 04:14, chin  wrote:
> > >>
> > >>
> > >>
> > >>
> > >> At 2021-01-12 16:18:51, "Vincent Guittot"  
> > >> wrote:
> > >> >On Tue, 12 Jan 2021 at 07:59, chin  wrote:
> > >> >>
> > >> >>
> > >> >>
> > >> >>
> > >> >> At 2021-01-11 19:04:19, "Vincent Guittot" 
> > >> >>  wrote:
> > >> >> >On Mon, 11 Jan 2021 at 09:27, chin  wrote:
> > >> >> >>
> > >> >> >>
> > >> >> >> At 2020-12-23 19:30:26, "Vincent Guittot" 
> > >> >> >>  wrote:
> > >> >> >> >On Wed, 23 Dec 2020 at 09:32,  wrote:
> > >> >> >> >>
> > >> >> >> >> From: Chen Xiaoguang 
> > >> >> >> >>
> > >> >> >> >> Before a CPU switches from running SCHED_NORMAL task to
> > >> >> >> >> SCHED_IDLE task, trying to pull SCHED_NORMAL tasks from other
> > >> >> >> >
> > >> >> >> >Could you explain more in detail why you only care about this use 
> > >> >> >> >case
> > >> >> >>
> > >> >> >> >in particular and not the general case?
> > >> >> >>
> > >> >> >>
> > >> >> >> We want to run online tasks using the SCHED_NORMAL policy and
> > >> >> >> offline tasks using the SCHED_IDLE policy. The online and offline
> > >> >> >> tasks run on the same machine in order to use it efficiently.
> > >> >> >> The online tasks are asleep most of the time but should respond
> > >> >> >> quickly once woken up. The offline tasks have low priority and
> > >> >> >> run only when there are no online tasks.
> > >> >> >>
> > >> >> >> The online tasks are more important than the offline tasks and
> > >> >> >> are latency sensitive, so we should make sure the online tasks
> > >> >> >> preempt the offline tasks as soon as possible while there are
> > >> >> >> online tasks waiting to run. So in our situation we want a
> > >> >> >> SCHED_NORMAL task to run whenever one is runnable.
> > >> >> >>
> > >> >> >> Let's assume we have 2 CPUs,
> > >> >> >> In CPU1 we got 2 SCHED_NORMAL tasks.
> > >> >> >> in CPU2 we got 1 SCHED_NORMAL task and 2 SCHED_IDLE tasks.
> > >> >> >>
> > >> >> >>  CPU1  CPU2
> > >> >> >> curr   rq1curr  rq2
> > >> >> >>   +--+ | +--+   +--+ | ++ ++
> > >> >> >> t0|NORMAL| | |NORMAL|   |NORMAL| | |IDLE| |IDLE|
> > >> >> >>   +--+ | +--+   +--+ | ++ ++
> > >> >> >>
> > >> >> >>  NORMAL exits or blocked
> > >> >> >>   +--+ | +--+| ++ ++
> > >> >> >> t1|NORMAL| | |NORMAL|| |IDLE| |IDLE|
> > >> >> >>   +--+ | +--+| ++ ++
> > >> >> >>
> > >> >> >>  pick_next_task_fair
> > >> >> >>   +--+ | +--+ ++ | ++
> > >> >> >> t2|NORMAL| | |NORMAL| |IDLE| | |IDLE|
> > >> >> >>   +--+ | +--+ ++ | ++
> > >> >> >>
> > >> >> >>  SCHED_IDLE running
> > >> >> >> t3+--+ | +--+++  | ++
> > >> >> >>   |NORMAL| | |NORMAL||IDLE|  | |IDLE|
> > >> >> >>   +--+ | +--+++  | ++
> > >> >> >>
> > >> >> >>  run_rebalance_domains
> > >> >> >>   +--+ |+--+ | ++ ++
> > >> >> >> t4|NORMAL| ||NORMAL| | |IDLE| |IDLE|
> > >> >> >>   +--+ |+--+ | ++ ++
> > >> >> >>
> > >> >> >> As we can see:
> > >> >> >> t1: the NORMAL task on CPU2 exits or blocks.
> > >> >> >> t2: pick_next_task_fair on CPU2 picks a SCHED_IDLE task to run
> > >> >> >> while another SCHED_NORMAL task in rq1 is waiting.
> > >> >> >> t3: SCHED_IDLE runs on CPU2 while a SCHED_NORMAL task waits on CPU1.
> > >> >> >> t4: after a short time, the periodic load_balance is triggered and
> > >> >> >> pulls the SCHED_NORMAL task from rq1 to rq2, where it likely
> > >> >> >> preempts SCHED_IDLE.
> > >> >> >>
> > >> >> >> In this scenario, SCHED_IDLE is running while SCHED_NORMAL is
> > >> >> >> waiting to run. The latency of this SCHED_NORMAL task will be
> > >> >> >> high, which is not acceptable.
> > >> >> >>
> > >> >> >> Doing a load_balance before running the SCHED_IDLE task may fix
> > >> >> >> this problem.
> > >> >> >>
> > >> >> >> This patch works as below:
> > >> >> >>
> > >> >> >>  CPU1  CPU2
> > >> >> >> curr   rq1curr  rq2
> > >> >> >>   +--+ | +--+   +--+ | ++ ++
> > >> >> >> t0|NORMAL| | |NORMAL|   |NORMAL| | |IDLE| |IDLE|
> > >> >> >>   +--+ | +--+   +--+ | ++ ++
> > >> >> >>
> > >> >> >>  NORMAL exits or blocked
> > >> >> >>   +--+ | +--+| ++ ++
> > >> 
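
As a side illustration of the online/offline setup described in the quoted
scenario above, here is a minimal, self-contained user-space sketch of how
an "offline" task opts into SCHED_IDLE. It is only an illustration of the
assumed setup, not part of the posted patch; online tasks simply keep the
default SCHED_OTHER policy.

/* Build: gcc -D_GNU_SOURCE -o idle_task idle_task.c */
#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* SCHED_IDLE requires a static priority of 0. */
	struct sched_param sp = { .sched_priority = 0 };

	if (sched_setscheduler(0 /* self */, SCHED_IDLE, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		return 1;
	}

	/* ... run the low-priority offline work here ... */
	return 0;
}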

[PATCH] sched/fair: consider sched-idle CPU when selecting idle core

2020-07-23 Thread Jiang Biao
From: Jiang Biao 

A sched-idle CPU is already considered in select_idle_cpu() and
select_idle_smt(); it also needs to be considered in select_idle_core()
to be consistent and keep the same *idle* policy.

Signed-off-by: Jiang Biao 
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 04fa8dbcfa4d..f430a9820d08 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6014,7 +6014,7 @@ void __update_idle_core(struct rq *rq)
if (cpu == core)
continue;
 
-   if (!available_idle_cpu(cpu))
+   if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
goto unlock;
}
 
@@ -6045,7 +6045,7 @@ static int select_idle_core(struct task_struct *p, struct 
sched_domain *sd, int
bool idle = true;
 
for_each_cpu(cpu, cpu_smt_mask(core)) {
-   if (!available_idle_cpu(cpu)) {
+   if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) {
idle = false;
break;
}
-- 
2.21.0
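
As a side illustration of what the change above does: in select_idle_core()
a core would now count as usable when every SMT sibling is either truly
idle or running nothing but SCHED_IDLE tasks. The stand-alone toy model
below shows only that predicate; the cpu_state enum and CPUS_PER_CORE are
made up for illustration and are not kernel code (the follow-up discussion
explains why a fully idle core is still preferred).

#include <stdbool.h>
#include <stdio.h>

#define CPUS_PER_CORE 2

enum cpu_state { CPU_IDLE, CPU_SCHED_IDLE_ONLY, CPU_BUSY };

/* With the change: a sibling running only SCHED_IDLE tasks no longer
 * disqualifies the whole core. */
static bool core_is_idle(const enum cpu_state *siblings)
{
	for (int i = 0; i < CPUS_PER_CORE; i++)
		if (siblings[i] != CPU_IDLE && siblings[i] != CPU_SCHED_IDLE_ONLY)
			return false;
	return true;
}

int main(void)
{
	enum cpu_state core0[CPUS_PER_CORE] = { CPU_IDLE, CPU_SCHED_IDLE_ONLY };
	enum cpu_state core1[CPUS_PER_CORE] = { CPU_IDLE, CPU_BUSY };

	printf("core0 usable: %d\n", core_is_idle(core0)); /* 1 with the change */
	printf("core1 usable: %d\n", core_is_idle(core1)); /* 0 */
	return 0;
}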




Re: [PATCH] sched/fair: consider sched-idle CPU when selecting idle core

2020-07-24 Thread Jiang Biao
On Fri, 24 Jul 2020 at 15:24, Vincent Guittot
 wrote:
>
> On Fri, 24 Jul 2020 at 01:39, Jiang Biao  wrote:
> >
> > From: Jiang Biao 
> >
> > Sched-idle CPU has been considered in select_idle_cpu and
> > select_idle_smt, it also needs to be considered in select_idle_core to
> > be consistent and keep the same *idle* policy.
>
> In the case of select_idle_core, we are looking for a core that is
> fully idle but if one CPU of the core is running a sched_idle task,
> the core will not be idle and we might end up having  the wakeup task
> on a CPU and a sched_idle task on another CPU of the core which is not
> what we want
Got it. A sched_idle task may interfere with its sibling, which brings me
another question:
if there's a core with smt1 running a sched_idle task and smt2 idle,
selecting smt1 rather than smt2 should be more helpful for the wakee
task, because the wakee task could preempt the sched_idle task without
any neighbour interference.
There seems to be no consideration of that currently.
Is it worth improving?

Thanks a lot.

Regards,
Jiang


Re: [PATCH] sched/fair: consider sched-idle CPU when selecting idle core

2020-07-24 Thread Jiang Biao
On Fri, 24 Jul 2020 at 18:34, Vincent Guittot
 wrote:
>
> On Fri, 24 Jul 2020 at 10:12, Jiang Biao  wrote:
> >
> > On Fri, 24 Jul 2020 at 15:24, Vincent Guittot
> >  wrote:
> > >
> > > On Fri, 24 Jul 2020 at 01:39, Jiang Biao  wrote:
> > > >
> > > > From: Jiang Biao 
> > > >
> > > > Sched-idle CPU has been considered in select_idle_cpu and
> > > > select_idle_smt, it also needs to be considered in select_idle_core to
> > > > be consistent and keep the same *idle* policy.
> > >
> > > In the case of select_idle_core, we are looking for a core that is
> > > fully idle but if one CPU of the core is running a sched_idle task,
> > > the core will not be idle and we might end up having  the wakeup task
> > > on a CPU and a sched_idle task on another CPU of the core which is not
> > > what we want
> > Got it. sched_idle task may interfere its sibling, which brings me
> > another question,
> > If there's a core with smt1 running sched_idle task and smt2 idle,
> > selecting smt1
> > rather than smt2 should be more helpful for wakee task, because wakee task
> > could suppress the sched_idle task without neighbour interfering.
>
> But the sched_idle will then probably quickly move on the idle smt2
>
> > And there seems to be no consideration about that currently.
> > Is it worth improving that?
>
> This will complexify and extend the duration of the search loop  and
> as mentioned above, it will most probably be a nop at the end because
> of sched_idle task moving on smt2
Indeed, the complexity is not worth it.
Thanks for the explanation.

Regards,
Jiang
>
> >
> > Thanks a lot.
> >
> > Regards,
> > Jiang


Re: [RFC PATCH 09/16] sched/fair: core wide cfs task priority comparison(Internet mail)

2020-07-24 Thread Jiang Biao
On Fri, 24 Jul 2020 at 15:17, Aaron Lu  wrote:
>
> On Wed, Jul 22, 2020 at 12:23:44AM +, benbjiang(蒋彪) wrote:
> >
> >
> > > +/*
> > > + * This function takes care of adjusting the min_vruntime of siblings of
> > > + * a core during coresched enable/disable.
> > > + * This is called in stop machine context so no need to take the rq lock.
> > Hi,
> >
> > IMHO, it seems that stop machine context cannot guarantee freedom from
> > races. The param *cpu* may not be *this_cpu*, so the rq lock should be
> > taken even in stop machine context, and irqs should be disabled too,
> > to avoid potential races with other contexts.
> >
>
> In stop machine context, all CPUs except the active ones are spinning
> with irqs disabled, and in this invocation of stop_machine() only one
> CPU is active, so I don't think a race is possible.
You're right, stop_machine provides much more protection than stop_*_cpu.
Thanks for the explanation.

Regards,
Jiang


Re: [PATCH] sched/fair: consider sched-idle CPU when selecting idle core

2020-07-24 Thread Jiang Biao
On Fri, 24 Jul 2020 at 20:36, Ingo Molnar  wrote:
>
>
> * Jiang Biao  wrote:
>
> > On Fri, 24 Jul 2020 at 18:34, Vincent Guittot
> >  wrote:
> > >
> > > On Fri, 24 Jul 2020 at 10:12, Jiang Biao  wrote:
> > > >
> > > > On Fri, 24 Jul 2020 at 15:24, Vincent Guittot
> > > >  wrote:
> > > > >
> > > > > On Fri, 24 Jul 2020 at 01:39, Jiang Biao  wrote:
> > > > > >
> > > > > > From: Jiang Biao 
> > > > > >
> > > > > > Sched-idle CPU has been considered in select_idle_cpu and
> > > > > > select_idle_smt, it also needs to be considered in select_idle_core 
> > > > > > to
> > > > > > be consistent and keep the same *idle* policy.
> > > > >
> > > > > In the case of select_idle_core, we are looking for a core that is
> > > > > fully idle but if one CPU of the core is running a sched_idle task,
> > > > > the core will not be idle and we might end up having  the wakeup task
> > > > > on a CPU and a sched_idle task on another CPU of the core which is not
> > > > > what we want
> > > > Got it. sched_idle task may interfere its sibling, which brings me
> > > > another question,
> > > > If there's a core with smt1 running sched_idle task and smt2 idle,
> > > > selecting smt1
> > > > rather than smt2 should be more helpful for wakee task, because wakee 
> > > > task
> > > > could suppress the sched_idle task without neighbour interfering.
> > >
> > > But the sched_idle will then probably quickly move on the idle smt2
> > >
> > > > And there seems to be no consideration about that currently.
> > > > Is it worth improving that?
> > >
> > > This will complexify and extend the duration of the search loop  and
> > > as mentioned above, it will most probably be a nop at the end because
> > > of sched_idle task moving on smt2
> > Indeed, the complexity is not worth.
> > Thanks for the explanation.
>
> BTW., if you disagree then you could add a bit of debug
> instrumentation to measure to what extent it's a nop at the end of the
> search loop, to turn the "most probably" statement into a specific
> number.
>
> Thanks,
>
> Ingo
Ok, I'll try.
Thanks for your reply.

Regards,
Jiang


[PATCH v2 2/2] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-17 Thread Jiang Biao
Check the return value of pti_user_pagetable_walk_pmd() to avoid a
NULL pointer dereference, and add a warning for failed allocation.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 7c402e9..3dba6a7 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -203,7 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
-   if (!new_pud_page)
+   if (WARN_ON(!new_pud_page))
return NULL;
 
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
-   if (!new_pmd_page)
+   if (WARN_ON(!new_pmd_page))
return NULL;
 
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -239,8 +239,10 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
+   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   if (!pmd)
+   return NULL;
 
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
-- 
2.7.4
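
The pattern used throughout these pti patches -- warn loudly on a failed
allocation, return NULL, and make every caller check -- can be shown with
a small stand-alone analogue. The WARN_ON() macro below is a toy written
for this sketch, not the kernel's implementation.

#include <stdio.h>
#include <stdlib.h>

#define WARN_ON(cond) \
	({ int __c = !!(cond); \
	   if (__c) \
		fprintf(stderr, "WARN at %s:%d: %s\n", __FILE__, __LINE__, #cond); \
	   __c; })

static int *alloc_table(void)
{
	int *t = calloc(16, sizeof(*t));

	if (WARN_ON(!t))	/* warn once here ... */
		return NULL;	/* ... and propagate the failure */
	return t;
}

static int setup(void)
{
	int *t = alloc_table();

	if (!t)			/* callers must check, as the patches above do */
		return -1;
	t[0] = 42;
	free(t);
	return 0;
}

int main(void)
{
	return setup() ? 1 : 0;
}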



[PATCH v2 1/2] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-17 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL, so we should check the
return value to avoid a NULL pointer dereference, and add a warning
for the failed allocation where NULL is returned.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..7c402e9 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON(!new_p4d_page))
return NULL;
 
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,8 +195,10 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
+   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
 
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
@@ -354,6 +356,8 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return;
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
-- 
2.7.4



[PATCH v3 1/2] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-17 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL, so we should check the
return value to avoid a NULL pointer dereference, and add a warning
for the failed allocation where NULL is returned.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..5c33a16 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON(!new_p4d_page))
return NULL;
 
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,9 +195,12 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
 
+   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
@@ -354,6 +357,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return; 
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
-- 
2.7.4



[PATCH v3 2/2] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-17 Thread Jiang Biao
Check the return value of pti_user_pagetable_walk_pmd() to avoid a
NULL pointer dereference, and add a warning for failed allocation.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 5c33a16..1342f73 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -204,7 +204,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
-   if (!new_pud_page)
+   if (WARN_ON(!new_pud_page))
return NULL;
 
set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -218,7 +218,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
-   if (!new_pmd_page)
+   if (WARN_ON(!new_pmd_page))
return NULL;
 
set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -240,9 +240,12 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
 
+   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   if (!pmd)
+   return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
-- 
2.7.4



[PATCH] x86/speculation: remove SPECTRE_V2_IBRS in enum spectre_v2_mitigation

2018-07-17 Thread Jiang Biao
SPECTRE_V2_IBRS in enum spectre_v2_mitigation is never used; remove
it to avoid confusion.

Signed-off-by: Jiang Biao 
---
 arch/x86/include/asm/nospec-branch.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/include/asm/nospec-branch.h 
b/arch/x86/include/asm/nospec-branch.h
index f6f6c63..c99082e 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -214,7 +214,6 @@ enum spectre_v2_mitigation {
SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
SPECTRE_V2_RETPOLINE_GENERIC,
SPECTRE_V2_RETPOLINE_AMD,
-   SPECTRE_V2_IBRS,
 };
 
 /* The Speculative Store Bypass disable variants */
-- 
2.7.4



[PATCH] asm/irq_vector.h: fix outdated comments

2019-04-21 Thread Jiang Biao
INVALIDATE_TLB_VECTOR_START has been removed by commit
52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by
CALL_FUNCTION_VECTOR"), and VSYSCALL_EMU_VECTOR (204) has also been
removed by commit 3ae36655b97a ("x86-64: Rework vsyscall emulation
and add vsyscall= parameter"), but the comments here are outdated.
Update them.

Signed-off-by: Jiang Biao 
---
 arch/x86/include/asm/irq_vectors.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/irq_vectors.h 
b/arch/x86/include/asm/irq_vectors.h
index 548d90bbf919..889f8b1b5b7f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -18,8 +18,8 @@
  *  Vectors   0 ...  31 : system traps and exceptions - hardcoded events
  *  Vectors  32 ... 127 : device interrupts
  *  Vector  128 : legacy int80 syscall interface
- *  Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 except 204 : device 
interrupts
- *  Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
+ *  Vectors 129 ... LOCAL_TIMER_VECTOR-1
+ *  Vectors LOCAL_TIMER_VECTOR ... 255 : special interrupts
  *
  * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
  *
-- 
2.17.2 (Apple Git-113)



[PATCH] rcu/tree_exp: cleanup initialized but not used rdp

2019-04-22 Thread Jiang Biao
rdp is initialized but never used in synchronize_rcu_expedited();
just remove it.

Signed-off-by: Jiang Biao 
---
 kernel/rcu/tree_exp.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c2a0189e748..5772612379e4 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -733,7 +733,6 @@ static void sync_sched_exp_online_cleanup(int cpu)
  */
 void synchronize_rcu_expedited(void)
 {
-   struct rcu_data *rdp;
struct rcu_exp_work rew;
struct rcu_node *rnp;
unsigned long s;
@@ -770,7 +769,6 @@ void synchronize_rcu_expedited(void)
}
 
/* Wait for expedited grace period to complete. */
-   rdp = per_cpu_ptr(&rcu_data, raw_smp_processor_id());
rnp = rcu_get_root();
wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3],
   sync_exp_work_done(s));
-- 
2.17.2 (Apple Git-113)



[PATCH] rcu/srcutree: make __call_srcu static

2019-04-22 Thread Jiang Biao
__call_srcu() is only used in the current file; just make it static.

Signed-off-by: Jiang Biao 
---
 kernel/rcu/srcutree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index a60b8ba9e1ac..a2ade0c6cd87 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -835,7 +835,7 @@ static void srcu_leak_callback(struct rcu_head *rhp)
  * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
  * srcu_struct structure.
  */
-void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
+static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 rcu_callback_t func, bool do_norm)
 {
unsigned long flags;
-- 
2.17.2 (Apple Git-113)



[PATCH] kvm_main: fix some comments

2019-04-23 Thread Jiang Biao
is_dirty has been renamed to flush, but the comment for it is
outdated. And the description of the @flush parameter for
kvm_clear_dirty_log_protect() is missing; add it in this patch
as well.

Signed-off-by: Jiang Biao 
---
 virt/kvm/kvm_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dc8edc97ba85..40b435e77672 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1138,7 +1138,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
  * and reenable dirty page tracking for the corresponding pages.
  * @kvm:   pointer to kvm instance
  * @log:   slot id and address to which we copy the log
- * @is_dirty:  flag set if any page is dirty
+ * @flush: if TLB flush is needed by caller
  *
  * We need to keep it in mind that VCPU threads can write to the bitmap
  * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1223,6 +1223,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
  * and reenable dirty page tracking for the corresponding pages.
  * @kvm:   pointer to kvm instance
  * @log:   slot id and address from which to fetch the bitmap of dirty 
pages
+ * @flush: if TLB flush is needed by caller
  */
 int kvm_clear_dirty_log_protect(struct kvm *kvm,
struct kvm_clear_dirty_log *log, bool *flush)
-- 
2.17.2 (Apple Git-113)



[PATCH v2] kvm_main: fix some comments

2019-04-23 Thread Jiang Biao
is_dirty has been renamed to flush, but the comment for it is
outdated. And the description of the @flush parameter for
kvm_clear_dirty_log_protect() is missing; add it in this patch
as well.

Signed-off-by: Jiang Biao 
---
 virt/kvm/kvm_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index dc8edc97ba85..6cf7e99e6003 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1134,11 +1134,11 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
 
 #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
 /**
- * kvm_get_dirty_log_protect - get a snapshot of dirty pages, and if any pages
+ * kvm_get_dirty_log_protect - get a snapshot of dirty pages
  * and reenable dirty page tracking for the corresponding pages.
  * @kvm:   pointer to kvm instance
  * @log:   slot id and address to which we copy the log
- * @is_dirty:  flag set if any page is dirty
+ * @flush: true if TLB flush is needed by caller
  *
  * We need to keep it in mind that VCPU threads can write to the bitmap
  * concurrently. So, to avoid losing track of dirty pages we keep the
@@ -1223,6 +1223,7 @@ EXPORT_SYMBOL_GPL(kvm_get_dirty_log_protect);
  * and reenable dirty page tracking for the corresponding pages.
  * @kvm:   pointer to kvm instance
  * @log:   slot id and address from which to fetch the bitmap of dirty 
pages
+ * @flush: true if TLB flush is needed by caller
  */
 int kvm_clear_dirty_log_protect(struct kvm *kvm,
struct kvm_clear_dirty_log *log, bool *flush)
-- 
2.17.2 (Apple Git-113)



[PATCH] fs/quota: erase unused but set variable warning

2019-04-23 Thread Jiang Biao
The local variable *reserved* of remove_dquot_ref() is only used when
CONFIG_QUOTA_DEBUG is defined, but it is not enclosed in a
CONFIG_QUOTA_DEBUG guard, which leads to an unused-but-set-variable
warning when compiling.

This patch encloses it in the CONFIG_QUOTA_DEBUG guard, like what is
done in add_dquot_ref().

Signed-off-by: Jiang Biao 
---
 fs/quota/dquot.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index fc20e06c56ba..14ee4c6deba1 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1049,7 +1049,9 @@ static void remove_dquot_ref(struct super_block *sb, int 
type,
struct list_head *tofree_head)
 {
struct inode *inode;
+#ifdef CONFIG_QUOTA_DEBUG
int reserved = 0;
+#endif
 
spin_lock(&sb->s_inode_list_lock);
list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
@@ -1061,8 +1063,10 @@ static void remove_dquot_ref(struct super_block *sb, int 
type,
 */
spin_lock(&dq_data_lock);
if (!IS_NOQUOTA(inode)) {
+#ifdef CONFIG_QUOTA_DEBUG
if (unlikely(inode_get_rsv_space(inode) > 0))
reserved = 1;
+#endif
remove_inode_dquot_ref(inode, type, tofree_head);
}
spin_unlock(&dq_data_lock);
-- 
2.17.2 (Apple Git-113)



[PATCH] mm/vmscan: fix page_freeze_refs in comment.

2018-07-25 Thread Jiang Biao
page_freeze_refs has already been replaced by page_ref_freeze, but
the comment has not been updated accordingly.

Signed-off-by: Jiang Biao 
---
 mm/vmscan.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..d29e207 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -744,7 +744,7 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
-   /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+   /* note: atomic_cmpxchg in page_refs_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
-- 
2.7.4



[PATCH v2] mm: fix page_freeze_refs and page_unfreeze_refs in comments.

2018-07-26 Thread Jiang Biao
page_freeze_refs/page_unfreeze_refs have already been replaced by
page_ref_freeze/page_ref_unfreeze, but the comments have not been
updated accordingly.

Signed-off-by: Jiang Biao 
---
v1: fix comments in vmscan.
v2: fix the other two places and fix typos.

 mm/ksm.c| 4 ++--
 mm/memory-failure.c | 2 +-
 mm/vmscan.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/ksm.c b/mm/ksm.c
index a6d43cf..4c39cb67 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -703,7 +703,7 @@ static struct page *get_ksm_page(struct stable_node 
*stable_node, bool lock_it)
 * We cannot do anything with the page while its refcount is 0.
 * Usually 0 means free, or tail of a higher-order page: in which
 * case this node is no longer referenced, and should be freed;
-* however, it might mean that the page is under page_freeze_refs().
+* however, it might mean that the page is under page_ref_freeze().
 * The __remove_mapping() case is easy, again the node is now stale;
 * but if page is swapcache in migrate_page_move_mapping(), it might
 * still be our page, in which case it's essential to keep the node.
@@ -714,7 +714,7 @@ static struct page *get_ksm_page(struct stable_node 
*stable_node, bool lock_it)
 * work here too.  We have chosen the !PageSwapCache test to
 * optimize the common case, when the page is or is about to
 * be freed: PageSwapCache is cleared (under spin_lock_irq)
-* in the freeze_refs section of __remove_mapping(); but Anon
+* in the ref_freeze section of __remove_mapping(); but Anon
 * page->mapping reset to NULL later, in free_pages_prepare().
 */
if (!PageSwapCache(page))
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 9d142b9..c83a174 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1167,7 +1167,7 @@ int memory_failure(unsigned long pfn, int flags)
 *R/W the page; let's pray that the page has been
 *used and will be freed some time later.
 * In fact it's dangerous to directly bump up page count from 0,
-* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
+* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
 */
if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
if (is_free_buddy_page(p)) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 03822f8..02d0c20 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -744,7 +744,7 @@ static int __remove_mapping(struct address_space *mapping, 
struct page *page,
refcount = 2;
if (!page_ref_freeze(page, refcount))
goto cannot_free;
-   /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+   /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
if (unlikely(PageDirty(page))) {
page_ref_unfreeze(page, refcount);
goto cannot_free;
-- 
2.7.4



[PATCH v4 2/2] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-19 Thread Jiang Biao
Check the return value of pti_user_pagetable_walk_pmd() to avoid a
NULL pointer dereference, and add a warning for failed allocation.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 8679c64..5cd7b82 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -205,7 +205,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
-   if (!new_pud_page)
+   if (WARN_ON_ONCE(!new_pud_page))
return NULL;

set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
@@ -219,7 +219,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
}
if (pud_none(*pud)) {
unsigned long new_pmd_page = __get_free_page(gfp);
-   if (!new_pmd_page)
+   if (WARN_ON_ONCE(!new_pmd_page))
return NULL;

set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
@@ -241,9 +241,13 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   pmd_t *pmd;
pte_t *pte;

+   pmd = pti_user_pagetable_walk_pmd(address);
+   if (!pmd)
+   return NULL;
+
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
WARN_ON(1);
--
2.7.4



[PATCH v4 1/2] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-19 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL, so we should check the
return value to avoid a NULL pointer dereference, and add a warning
for the failed allocation where NULL is returned.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..8679c64 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)

if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON_ONCE(!new_p4d_page))
return NULL;

set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,9 +195,13 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   p4d_t *p4d;
pud_t *pud;

+   p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
@@ -354,6 +358,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;

user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return;
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
--
2.7.4



[PATCH 0/5] x86/pti: small fixes and cleanups for pti

2018-07-15 Thread Jiang Biao
This is a short series of fixes and cleanups found while reading the
code; no functional changes.

Jiang Biao (5):
  x86/pti: check the return value of pti_user_pagetable_walk_p4d
  x86/pti: check the return value of pti_user_pagetable_walk_pmd
  x86/pti: make pti_set_kernel_image_nonglobal static
  x86/pti: warn for unknown pti boot options
  x86/pti: constify address parameters

 arch/x86/mm/pti.c | 29 ++---
 1 file changed, 18 insertions(+), 11 deletions(-)

-- 
2.7.4



[PATCH 5/5] x86/pti: constify address parameters

2018-07-15 Thread Jiang Biao
Addresses passed to pti_user_pagetable_walk_*() are not supposed to
change at runtime; make them const to avoid future slip-ups.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a368656..ac2cbdd 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -164,7 +164,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
  *
  * Returns a pointer to a P4D on success, or NULL on failure.
  */
-static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+static p4d_t *pti_user_pagetable_walk_p4d(const unsigned long address)
 {
pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
@@ -192,7 +192,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
  *
  * Returns a pointer to a PMD on success, or NULL on failure.
  */
-static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+static pmd_t *pti_user_pagetable_walk_pmd(const unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pud_t *pud;
@@ -236,7 +236,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
  *
  * Returns a pointer to a PTE on success, or NULL on failure.
  */
-static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
+static __init pte_t *pti_user_pagetable_walk_pte(const unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
pte_t *pte;
-- 
2.7.4
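
A tiny stand-alone illustration of what const buys on a by-value parameter:
the caller is unaffected, but any accidental reassignment inside the
function becomes a compile error. The page_align() helper is made up for
this sketch and is not the kernel's code.

/* Stand-alone sketch; page_align() is illustrative only. */
static unsigned long page_align(const unsigned long address)
{
	/* address &= ~0xfffUL;   <-- would not compile: address is const */
	return address & ~0xfffUL;
}

int main(void)
{
	return page_align(0x1234) == 0x1000 ? 0 : 1;
}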



[PATCH 2/5] x86/pti: check the return value of pti_user_pagetable_walk_pmd

2018-07-15 Thread Jiang Biao
pti_user_pagetable_walk_pmd() may return NULL, so we should check the
return value in pti_user_pagetable_walk_pte() to avoid a NULL pointer
dereference, like it is checked in pti_clone_pmds().

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index be9e5bc..bb6f608 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -239,8 +239,10 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long 
address)
 static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
pte_t *pte;
+   pmd_t *pmd = pti_user_pagetable_walk_pmd(address);
+   if (WARN_ON(!pmd))
+   return NULL;
 
/* We can't do anything sensible if we hit a large mapping. */
if (pmd_large(*pmd)) {
-- 
2.7.4



[PATCH 1/5] x86/pti: check the return value of pti_user_pagetable_walk_p4d

2018-07-15 Thread Jiang Biao
pti_user_pagetable_walk_p4d() may return NULL, so we should check the
return value to avoid a NULL pointer dereference.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 4d418e7..be9e5bc 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -195,8 +195,10 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
pud_t *pud;
+   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   if (WARN_ON(!p4d))
+   return NULL;
 
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
@@ -354,6 +356,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (WARN_ON(!user_p4d))
+   return;
+   
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;
-- 
2.7.4



[PATCH 3/5] x86/pti: make pti_set_kernel_image_nonglobal static

2018-07-15 Thread Jiang Biao
pti_set_kernel_image_nonglobal() is only used in pti.c, make it
static.

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index bb6f608..a76b2cc 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -473,7 +473,7 @@ void pti_clone_kernel_text(void)
  * the other set_memory.h functions.  Just extern it.
  */
 extern int set_memory_nonglobal(unsigned long addr, int numpages);
-void pti_set_kernel_image_nonglobal(void)
+static void pti_set_kernel_image_nonglobal(void)
 {
/*
 * The identity map is created with PMDs, regardless of the
-- 
2.7.4



[PATCH 4/5] x86/pti: warn for unknown pti boot options

2018-07-15 Thread Jiang Biao
When an unknown pti boot option other than on/off/auto is used, we
silently select auto, which can be confusing. Add a warning for
unknown pti boot options, like we do in
spectre_v2_select_mitigation().

Signed-off-by: Jiang Biao 
---
 arch/x86/mm/pti.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index a76b2cc..a368656 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -92,14 +92,14 @@ void __init pti_check_boottime_disable(void)
pti_mode = PTI_FORCE_OFF;
pti_print_if_insecure("disabled on command line.");
return;
-   }
-   if (ret == 2 && !strncmp(arg, "on", 2)) {
+   } else if (ret == 2 && !strncmp(arg, "on", 2)) {
pti_mode = PTI_FORCE_ON;
pti_print_if_secure("force enabled on command line.");
goto enable;
-   }
-   if (ret == 4 && !strncmp(arg, "auto", 4)) {
-   pti_mode = PTI_AUTO;
+   } else if (ret == 4 && !strncmp(arg, "auto", 4)) {
+   goto autosel;
+   } else {
+   pr_err("unknown option (%s). Switching to AUTO 
select\n", arg);
goto autosel;
}
}
-- 
2.7.4
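
The parse-and-warn flow can be sketched in a few stand-alone lines; the
option names and the auto fallback mirror the patch, while parse_option()
and the messages are made up for illustration.

#include <stdio.h>
#include <string.h>

enum pti_mode { PTI_AUTO, PTI_FORCE_OFF, PTI_FORCE_ON };

static enum pti_mode parse_option(const char *arg)
{
	if (!strcmp(arg, "off")) {
		printf("pti: disabled on command line.\n");
		return PTI_FORCE_OFF;
	} else if (!strcmp(arg, "on")) {
		printf("pti: force enabled on command line.\n");
		return PTI_FORCE_ON;
	} else if (!strcmp(arg, "auto")) {
		return PTI_AUTO;
	}

	/* Unknown value: warn instead of silently selecting auto. */
	fprintf(stderr, "pti: unknown option (%s). Switching to AUTO select\n", arg);
	return PTI_AUTO;
}

int main(void)
{
	parse_option("on");
	parse_option("bogus");	/* takes the warning path */
	return 0;
}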



[PATCH] mm/vmstat.c: erase latency in vmstat_shepherd

2021-01-10 Thread Jiang Biao
From: Jiang Biao 

Many 100us+ latencies have been detected in vmstat_shepherd()
on a CPX platform which has 208 logical CPUs, and vmstat_shepherd()
is queued every second, which could make the case worse.

Add a scheduling point in vmstat_shepherd() to eliminate the latency.

Signed-off-by: Jiang Biao 
Reported-by: Bin Lai 
---
 mm/vmstat.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..b538199883b5 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1953,6 +1953,8 @@ static void vmstat_shepherd(struct work_struct *w)
 
if (!delayed_work_pending(dw) && need_update(cpu))
queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
+
+   cond_resched();
}
put_online_cpus();
 
-- 
2.21.0
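
As a rough user-space analogue of the change above: when one pass over a
large set of items can take a long time, yielding the CPU between items
bounds the latency seen by other runnable tasks. sched_yield() below is
only a loose stand-in for the in-kernel cond_resched() point.

#include <sched.h>
#include <stdio.h>

#define NR_ITEMS 208	/* e.g. one entry per logical CPU */

static void process(int item)
{
	(void)item;	/* ... per-item work that is individually cheap ... */
}

int main(void)
{
	for (int i = 0; i < NR_ITEMS; i++) {
		process(i);
		sched_yield();	/* give other runnable tasks a chance each step */
	}
	printf("done\n");
	return 0;
}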



[PATCH] mm/oom_kill: fix wild pointer in out_of_memory

2021-03-12 Thread Jiang Biao
From: Bin Lai 


oc->chosen is used by the OOM implementation, but the caller does
not initialize this field. If the tasks of the memcg are all unkillable,
oom_evaluate_task() cannot choose any task, and oc->chosen will be
a wild pointer. So we should initialize oc->chosen before calling
oom_evaluate_task().

Signed-off-by: Bin Lai 
Reviewed-by: Jiang Biao 

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9efaf43..0658a30 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -366,6 +366,7 @@ static int oom_evaluate_task(struct task_struct *task, void 
*arg)
 static void select_bad_process(struct oom_control *oc)
 {
oc->chosen_points = LONG_MIN;
+   oc->chosen = NULL;
 
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
-- 
1.8.3.1
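
The bug class is easy to reproduce outside the kernel: a scan that only
conditionally sets an output pointer leaves the caller with stale or
garbage data when nothing matches. The stand-alone sketch below, with a
made-up item list, shows why the fix resets chosen to NULL before the scan.

#include <stddef.h>
#include <stdio.h>

struct item { int killable; int score; };

struct control {
	struct item *chosen;
	int chosen_points;
};

static void select_best(struct control *c, struct item *items, int n)
{
	c->chosen_points = -1;
	c->chosen = NULL;	/* the fix: never leave a stale/wild pointer */

	for (int i = 0; i < n; i++) {
		if (!items[i].killable)
			continue;
		if (items[i].score > c->chosen_points) {
			c->chosen_points = items[i].score;
			c->chosen = &items[i];
		}
	}
}

int main(void)
{
	struct item unkillable[2] = { { 0, 10 }, { 0, 20 } };
	struct control c;

	select_best(&c, unkillable, 2);
	if (!c.chosen)	/* the caller can now detect "nothing chosen" safely */
		printf("no candidate found\n");
	return 0;
}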



[tip:x86/pti] x86/pti: Check the return value of pti_user_pagetable_walk_p4d()

2018-07-19 Thread tip-bot for Jiang Biao
Commit-ID:  b2b7d986a89b6c94b1331a909de1217214fb08c1
Gitweb: https://git.kernel.org/tip/b2b7d986a89b6c94b1331a909de1217214fb08c1
Author: Jiang Biao 
AuthorDate: Fri, 20 Jul 2018 08:06:31 +0800
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 07:07:39 +0200

x86/pti: Check the return value of pti_user_pagetable_walk_p4d()

pti_user_pagetable_walk_p4d() can return NULL, so the return value should
be checked to prevent a NULL pointer dereference.

Add the check and a warning when the P4D allocation fails.

Signed-off-by: Jiang Biao 
Signed-off-by: Thomas Gleixner 
Cc: dave.han...@linux.intel.com
Cc: l...@kernel.org
Cc: h...@zytor.com
Cc: albca...@gmail.com
Cc: zhong.weid...@zte.com.cn
Link: 
https://lkml.kernel.org/r/1532045192-49622-1-git-send-email-jiang.bi...@zte.com.cn

---
 arch/x86/mm/pti.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
index 7b1c85759005..001ee6b0619e 100644
--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -176,7 +176,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 
if (pgd_none(*pgd)) {
unsigned long new_p4d_page = __get_free_page(gfp);
-   if (!new_p4d_page)
+   if (WARN_ON_ONCE(!new_p4d_page))
return NULL;
 
set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
@@ -195,9 +195,13 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long 
address)
 static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
-   p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
+   p4d_t *p4d;
pud_t *pud;
 
+   p4d = pti_user_pagetable_walk_p4d(address);
+   if (!p4d)
+   return NULL;
+
BUILD_BUG_ON(p4d_large(*p4d) != 0);
if (p4d_none(*p4d)) {
unsigned long new_pud_page = __get_free_page(gfp);
@@ -359,6 +363,9 @@ static void __init pti_clone_p4d(unsigned long addr)
pgd_t *kernel_pgd;
 
user_p4d = pti_user_pagetable_walk_p4d(addr);
+   if (!user_p4d)
+   return;
+
kernel_pgd = pgd_offset_k(addr);
kernel_p4d = p4d_offset(kernel_pgd, addr);
*user_p4d = *kernel_p4d;

