[PATCH v3] firmware_loader: fix memory leak for paged buffer

2020-08-20 Thread Prateek Sood
vfree() is being called on a paged buffer allocated
using alloc_page() and mapped using vmap().

Freeing of pages in vfree() relies on nr_pages of
struct vm_struct, but vmap() does not update nr_pages.
This can lead to memory leaks.
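
For illustration, a minimal sketch (not part of the patch; the function name
is made up) of the teardown a vmap()-built buffer needs. vfree() only frees
pages recorded in vm_struct pages/nr_pages, which vmap() leaves unset, so the
backing pages have to be freed explicitly:

#include <linux/mm.h>
#include <linux/vmalloc.h>

/* Release a buffer assembled from alloc_page() calls and mapped with vmap().
 * Calling vfree() on vaddr would only drop the mapping and leak the pages,
 * because the vm_struct created by vmap() carries no page array to walk.
 */
static void paged_buf_free_sketch(void *vaddr, struct page **pages, int nr_pages)
{
	int i;

	vunmap(vaddr);			/* undo the vmap() mapping */
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);	/* free each backing page */
	kvfree(pages);			/* free the page-pointer array */
}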

Fixes: ddaf29fd9bb6 ("firmware: Free temporary page table after vmapping")
Signed-off-by: Prateek Sood 
Reviewed-by: Takashi Iwai 
Cc: sta...@vger.kernel.org
---
 drivers/base/firmware_loader/firmware.h |  2 ++
 drivers/base/firmware_loader/main.c     | 17 +++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/base/firmware_loader/firmware.h b/drivers/base/firmware_loader/firmware.h
index 933e2192..d08efc7 100644
--- a/drivers/base/firmware_loader/firmware.h
+++ b/drivers/base/firmware_loader/firmware.h
@@ -142,10 +142,12 @@ static inline void fw_state_done(struct fw_priv *fw_priv)
 void fw_free_paged_buf(struct fw_priv *fw_priv);
 int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed);
 int fw_map_paged_buf(struct fw_priv *fw_priv);
+bool fw_is_paged_buf(struct fw_priv *fw_priv);
 #else
 static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {}
 static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; }
 static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; }
+static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; }
 #endif
 
 #endif /* __FIRMWARE_LOADER_H */
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index ca871b1..36bf455 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -252,9 +252,11 @@ static void __free_fw_priv(struct kref *ref)
 	list_del(&fw_priv->list);
 	spin_unlock(&fwc->lock);
 
-	fw_free_paged_buf(fw_priv); /* free leftover pages */
-	if (!fw_priv->allocated_size)
+	if (fw_is_paged_buf(fw_priv))
+		fw_free_paged_buf(fw_priv);
+	else if (!fw_priv->allocated_size)
 		vfree(fw_priv->data);
+
 	kfree_const(fw_priv->fw_name);
 	kfree(fw_priv);
 }
@@ -268,6 +270,11 @@ static void free_fw_priv(struct fw_priv *fw_priv)
 }
 
 #ifdef CONFIG_FW_LOADER_PAGED_BUF
+bool fw_is_paged_buf(struct fw_priv *fw_priv)
+{
+   return fw_priv->is_paged_buf;
+}
+
 void fw_free_paged_buf(struct fw_priv *fw_priv)
 {
int i;
@@ -275,6 +282,8 @@ void fw_free_paged_buf(struct fw_priv *fw_priv)
if (!fw_priv->pages)
return;
 
+   vunmap(fw_priv->data);
+
for (i = 0; i < fw_priv->nr_pages; i++)
__free_page(fw_priv->pages[i]);
kvfree(fw_priv->pages);
@@ -328,10 +337,6 @@ int fw_map_paged_buf(struct fw_priv *fw_priv)
if (!fw_priv->data)
return -ENOMEM;
 
-   /* page table is no longer needed after mapping, let's free */
-   kvfree(fw_priv->pages);
-   fw_priv->pages = NULL;
-
return 0;
 }
 #endif
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



[PATCH v2] firmware_loader: fix memory leak for paged buffer

2020-08-13 Thread Prateek Sood
vfree() is being called on a paged buffer allocated
using alloc_page() and mapped using vmap().

Freeing of pages in vfree() relies on nr_pages of
struct vm_struct, but vmap() does not update nr_pages.
This can lead to memory leaks.

Fixes: ddaf29fd9bb6 ("firmware: Free temporary page table after vmapping")
Signed-off-by: Prateek Sood 
Reviewed-by: Takashi Iwai 
---
 drivers/base/firmware_loader/firmware.h |  2 ++
 drivers/base/firmware_loader/main.c     | 17 +++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/base/firmware_loader/firmware.h b/drivers/base/firmware_loader/firmware.h
index 933e2192..d08efc7 100644
--- a/drivers/base/firmware_loader/firmware.h
+++ b/drivers/base/firmware_loader/firmware.h
@@ -142,10 +142,12 @@ static inline void fw_state_done(struct fw_priv *fw_priv)
 void fw_free_paged_buf(struct fw_priv *fw_priv);
 int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed);
 int fw_map_paged_buf(struct fw_priv *fw_priv);
+bool fw_is_paged_buf(struct fw_priv *fw_priv);
 #else
 static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {}
 static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; }
 static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; }
+static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; }
 #endif
 
 #endif /* __FIRMWARE_LOADER_H */
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index ca871b1..36bf455 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -252,9 +252,11 @@ static void __free_fw_priv(struct kref *ref)
 	list_del(&fw_priv->list);
 	spin_unlock(&fwc->lock);
 
-	fw_free_paged_buf(fw_priv); /* free leftover pages */
-	if (!fw_priv->allocated_size)
+	if (fw_is_paged_buf(fw_priv))
+		fw_free_paged_buf(fw_priv);
+	else if (!fw_priv->allocated_size)
 		vfree(fw_priv->data);
+
 	kfree_const(fw_priv->fw_name);
 	kfree(fw_priv);
 }
@@ -268,6 +270,11 @@ static void free_fw_priv(struct fw_priv *fw_priv)
 }
 
 #ifdef CONFIG_FW_LOADER_PAGED_BUF
+bool fw_is_paged_buf(struct fw_priv *fw_priv)
+{
+   return fw_priv->is_paged_buf;
+}
+
 void fw_free_paged_buf(struct fw_priv *fw_priv)
 {
int i;
@@ -275,6 +282,8 @@ void fw_free_paged_buf(struct fw_priv *fw_priv)
if (!fw_priv->pages)
return;
 
+   vunmap(fw_priv->data);
+
for (i = 0; i < fw_priv->nr_pages; i++)
__free_page(fw_priv->pages[i]);
kvfree(fw_priv->pages);
@@ -328,10 +337,6 @@ int fw_map_paged_buf(struct fw_priv *fw_priv)
if (!fw_priv->data)
return -ENOMEM;
 
-   /* page table is no longer needed after mapping, let's free */
-   kvfree(fw_priv->pages);
-   fw_priv->pages = NULL;
-
return 0;
 }
 #endif
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH] firmware_loader: fix memory leak for paged buffer

2020-08-13 Thread Prateek Sood



On 8/13/2020 6:28 PM, Takashi Iwai wrote:

On Wed, 12 Aug 2020 21:00:19 +0200,
Prateek Sood wrote:

vfree() is being called on a paged buffer allocated
using alloc_page() and mapped using vmap().

Freeing of pages in vfree() relies on nr_pages of
struct vm_struct, but vmap() does not update nr_pages.
This can lead to memory leaks.

Signed-off-by: Prateek Sood 

Thanks for spotting this out!  This is essentially a revert of the
commit ddaf29fd9bb6 ("firmware: Free temporary page table after
vmapping"), so better to mention it via Fixes tag as well as Cc to
stable.
About the changes:

--- a/drivers/base/firmware_loader/firmware.h
+++ b/drivers/base/firmware_loader/firmware.h
@@ -142,10 +142,12 @@ static inline void fw_state_done(struct fw_priv *fw_priv)
  void fw_free_paged_buf(struct fw_priv *fw_priv);
  int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed);
  int fw_map_paged_buf(struct fw_priv *fw_priv);
+bool fw_is_paged_buf(struct fw_priv *fw_priv);

I guess this isn't necessary if we just swap the call order of
fw_free_paged_buf() and vfree(); then fw_priv->is_paged_buf is
referred only in fw_free_paged_buf().
That is, something like below.

In anyway, take my review tag:
   Reviewed-by: Takashi Iwai 


Thanks for reviewing.

I would prefer to keep the patch as is, so that vmap() and vunmap() are
used as a pair for readability. Will upload a new version with the Fixes
tag and your Reviewed-by.




thanks,

Takashi

--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -252,9 +252,9 @@ static void __free_fw_priv(struct kref *ref)
 	list_del(&fw_priv->list);
 	spin_unlock(&fwc->lock);
 
-	fw_free_paged_buf(fw_priv); /* free leftover pages */
 	if (!fw_priv->allocated_size)
 		vfree(fw_priv->data);
+	fw_free_paged_buf(fw_priv); /* free leftover pages */
 	kfree_const(fw_priv->fw_name);
 	kfree(fw_priv);
 }
@@ -272,7 +272,7 @@ void fw_free_paged_buf(struct fw_priv *fw_priv)
 {
 	int i;
 
-	if (!fw_priv->pages)
+	if (!fw_priv->is_paged_buf)
 		return;
 
 	for (i = 0; i < fw_priv->nr_pages; i++)
@@ -328,10 +328,6 @@ int fw_map_paged_buf(struct fw_priv *fw_priv)
 	if (!fw_priv->data)
 		return -ENOMEM;
 
-	/* page table is no longer needed after mapping, let's free */
-	kvfree(fw_priv->pages);
-	fw_priv->pages = NULL;
-
 	return 0;
 }
 #endif


--
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project



[PATCH] firmware_loader: fix memory leak for paged buffer

2020-08-12 Thread Prateek Sood
vfree() is being called on a paged buffer allocated
using alloc_page() and mapped using vmap().

Freeing of pages in vfree() relies on nr_pages of
struct vm_struct, but vmap() does not update nr_pages.
This can lead to memory leaks.

Signed-off-by: Prateek Sood 
---
 drivers/base/firmware_loader/firmware.h |  2 ++
 drivers/base/firmware_loader/main.c     | 17 +++++++++++------
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/base/firmware_loader/firmware.h b/drivers/base/firmware_loader/firmware.h
index 933e2192..d08efc7 100644
--- a/drivers/base/firmware_loader/firmware.h
+++ b/drivers/base/firmware_loader/firmware.h
@@ -142,10 +142,12 @@ static inline void fw_state_done(struct fw_priv *fw_priv)
 void fw_free_paged_buf(struct fw_priv *fw_priv);
 int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed);
 int fw_map_paged_buf(struct fw_priv *fw_priv);
+bool fw_is_paged_buf(struct fw_priv *fw_priv);
 #else
 static inline void fw_free_paged_buf(struct fw_priv *fw_priv) {}
 static inline int fw_grow_paged_buf(struct fw_priv *fw_priv, int pages_needed) { return -ENXIO; }
 static inline int fw_map_paged_buf(struct fw_priv *fw_priv) { return -ENXIO; }
+static inline bool fw_is_paged_buf(struct fw_priv *fw_priv) { return false; }
 #endif
 
 #endif /* __FIRMWARE_LOADER_H */
diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c
index ca871b1..36bf455 100644
--- a/drivers/base/firmware_loader/main.c
+++ b/drivers/base/firmware_loader/main.c
@@ -252,9 +252,11 @@ static void __free_fw_priv(struct kref *ref)
 	list_del(&fw_priv->list);
 	spin_unlock(&fwc->lock);
 
-	fw_free_paged_buf(fw_priv); /* free leftover pages */
-	if (!fw_priv->allocated_size)
+	if (fw_is_paged_buf(fw_priv))
+		fw_free_paged_buf(fw_priv);
+	else if (!fw_priv->allocated_size)
 		vfree(fw_priv->data);
+
 	kfree_const(fw_priv->fw_name);
 	kfree(fw_priv);
 }
@@ -268,6 +270,11 @@ static void free_fw_priv(struct fw_priv *fw_priv)
 }
 
 #ifdef CONFIG_FW_LOADER_PAGED_BUF
+bool fw_is_paged_buf(struct fw_priv *fw_priv)
+{
+   return fw_priv->is_paged_buf;
+}
+
 void fw_free_paged_buf(struct fw_priv *fw_priv)
 {
int i;
@@ -275,6 +282,8 @@ void fw_free_paged_buf(struct fw_priv *fw_priv)
if (!fw_priv->pages)
return;
 
+   vunmap(fw_priv->data);
+
for (i = 0; i < fw_priv->nr_pages; i++)
__free_page(fw_priv->pages[i]);
kvfree(fw_priv->pages);
@@ -328,10 +337,6 @@ int fw_map_paged_buf(struct fw_priv *fw_priv)
if (!fw_priv->data)
return -ENOMEM;
 
-   /* page table is no longer needed after mapping, let's free */
-   kvfree(fw_priv->pages);
-   fw_priv->pages = NULL;
-
return 0;
 }
 #endif
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH] trace: fix race in perf_trace_buf initialization

2019-10-20 Thread Prateek Sood
On 10/19/19 2:42 AM, Steven Rostedt wrote:
> 
> Peter told me it was fine, but I'm also adding Song who is the author
> of the code.
> 
> Also needs the tags (which I'll add)
> 
> Cc: sta...@vger.kernel.org
> Fixes: e12f03d7031a9 ("perf/core: Implement the 'perf_kprobe' PMU")
> 
> -- Steve
> 
> 
> On Tue, 15 Oct 2019 11:47:25 +0530
> Prateek Sood  wrote:
> 
>> [  943.034988] Unable to handle kernel paging request at virtual address 
>> 003106f2003c
>> [  943.043653] Mem abort info:
>> [  943.046679]   ESR = 0x9645
>> [  943.050428]   Exception class = DABT (current EL), IL = 32 bits
>> [  943.056643]   SET = 0, FnV = 0
>> [  943.060168]   EA = 0, S1PTW = 0
>> [  943.063449] Data abort info:
>> [  943.066474]   ISV = 0, ISS = 0x0045
>> [  943.070856]   CM = 0, WnR = 1
>> [  943.074016] user pgtable: 4k pages, 39-bit VAs, pgdp = ffc034b9b000
>> [  943.081446] [003106f2003c] pgd=, pud=
>> [  943.088862] Internal error: Oops: 9645 [#1] PREEMPT SMP
>> [  943.141700] Process syz-executor (pid: 18393, stack limit = 
>> 0xffc09319)
>> [  943.164146] pstate: 8045 (Nzcv daif +PAN -UAO)
>> [  943.169119] pc : __memset+0x20/0x1ac
>> [  943.172831] lr : memset+0x3c/0x50
>> [  943.176269] sp : ffc09319fc50
>>
>> [  943.557593]  __memset+0x20/0x1ac
>> [  943.560953]  perf_trace_buf_alloc+0x140/0x1a0
>> [  943.565472]  perf_trace_sys_enter+0x158/0x310
>> [  943.569985]  syscall_trace_enter+0x348/0x7c0
>> [  943.574413]  el0_svc_common+0x11c/0x368
>> [  943.578394]  el0_svc_handler+0x12c/0x198
>> [  943.582459]  el0_svc+0x8/0xc
>>
>> In Ramdumps:
>> total_ref_count = 3
>> perf_trace_buf = (
>> 0x0 -> NULL,
>> 0x0 -> NULL,
>> 0x0 -> NULL,
>> 0x0 -> NULL)
>>
>> event_call in perf_trace_sys_enter()
>> event_call = 0xFF900CB511D8 -> (
>> list = (next = 0xFF900CB4E2E0, prev = 0xFF900CB512B0),
>> class = 0xFF900CDC8308,
>> name = 0xFF900CDDA1D8,
>> tp = 0xFF900CDDA1D8,
>> event = (
>>   node = (next = 0x0, pprev = 0xFF900CB80210),
>>   list = (next = 0xFF900CB512E0, prev = 0xFF900CB4E310),
>>   type = 21,
>>   funcs = 0xFF900CB51130),
>> print_fmt = 0xFF900CB51150,
>> filter = 0x0,
>> mod = 0x0,
>> data = 0x0,
>> flags = 18,
>> perf_refcount = 1,
>> perf_events = 0xFF8DB8E54158,
>> prog_array = 0x0,
>> perf_perm = 0x0)
>>
>> perf_events added on CPU0
>> (struct hlist_head *)(0xFF8DB8E54158+__per_cpu_offset[0]) -> (
>> first = 0xFFC0980FD0E0 -> (
>>   next = 0x0,
>>   pprev = 0xFFBEBFD74158))
>>
>> Could you please confirm:
>> 1) the race mentioned below exists or not.
>> 2) if following patch fixes it.
>>
>>
>>> 8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8 
>>>  
>>
>> A race condition exists while initializing perf_trace_buf from
>> perf_trace_init() and perf_kprobe_init().
>>
>>   CPU0CPU1
>> perf_trace_init()
>>   mutex_lock(&event_mutex)
>> perf_trace_event_init()
>>   perf_trace_event_reg()
>> total_ref_count == 0
>>  buf = alloc_percpu()
>> perf_trace_buf[i] = buf
>> tp_event->class->reg() //fails   perf_kprobe_init()
>>  goto fail  perf_trace_event_init()
>>  perf_trace_event_reg()
>> fail:
>>total_ref_count == 0
>>
>>total_ref_count == 0
>>        buf = alloc_percpu()
>>perf_trace_buf[i] = buf
>>tp_event->class->reg()
>>total_ref_count++
>>
>>   free_percpu(perf_trace_buf[i])
>>   perf_trace_buf[i] = NULL
>>
>> Any subsequent call to perf_trace_event_reg() will observe total_ref_count > 0,
>> causing perf_trace_buf to stay NULL. This can result in perf_trace_buf
>> getting accessed from perf_trace_buf_allo

Re: [PATCH] trace: fix race in perf_trace_buf initialization

2019-10-17 Thread Prateek Sood
On 10/15/19 11:47 AM, Prateek Sood wrote:
> [  943.034988] Unable to handle kernel paging request at virtual address 
> 003106f2003c
> [  943.043653] Mem abort info:
> [  943.046679]   ESR = 0x9645
> [  943.050428]   Exception class = DABT (current EL), IL = 32 bits
> [  943.056643]   SET = 0, FnV = 0
> [  943.060168]   EA = 0, S1PTW = 0
> [  943.063449] Data abort info:
> [  943.066474]   ISV = 0, ISS = 0x0045
> [  943.070856]   CM = 0, WnR = 1
> [  943.074016] user pgtable: 4k pages, 39-bit VAs, pgdp = ffc034b9b000
> [  943.081446] [003106f2003c] pgd=, pud=
> [  943.088862] Internal error: Oops: 9645 [#1] PREEMPT SMP
> [  943.141700] Process syz-executor (pid: 18393, stack limit = 
> 0xffc09319)
> [  943.164146] pstate: 8045 (Nzcv daif +PAN -UAO)
> [  943.169119] pc : __memset+0x20/0x1ac
> [  943.172831] lr : memset+0x3c/0x50
> [  943.176269] sp : ffc09319fc50
> 
> [  943.557593]  __memset+0x20/0x1ac
> [  943.560953]  perf_trace_buf_alloc+0x140/0x1a0
> [  943.565472]  perf_trace_sys_enter+0x158/0x310
> [  943.569985]  syscall_trace_enter+0x348/0x7c0
> [  943.574413]  el0_svc_common+0x11c/0x368
> [  943.578394]  el0_svc_handler+0x12c/0x198
> [  943.582459]  el0_svc+0x8/0xc
> 
> In Ramdumps:
> total_ref_count = 3
> perf_trace_buf = (
> 0x0 -> NULL,
> 0x0 -> NULL,
> 0x0 -> NULL,
> 0x0 -> NULL)
> 
> event_call in perf_trace_sys_enter()
> event_call = 0xFF900CB511D8 -> (
> list = (next = 0xFF900CB4E2E0, prev = 0xFF900CB512B0),
> class = 0xFF900CDC8308,
> name = 0xFF900CDDA1D8,
> tp = 0xFF900CDDA1D8,
> event = (
>   node = (next = 0x0, pprev = 0xFF900CB80210),
>   list = (next = 0xFF900CB512E0, prev = 0xFF900CB4E310),
>   type = 21,
>   funcs = 0xFF900CB51130),
> print_fmt = 0xFF900CB51150,
> filter = 0x0,
> mod = 0x0,
> data = 0x0,
> flags = 18,
> perf_refcount = 1,
> perf_events = 0xFF8DB8E54158,
> prog_array = 0x0,
> perf_perm = 0x0)
> 
> perf_events added on CPU0
> (struct hlist_head *)(0xFF8DB8E54158+__per_cpu_offset[0]) -> (
> first = 0xFFC0980FD0E0 -> (
>   next = 0x0,
>   pprev = 0xFFBEBFD74158))
> 
> Could you please confirm:
> 1) the race mentioned below exists or not.
> 2) if following patch fixes it.
> 
> 
>> 8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8
> 
> A race condition exists while initializing perf_trace_buf from
> perf_trace_init() and perf_kprobe_init().
> 
>   CPU0CPU1
> perf_trace_init()
>   mutex_lock(&event_mutex)
> perf_trace_event_init()
>   perf_trace_event_reg()
> total_ref_count == 0
>   buf = alloc_percpu()
> perf_trace_buf[i] = buf
> tp_event->class->reg() //fails   perf_kprobe_init()
>   goto fail  perf_trace_event_init()
>  perf_trace_event_reg()
> fail:
> total_ref_count == 0
> 
>total_ref_count == 0
>buf = alloc_percpu()
>perf_trace_buf[i] = buf
>tp_event->class->reg()
>total_ref_count++
> 
>   free_percpu(perf_trace_buf[i])
>   perf_trace_buf[i] = NULL
> 
> Any subsequent call to perf_trace_event_reg() will observe total_ref_count > 0,
> causing perf_trace_buf to stay NULL. This can result in perf_trace_buf getting
> accessed from perf_trace_buf_alloc() without being initialized. Acquiring
> event_mutex in perf_kprobe_init() before calling perf_trace_event_init() should
> fix this race.
> 
> Signed-off-by: Prateek Sood 
> ---
>  kernel/trace/trace_event_perf.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
> index 4629a61..48ee92c 100644
> --- a/kernel/trace/trace_event_perf.c
> +++ b/kernel/trace/trace_event_perf.c
> @@ -272,9 +272,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
>  		goto out;
>  	}
>  
> +	mutex_lock(&event_mutex);
>  	ret = perf_trace_event_init(tp_event, p_event);
>  	if (ret)
>  

[PATCH] trace: fix race in perf_trace_buf initialization

2019-10-15 Thread Prateek Sood


[  943.034988] Unable to handle kernel paging request at virtual address 
003106f2003c
[  943.043653] Mem abort info:
[  943.046679]   ESR = 0x9645
[  943.050428]   Exception class = DABT (current EL), IL = 32 bits
[  943.056643]   SET = 0, FnV = 0
[  943.060168]   EA = 0, S1PTW = 0
[  943.063449] Data abort info:
[  943.066474]   ISV = 0, ISS = 0x0045
[  943.070856]   CM = 0, WnR = 1
[  943.074016] user pgtable: 4k pages, 39-bit VAs, pgdp = ffc034b9b000
[  943.081446] [003106f2003c] pgd=, pud=
[  943.088862] Internal error: Oops: 9645 [#1] PREEMPT SMP
[  943.141700] Process syz-executor (pid: 18393, stack limit = 
0xffc09319)
[  943.164146] pstate: 8045 (Nzcv daif +PAN -UAO)
[  943.169119] pc : __memset+0x20/0x1ac
[  943.172831] lr : memset+0x3c/0x50
[  943.176269] sp : ffc09319fc50

[  943.557593]  __memset+0x20/0x1ac
[  943.560953]  perf_trace_buf_alloc+0x140/0x1a0
[  943.565472]  perf_trace_sys_enter+0x158/0x310
[  943.569985]  syscall_trace_enter+0x348/0x7c0
[  943.574413]  el0_svc_common+0x11c/0x368
[  943.578394]  el0_svc_handler+0x12c/0x198
[  943.582459]  el0_svc+0x8/0xc

In Ramdumps:
total_ref_count = 3
perf_trace_buf = (
0x0 -> NULL,
0x0 -> NULL,
0x0 -> NULL,
0x0 -> NULL)

event_call in perf_trace_sys_enter()
event_call = 0xFF900CB511D8 -> (
list = (next = 0xFF900CB4E2E0, prev = 0xFF900CB512B0),
class = 0xFF900CDC8308,
name = 0xFF900CDDA1D8,
tp = 0xFF900CDDA1D8,
event = (
  node = (next = 0x0, pprev = 0xFF900CB80210),
  list = (next = 0xFF900CB512E0, prev = 0xFF900CB4E310),
  type = 21,
  funcs = 0xFF900CB51130),
print_fmt = 0xFF900CB51150,
filter = 0x0,
mod = 0x0,
data = 0x0,
flags = 18,
perf_refcount = 1,
perf_events = 0xFF8DB8E54158,
prog_array = 0x0,
perf_perm = 0x0)

perf_events added on CPU0
(struct hlist_head *)(0xFF8DB8E54158+__per_cpu_offset[0]) -> (
first = 0xFFC0980FD0E0 -> (
  next = 0x0,
  pprev = 0xFFBEBFD74158))

Could you please confirm:
1) the race mentioned below exists or not.
2) if following patch fixes it.


>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8>8

A race condition exists while initializing perf_trace_buf from
perf_trace_init() and perf_kprobe_init().

  CPU0CPU1
perf_trace_init()
  mutex_lock(&event_mutex)
perf_trace_event_init()
  perf_trace_event_reg()
total_ref_count == 0
buf = alloc_percpu()
perf_trace_buf[i] = buf
tp_event->class->reg() //fails   perf_kprobe_init()
goto fail  perf_trace_event_init()
 perf_trace_event_reg()
fail:
  total_ref_count == 0

   total_ref_count == 0
   buf = alloc_percpu()
   perf_trace_buf[i] = buf
   tp_event->class->reg()
   total_ref_count++

  free_percpu(perf_trace_buf[i])
  perf_trace_buf[i] = NULL

Any subsequent call to perf_trace_event_reg() will observe total_ref_count > 0,
causing perf_trace_buf to stay NULL. This can result in perf_trace_buf getting
accessed from perf_trace_buf_alloc() without being initialized. Acquiring
event_mutex in perf_kprobe_init() before calling perf_trace_event_init() should
fix this race.
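
For illustration, a minimal userspace model of the check-then-act pattern
involved (register_event() and its helpers are hypothetical names, not kernel
APIs). Holding the mutex across the whole sequence, as the patch does with
event_mutex, is what keeps a failing caller from freeing the buffer that a
concurrent, succeeding caller has just published:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t event_mutex = PTHREAD_MUTEX_INITIALIZER;
static int total_ref_count;	/* models total_ref_count */
static void *trace_buf;		/* models perf_trace_buf[] */

/* First user allocates the shared buffer, later users only bump the count. */
static int register_event(int backend_ok)
{
	int ret = 0;

	pthread_mutex_lock(&event_mutex);
	if (!total_ref_count)
		trace_buf = malloc(4096);	/* set up the shared buffer */
	if (!backend_ok) {
		/* failure path: tear down only if nobody holds a reference;
		 * without the lock this check races with a concurrent caller */
		if (!total_ref_count) {
			free(trace_buf);
			trace_buf = NULL;
		}
		ret = -1;
	} else {
		total_ref_count++;
	}
	pthread_mutex_unlock(&event_mutex);
	return ret;
}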

Signed-off-by: Prateek Sood 
---
 kernel/trace/trace_event_perf.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4629a61..48ee92c 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -272,9 +272,11 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
 		goto out;
 	}
 
+	mutex_lock(&event_mutex);
 	ret = perf_trace_event_init(tp_event, p_event);
 	if (ret)
 		destroy_local_trace_kprobe(tp_event);
+	mutex_unlock(&event_mutex);
 out:
 	kfree(func);
 	return ret;
@@ -282,8 +284,10 @@ int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
 
 void perf_kprobe_destroy(struct perf_event *p_event)
 {
+	mutex_lock(&event_mutex);
 	perf_trace_event_close(p_event);
 	perf_trace_event_unreg(p_event);
+	mutex_unlock(&event_mutex);
 
 	destroy_local_trace_kprobe(p_event->tp_event);
 }
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH v7] driver core: Fix use-after-free and double free on glue directory

2019-07-28 Thread Prateek Sood
> 4. Update commit message.
> Change in v3:
>Add change log.
> Change in v2:
>Fix device_move() also.
> 
>  drivers/base/core.c | 53 -
>  1 file changed, 52 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index 4aeaa0c92bda..edc55160c5f0 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -1820,12 +1820,63 @@ static inline struct kobject *get_glue_dir(struct device *dev)
>   */
>  static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
>  {
> +	unsigned int ref;
> +
>  	/* see if we live in a "glue" directory */
>  	if (!live_in_glue_dir(glue_dir, dev))
>  		return;
>  
>  	mutex_lock(&gdp_mutex);
> -	if (!kobject_has_children(glue_dir))
> +	/**
> +	 * There is a race condition between removing glue directory
> +	 * and adding a new device under the glue directory.
> +	 *
> +	 * CPU1:                                      CPU2:
> +	 *
> +	 * device_add()
> +	 *   get_device_parent()
> +	 *     class_dir_create_and_add()
> +	 *       kobject_add_internal()
> +	 *         create_dir()    // create glue_dir
> +	 *
> +	 *                                            device_add()
> +	 *                                              get_device_parent()
> +	 *                                                kobject_get() // get glue_dir
> +	 *
> +	 * device_del()
> +	 *   cleanup_glue_dir()
> +	 *     kobject_del(glue_dir)
> +	 *
> +	 *                                            kobject_add()
> +	 *                                              kobject_add_internal()
> +	 *                                                create_dir() // in glue_dir
> +	 *                                                  sysfs_create_dir_ns()
> +	 *                                                    kernfs_create_dir_ns(sd)
> +	 *
> +	 *       sysfs_remove_dir() // glue_dir->sd=NULL
> +	 *       sysfs_put()        // free glue_dir->sd
> +	 *
> +	 *                                                    // sd is freed
> +	 *                                                  kernfs_new_node(sd)
> +	 *                                                    kernfs_get(glue_dir)
> +	 *                                                    kernfs_add_one()
> +	 *                                                    kernfs_put()
> +	 *
> +	 * Before CPU1 remove last child device under glue dir, if CPU2 add
> +	 * a new device under glue dir, the glue_dir kobject reference count
> +	 * will be increase to 2 in kobject_get(k). And CPU2 has been called
> +	 * kernfs_create_dir_ns(). Meanwhile, CPU1 call sysfs_remove_dir()
> +	 * and sysfs_put(). This result in glue_dir->sd is freed.
> +	 *
> +	 * Then the CPU2 will see a stale "empty" but still potentially used
> +	 * glue dir around in kernfs_new_node().
> +	 *
> +	 * In order to avoid this happening, we also should make sure that
> +	 * kernfs_node for glue_dir is released in CPU1 only when refcount
> +	 * for glue_dir kobj is 1.
> +	 */
> +	ref = kref_read(&glue_dir->kref);
> +	if (!kobject_has_children(glue_dir) && !--ref)
>  		kobject_del(glue_dir);
>  	kobject_put(glue_dir);
>  	mutex_unlock(&gdp_mutex);
>

Looks good to me and in line with https://lkml.org/lkml/2019/5/1/3

Signed-off-by: Prateek Sood 


Thanks
Prateek

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH v6] driver core: Fix use-after-free and double free on glue directory

2019-07-25 Thread Prateek Sood
On 7/24/19 9:30 PM, Muchun Song wrote:
> There is a race condition between removing glue directory and adding a new
> device under the glue directory. It can be reproduced in following test:
> 
> path 1: Add the child device under glue dir
> device_add()
> get_device_parent()
> mutex_lock(&gdp_mutex);
> 
> /*find parent from glue_dirs.list*/
> list_for_each_entry(k, &dev->class->p->glue_dirs.list, entry)
> if (k->parent == parent_kobj) {
> kobj = kobject_get(k);
> break;
> }
> 
> mutex_unlock(&gdp_mutex);
> 
> 
> kobject_add()
> kobject_add_internal()
> create_dir()
> sysfs_create_dir_ns()
> if (kobj->parent)
> parent = kobj->parent->sd;
> 
> kernfs_create_dir_ns(parent)
> kernfs_new_node()
> kernfs_get(parent)
> 
> /* link in */
> rc = kernfs_add_one(kn);
> if (!rc)
> return kn;
> 
> kernfs_put(kn)
> 
> repeat:
> kmem_cache_free(kn)
> kn = parent;
> 
> if (kn) {
> if (atomic_dec_and_test(&kn->count))
> goto repeat;
> }
> 
> 
> path2: Remove last child device under glue dir
> device_del()
> cleanup_glue_dir()
> mutex_lock(&gdp_mutex);
> if (!kobject_has_children(glue_dir))
> kobject_del(glue_dir);
> kobject_put(glue_dir);
> mutex_unlock(&gdp_mutex);
> 
> Before path2 remove last child device under glue dir, If path1 add a new
> device under glue dir, the glue_dir kobject reference count will be
> increase to 2 via kobject_get(k) in get_device_parent(). And path1 has
> been called kernfs_new_node(), but not call kernfs_get(parent).
> Meanwhile, path2 call kobject_del(glue_dir) because 0 is returned by
> kobject_has_children(). This result in glue_dir->sd is freed and it's
> reference count will be 0. Then path1 call kernfs_get(parent) will trigger
> a warning in kernfs_get()(WARN_ON(!atomic_read(&kn->count))) and increase
> it's reference count to 1. Because glue_dir->sd is freed by path2, the next
> call kernfs_add_one() by path1 will fail(This is also use-after-free)
> and call atomic_dec_and_test() to decrease reference count. Because the
> reference count is decremented to 0, it will also call kmem_cache_free()
> to free glue_dir->sd again. This will result in double free.
> 
> In order to avoid this happening, we also should make sure that kernfs_node
> for glue_dir is released in path2 only when refcount for glue_dir kobj is
> 1 to fix this race.
> 
> The following calltrace is captured in kernel 4.14 with the following patch
> applied:
> 
> commit 726e41097920 ("drivers: core: Remove glue dirs from sysfs earlier")
> 
> --
> [3.633703] WARNING: CPU: 4 PID: 513 at .../fs/kernfs/dir.c:494
> Here is WARN_ON(!atomic_read(&kn->count)) in kernfs_get().
> 
> [3.633986] Call trace:
> [3.633991]  kernfs_create_dir_ns+0xa8/0xb0
> [3.633994]  sysfs_create_dir_ns+0x54/0xe8
> [3.634001]  kobject_add_internal+0x22c/0x3f0
> [3.634005]  kobject_add+0xe4/0x118
> [3.634011]  device_add+0x200/0x870
> [3.634017]  _request_firmware+0x958/0xc38
> [3.634020]  request_firmware_into_buf+0x4c/0x70
> 
> [3.634064] kernel BUG at .../mm/slub.c:294!
> Here is BUG_ON(object == fp) in set_freepointer().
> 
> [3.634346] Call trace:
> [3.634351]  kmem_cache_free+0x504/0x6b8
> [3.634355]  kernfs_put+0x14c/0x1d8
> [3.634359]  kernfs_create_dir_ns+0x88/0xb0
> [3.634362]  sysfs_create_dir_ns+0x54/0xe8
> [3.634366]  kobject_add_internal+0x22c/0x3f0
> [3.634370]  kobject_add+0xe4/0x118
> [3.634374]  device_add+0x200/0x870
> [3.634378]  _request_firmware+0x958/0xc38
> [3.634381]  request_firmware_into_buf+0x4c/0x70
> --
> 
> Fixes: 726e41097920 ("drivers: core: Remove glue dirs from sysfs earlier")
> 
> Signed-off-by: Muchun Song 
> Reviewed-by: Mukesh Ojha 
> ---
> 
> Change in v6:
>1. Remove hardcoding "1 "
> Change in v5:
>1. Revert to the v1 fix.
>2. Add some comment to explain why we need do this in
>   cleanup_glue_dir().
> Change in v4:
>1. Add some kerneldoc comment.
>2. Remove unlock_if_glue_dir().
>3. Rename get_device_parent_locked_if_glue_dir() to
>   

Re: [PATCH] driver core: Fix use-after-free and double free on glue directory

2019-05-14 Thread Prateek Sood
On 5/14/19 4:26 PM, Mukesh Ojha wrote:
> ++
> 
> On 5/4/2019 8:17 PM, Muchun Song wrote:
>> Benjamin Herrenschmidt wrote on Thursday, 2 May 2019 at 2:25 PM:
>>
> The basic idea yes, the whole bool *locked is horrid though.
> Wouldn't it
> work to have a get_device_parent_locked that always returns with
> the mutex held,
> or just move the mutex to the caller or something simpler like this
> ?
>
 Greg and Rafael, do you have any suggestions for this? Or you also
 agree with Ben?
>>> Ping guys ? This is worth fixing...
>> I also agree with you. But Greg and Rafael seem to be high latency right now.
>>
>>  From your suggestions, I think introduce get_device_parent_locked() may easy
>> to fix. So, do you agree with the fix of the following code snippet
>> (You can also
>> view attachments)?
>>
>> I introduce a new function named get_device_parent_locked_if_glue_dir() which
>> always returns with the mutex held only when we live in glue dir. We should 
>> call
>> unlock_if_glue_dir() to release the mutex. The
>> get_device_parent_locked_if_glue_dir()
>> and unlock_if_glue_dir() should be called in pairs.
>>
>> ---
>> drivers/base/core.c | 44 
>> 1 file changed, 36 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/base/core.c b/drivers/base/core.c
>> index 4aeaa0c92bda..5112755c43fa 100644
>> --- a/drivers/base/core.c
>> +++ b/drivers/base/core.c
>> @@ -1739,8 +1739,9 @@ class_dir_create_and_add(struct class *class,
>> struct kobject *parent_kobj)
>> static DEFINE_MUTEX(gdp_mutex);
>> -static struct kobject *get_device_parent(struct device *dev,
>> -    struct device *parent)
>> +static struct kobject *__get_device_parent(struct device *dev,
>> +    struct device *parent,
>> +    bool lock)
>> {
>>     if (dev->class) {
>>     struct kobject *kobj = NULL;
>> @@ -1779,14 +1780,16 @@ static struct kobject
>> *get_device_parent(struct device *dev,
>>     }
>>     spin_unlock(&dev->class->p->glue_dirs.list_lock);
>>     if (kobj) {
>> -   mutex_unlock(&gdp_mutex);
>> +   if (!lock)
>> +   mutex_unlock(&gdp_mutex);
>>     return kobj;
>>     }
>>     /* or create a new class-directory at the parent device */
>>     k = class_dir_create_and_add(dev->class, parent_kobj);
>>     /* do not emit an uevent for this simple "glue" directory */
>> -   mutex_unlock(&gdp_mutex);
>> +   if (!lock)
>> +   mutex_unlock(&gdp_mutex);
>>     return k;
>>     }
>> @@ -1799,6 +1802,19 @@ static struct kobject *get_device_parent(struct
>> device *dev,
>>     return NULL;
>> }
>> +static inline struct kobject *get_device_parent(struct device *dev,
>> +   struct device *parent)
>> +{
>> +   return __get_device_parent(dev, parent, false);
>> +}
>> +
>> +static inline struct kobject *
>> +get_device_parent_locked_if_glue_dir(struct device *dev,
>> +    struct device *parent)
>> +{
>> +   return __get_device_parent(dev, parent, true);
>> +}
>> +
>> static inline bool live_in_glue_dir(struct kobject *kobj,
>>  struct device *dev)
>> {
>> @@ -1831,6 +1847,16 @@ static void cleanup_glue_dir(struct device
>> *dev, struct kobject *glue_dir)
>>     mutex_unlock(&gdp_mutex);
>> }
>> +static inline void unlock_if_glue_dir(struct device *dev,
>> +    struct kobject *glue_dir)
>> +{
>> +   /* see if we live in a "glue" directory */
>> +   if (!live_in_glue_dir(glue_dir, dev))
>> +   return;
>> +
>> +   mutex_unlock(&gdp_mutex);
>> +}
>> +
>> static int device_add_class_symlinks(struct device *dev)
>> {
>>     struct device_node *of_node = dev_of_node(dev);
>> @@ -2040,7 +2066,7 @@ int device_add(struct device *dev)
>>     pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
>>     parent = get_device(dev->parent);
>> -   kobj = get_device_parent(dev, parent);
>> +   kobj = get_device_parent_locked_if_glue_dir(dev, parent);
>>     if (IS_ERR(kobj)) {
>>     error = PTR_ERR(kobj);
>>     goto parent_error;
>> @@ -2055,10 +2081,12 @@ int device_add(struct device *dev)
>>     /* first, register with generic layer. */
>>     /* we require the name to be set before, and pass NULL */
>>     error = kobject_add(&dev->kobj, dev->kobj.parent, NULL);
>> -   if (error) {
>> -   glue_dir = get_glue_dir(dev);
>> +
>> +   glue_dir = get_glue_dir(dev);
>> +   unlock_if_glue_dir(dev, glue_dir);
>> +
>> +   if (error)
>>     goto Error;
>> -   }
>>     /* notify platform of device entry */
>>     error = device_platform_notify(dev, KOBJ_ADD);
>> -- 

This change has been done in device_add(). AFAICT, locked
version of get_device_parent should be used in device_move()
also.

Thanks

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH v3] drivers: core: Remove glue dirs early only when refcount is 1

2019-05-05 Thread Prateek Sood
On 5/1/19 5:29 PM, Prateek Sood wrote:
> While loading firmware blobs in parallel from different threads, it is possible
> to free sysfs node of glue_dirs in device_del() from a thread while another
> thread is trying to add subdir from device_add() in glue_dirs sysfs node.
> 
> CPU1   CPU2
> fw_load_sysfs_fallback()
>   device_add()
> get_device_parent()
>   class_dir_create_and_add()
> kobject_add_internal()
>   create_dir() // glue_dir
> 
>fw_load_sysfs_fallback()
>  device_add()
>get_device_parent()
>  kobject_get() //glue_dir
> 
>   device_del()
> cleanup_glue_dir()
>   kobject_del()
> 
>kobject_add()
>  kobject_add_internal()
>create_dir() // in glue_dir
>  kernfs_create_dir_ns()
> 
>sysfs_remove_dir() //glue_dir->sd=NULL
>sysfs_put() // free glue_dir->sd
> 
>kernfs_new_node()
>  kernfs_get(glue_dir)
> 
> Fix this race by making sure that kernfs_node for glue_dir is released only
> when refcount for glue_dir kobj is 1.
> 
> Signed-off-by: Prateek Sood 
> 
> ---
> 
> Changes from v2->v3:
>  - Added patch version change related comments.
> 
> Changes from v1->v2:
>  - Updated callstack from _request_firmware_load() to 
> fw_load_sysfs_fallback().
> 
> 
>  drivers/base/core.c | 5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/base/core.c b/drivers/base/core.c
> index 4aeaa0c..3955d07 100644
> --- a/drivers/base/core.c
> +++ b/drivers/base/core.c
> @@ -1820,12 +1820,15 @@ static inline struct kobject *get_glue_dir(struct device *dev)
>   */
>  static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
>  {
> +	unsigned int refcount;
> +
>  	/* see if we live in a "glue" directory */
>  	if (!live_in_glue_dir(glue_dir, dev))
>  		return;
>  
>  	mutex_lock(&gdp_mutex);
> -	if (!kobject_has_children(glue_dir))
> +	refcount = kref_read(&glue_dir->kref);
> +	if (!kobject_has_children(glue_dir) && !--refcount)
>  		kobject_del(glue_dir);
>  	kobject_put(glue_dir);
>  	mutex_unlock(&gdp_mutex);
> 

Folks,

Please share feedback on the race condition and the patch to
fix it.

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH v3] drivers: core: Remove glue dirs early only when refcount is 1

2019-05-01 Thread Prateek Sood
While loading firmware blobs in parallel from different threads, it is possible
to free sysfs node of glue_dirs in device_del() from a thread while another
thread is trying to add subdir from device_add() in glue_dirs sysfs node.

CPU1   CPU2
fw_load_sysfs_fallback()
  device_add()
get_device_parent()
  class_dir_create_and_add()
kobject_add_internal()
  create_dir() // glue_dir

   fw_load_sysfs_fallback()
 device_add()
   get_device_parent()
 kobject_get() //glue_dir

  device_del()
cleanup_glue_dir()
  kobject_del()

   kobject_add()
 kobject_add_internal()
   create_dir() // in glue_dir
 kernfs_create_dir_ns()

   sysfs_remove_dir() //glue_dir->sd=NULL
   sysfs_put() // free glue_dir->sd

   kernfs_new_node()
 kernfs_get(glue_dir)

Fix this race by making sure that kernfs_node for glue_dir is released only
when refcount for glue_dir kobj is 1.

Signed-off-by: Prateek Sood 

---

Changes from v2->v3:
 - Added patch version change related comments.

Changes from v1->v2:
 - Updated callstack from _request_firmware_load() to fw_load_sysfs_fallback().


 drivers/base/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4aeaa0c..3955d07 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1820,12 +1820,15 @@ static inline struct kobject *get_glue_dir(struct device *dev)
  */
 static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
 {
+	unsigned int refcount;
+
 	/* see if we live in a "glue" directory */
 	if (!live_in_glue_dir(glue_dir, dev))
 		return;
 
 	mutex_lock(&gdp_mutex);
-	if (!kobject_has_children(glue_dir))
+	refcount = kref_read(&glue_dir->kref);
+	if (!kobject_has_children(glue_dir) && !--refcount)
 		kobject_del(glue_dir);
 	kobject_put(glue_dir);
 	mutex_unlock(&gdp_mutex);
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



[PATCH V2] drivers: core: Remove glue dirs early only when refcount is 1

2019-04-30 Thread Prateek Sood
While loading firmware blobs in parallel from different threads, it is possible
to free sysfs node of glue_dirs in device_del() from a thread while another
thread is trying to add subdir from device_add() in glue_dirs sysfs node.

CPU1   CPU2
fw_load_sysfs_fallback()
  device_add()
get_device_parent()
  class_dir_create_and_add()
kobject_add_internal()
  create_dir() // glue_dir

   fw_load_sysfs_fallback()
 device_add()
   get_device_parent()
 kobject_get() //glue_dir

  device_del()
cleanup_glue_dir()
  kobject_del()

   kobject_add()
 kobject_add_internal()
   create_dir() // in glue_dir
 kernfs_create_dir_ns()

   sysfs_remove_dir() //glue_dir->sd=NULL
   sysfs_put() // free glue_dir->sd

   kernfs_new_node()
 kernfs_get(glue_dir)

Fix this race by making sure that kernfs_node for glue_dir is released only
when refcount for glue_dir kobj is 1.

Signed-off-by: Prateek Sood 
---
 drivers/base/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4aeaa0c..3955d07 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1820,12 +1820,15 @@ static inline struct kobject *get_glue_dir(struct device *dev)
  */
 static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
 {
+	unsigned int refcount;
+
 	/* see if we live in a "glue" directory */
 	if (!live_in_glue_dir(glue_dir, dev))
 		return;
 
 	mutex_lock(&gdp_mutex);
-	if (!kobject_has_children(glue_dir))
+	refcount = kref_read(&glue_dir->kref);
+	if (!kobject_has_children(glue_dir) && !--refcount)
 		kobject_del(glue_dir);
 	kobject_put(glue_dir);
 	mutex_unlock(&gdp_mutex);
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



[PATCH] drivers: core: Remove glue dirs early only when refcount is 1

2019-04-30 Thread Prateek Sood
While loading firmware blobs in parallel from different threads, it is possible
to free sysfs node of glue_dirs in device_del() from a thread while another
thread is trying to add subdir from device_add() in glue_dirs sysfs node.

CPU1   CPU2
_request_firmware_load()
  device_add()
get_device_parent()
  class_dir_create_and_add()
kobject_add_internal()
  create_dir() // glue_dir

   _request_firmware_load()
 device_add()
   get_device_parent()
 kobject_get() //glue_dir

  device_del()
cleanup_glue_dir()
  kobject_del()

   kobject_add()
 kobject_add_internal()
   create_dir() // in glue_dir
 kernfs_create_dir_ns()

   sysfs_remove_dir() //glue_dir->sd=NULL
   sysfs_put() // free glue_dir->sd

   kernfs_new_node()
 kernfs_get(glue_dir)

Fix this race by making sure that kernfs_node for glue_dir is released only
when refcount for glue_dir kobj is 1.

Signed-off-by: Prateek Sood 
---
 drivers/base/core.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4aeaa0c..3955d07 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1820,12 +1820,15 @@ static inline struct kobject *get_glue_dir(struct device *dev)
  */
 static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
 {
+	unsigned int refcount;
+
 	/* see if we live in a "glue" directory */
 	if (!live_in_glue_dir(glue_dir, dev))
 		return;
 
 	mutex_lock(&gdp_mutex);
-	if (!kobject_has_children(glue_dir))
+	refcount = kref_read(&glue_dir->kref);
+	if (!kobject_has_children(glue_dir) && !--refcount)
 		kobject_del(glue_dir);
 	kobject_put(glue_dir);
 	mutex_unlock(&gdp_mutex);
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



[PATCH] perf: fix use after free of perf_trace_buf

2019-03-19 Thread Prateek Sood
SyS_perf_event_open()
  free_event()
_free_event()
  tp_perf_event_destroy()
perf_trace_destroy()
  perf_trace_event_unreg() //free perf_trace_buf

trace_cpu_frequency()
  perf_trace_cpu()
perf_trace_buf_alloc() //access perf_trace_buf

CPU0CPU1
perf_trace_event_unreg()perf_trace_cpu()
head = (event_call->perf_events)

free_percpu(tp_event->perf_events)
tp_event->perf_events = NULL
--total_ref_count
free_percpu(perf_trace_buf[i])
perf_trace_buf[i] = NULL

raw_data = perf_trace_buf[rctx]
memset(raw_data)

A potential race exists between the access of perf_trace_buf in
perf_trace_buf_alloc() and its freeing in perf_trace_event_unreg(). This
can result in perf_trace_buf[rctx] being NULL when memset() in
perf_trace_buf_alloc() dereferences it.
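
For illustration only, a simplified userspace sketch of the intended guard
(get_buf(), put_buf() and teardown() are hypothetical names; the patch below
implements this idea with per-recursion-context counters inside the trace
code): teardown unpublishes the buffer first and then waits out any in-flight
user before freeing it:

#include <stdatomic.h>
#include <stdlib.h>

static void *_Atomic buf;	/* models perf_trace_buf[rctx] */
static atomic_int users;	/* models the per-context access count */

static void *get_buf(void)
{
	atomic_fetch_add(&users, 1);		/* announce use first */
	void *b = atomic_load(&buf);		/* then load the pointer */
	if (!b)
		atomic_fetch_sub(&users, 1);	/* buffer already gone */
	return b;
}

static void put_buf(void)
{
	atomic_fetch_sub(&users, 1);
}

static void teardown(void)
{
	void *b = atomic_exchange(&buf, NULL);	/* unpublish first */
	while (atomic_load(&users))		/* wait out in-flight users */
		;
	free(b);				/* now nobody can still see it */
}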

Change-Id: I95ae774b9fcc653aa808f2d9f3e4359b3605e909
Signed-off-by: Prateek Sood 
---
 include/linux/trace_events.h|  2 ++
 include/trace/perf.h|  5 +++-
 kernel/trace/trace_event_perf.c | 63 ++---
 kernel/trace/trace_kprobe.c | 10 +--
 kernel/trace/trace_syscalls.c   | 14 ++---
 kernel/trace/trace_uprobe.c |  2 ++
 6 files changed, 79 insertions(+), 17 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731..dbdad19 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -591,6 +591,8 @@ extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
+void get_perf_trace_buf(void);
+void put_perf_trace_buf(void);
 
 void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
 void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
diff --git a/include/trace/perf.h b/include/trace/perf.h
index dbc6c74..f808c33 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -55,9 +55,10 @@
 sizeof(u64));  \
__entry_size -= sizeof(u32);\
\
+   get_perf_trace_buf();   \
 	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
if (!entry) \
-   return; \
+   goto out;   \
\
perf_fetch_caller_regs(__regs); \
\
@@ -68,6 +69,8 @@
perf_trace_run_bpf_submit(entry, __entry_size, rctx,\
  event_call, __count, __regs,  \
  head, __task);\
+out:   \
+   put_perf_trace_buf();   \
 }
 
 /*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4629a61..6caca88 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,7 +21,8 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
perf_trace_t;
 
 /* Count the events in use (per event id, not per instance) */
-static int total_ref_count;
+static int alloc_ref_count;
+static atomic_t access_ref_count[PERF_NR_CONTEXTS];
 
 static int perf_trace_event_perm(struct trace_event_call *tp_event,
 struct perf_event *p_event)
@@ -88,6 +89,34 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
return 0;
 }
 
+void get_perf_trace_buf(void)
+{
+   int rctx;
+
+   rctx = perf_swevent_get_recursion_context();
+   if (rctx < 0)
+   return;
+
+	atomic_inc(&access_ref_count[rctx]);
+   perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(get_perf_trace_buf);
+NOKPROBE_SYMBOL(get_perf_trace_buf);
+
+void put_perf_trace_buf(void)
+{
+   int rctx;
+
+   rctx = perf_swevent_get_recursion_context();
+   if (rctx < 0)
+   return;
+
+	atomic_dec(&access_ref_count[rctx]);
+   perf_swevent_put_recursion_context(rctx);
+}
+EXPORT_SYMBOL_GPL(put_perf_trace_buf);
+NOKPROBE_SYMBOL(put_perf_trace_buf);
+
 static int perf_trace_event_reg(struct trace_event_call *tp_event,
struct perf_event *p_event)
 {
@@ -108,7 +137,7 @@ static int perf_trace_event_reg(struct trace_ev

[PATCH] perf: extend total_ref_count usage to protect perf_trace_buf access

2019-03-18 Thread Prateek Sood
A potential race exists between the access of perf_trace_buf[i] in
perf_trace_buf_alloc() and its freeing in perf_trace_event_unreg(). This
can result in perf_trace_buf[i] being NULL when memset() in
perf_trace_buf_alloc() dereferences it.

Signed-off-by: Prateek Sood 
---
 include/linux/trace_events.h|  2 ++
 include/trace/perf.h|  5 -
 kernel/trace/trace_event_perf.c | 43 -
 kernel/trace/trace_kprobe.c | 10 --
 kernel/trace/trace_syscalls.c   | 14 ++
 kernel/trace/trace_uprobe.c |  2 ++
 6 files changed, 56 insertions(+), 20 deletions(-)

diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h
index 8a62731..dbdad19 100644
--- a/include/linux/trace_events.h
+++ b/include/linux/trace_events.h
@@ -591,6 +591,8 @@ extern int  ftrace_profile_set_filter(struct perf_event *event, int event_id,
 extern void ftrace_profile_free_filter(struct perf_event *event);
 void perf_trace_buf_update(void *record, u16 type);
 void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp);
+void get_perf_trace_buf(void);
+void put_perf_trace_buf(void);
 
 void bpf_trace_run1(struct bpf_prog *prog, u64 arg1);
 void bpf_trace_run2(struct bpf_prog *prog, u64 arg1, u64 arg2);
diff --git a/include/trace/perf.h b/include/trace/perf.h
index dbc6c74..f808c33 100644
--- a/include/trace/perf.h
+++ b/include/trace/perf.h
@@ -55,9 +55,10 @@
 sizeof(u64));  \
__entry_size -= sizeof(u32);\
\
+   get_perf_trace_buf();   \
 	entry = perf_trace_buf_alloc(__entry_size, &__regs, &rctx); \
if (!entry) \
-   return; \
+   goto out;   \
\
perf_fetch_caller_regs(__regs); \
\
@@ -68,6 +69,8 @@
perf_trace_run_bpf_submit(entry, __entry_size, rctx,\
  event_call, __count, __regs,  \
  head, __task);\
+out:   \
+   put_perf_trace_buf();   \
 }
 
 /*
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 4629a61..fabfc21 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -21,7 +21,7 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
perf_trace_t;
 
 /* Count the events in use (per event id, not per instance) */
-static int total_ref_count;
+static atomic_t	total_ref_count;
 
 static int perf_trace_event_perm(struct trace_event_call *tp_event,
 struct perf_event *p_event)
@@ -88,6 +88,27 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
return 0;
 }
 
+void get_perf_trace_buf()
+{
+	atomic_inc(&total_ref_count);
+}
+EXPORT_SYMBOL_GPL(get_perf_trace_buf);
+NOKPROBE_SYMBOL(get_perf_trace_buf);
+
+void put_perf_trace_buf()
+{
+   int index;
+
+	if (atomic_dec_and_test(&total_ref_count)) {
+   for (index = 0; index < PERF_NR_CONTEXTS; index++) {
+   free_percpu(perf_trace_buf[index]);
+   perf_trace_buf[index] = NULL;
+   }
+   }
+}
+EXPORT_SYMBOL_GPL(put_perf_trace_buf);
+NOKPROBE_SYMBOL(put_perf_trace_buf);
+
 static int perf_trace_event_reg(struct trace_event_call *tp_event,
struct perf_event *p_event)
 {
@@ -108,7 +129,7 @@ static int perf_trace_event_reg(struct trace_event_call *tp_event,
 
tp_event->perf_events = list;
 
-   if (!total_ref_count) {
+   if (!atomic_read(_ref_count)) {
char __percpu *buf;
int i;
 
@@ -125,11 +146,11 @@ static int perf_trace_event_reg(struct trace_event_call *tp_event,
if (ret)
goto fail;
 
-   total_ref_count++;
+   get_perf_trace_buf();
return 0;
 
 fail:
-   if (!total_ref_count) {
+	if (!atomic_read(&total_ref_count)) {
int i;
 
for (i = 0; i < PERF_NR_CONTEXTS; i++) {
@@ -164,13 +185,7 @@ static void perf_trace_event_unreg(struct perf_event *p_event)
 
free_percpu(tp_event->perf_events);
tp_event->perf_events = NULL;
-
-   if (!--total_ref_count) {
-   for (i = 0; i < PERF_NR_CONTEXTS; i++) {
-   free_percpu(perf_trace_buf[i]);
-   pe

[tip:locking/core] sched/wait: Fix rcuwait_wake_up() ordering

2019-01-21 Thread tip-bot for Prateek Sood
Commit-ID:  6dc080eeb2ba01973bfff0d79844d7a59e12542e
Gitweb: https://git.kernel.org/tip/6dc080eeb2ba01973bfff0d79844d7a59e12542e
Author: Prateek Sood 
AuthorDate: Fri, 30 Nov 2018 20:40:56 +0530
Committer:  Ingo Molnar 
CommitDate: Mon, 21 Jan 2019 11:15:36 +0100

sched/wait: Fix rcuwait_wake_up() ordering

For some peculiar reason rcuwait_wake_up() has the right barrier in
the comment, but not in the code.

This mistake has been observed to cause a deadlock in the following
situation:

P1                            P2

percpu_up_read()              percpu_down_write()
  rcu_sync_is_idle() // false
                                rcu_sync_enter()
                                ...
  __percpu_up_read()

[S] ,-  __this_cpu_dec(*sem->read_count)
    |   smp_rmb();
[L] |   task = rcu_dereference(w->task) // NULL
    |
    |                         [S] w->task = current
    |                             smp_mb();
    |                         [L] readers_active_check() // fail
    `-> <wake up>

Where the smp_rmb() (obviously) fails to constrain the store.

[ peterz: Added changelog. ]

Signed-off-by: Prateek Sood 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Andrea Parri 
Acked-by: Davidlohr Bueso 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Fixes: 8f95c90ceb54 ("sched/wait, RCU: Introduce rcuwait machinery")
Link: https://lkml.kernel.org/r/1543590656-7157-1-git-send-email-prs...@codeaurora.org
Signed-off-by: Ingo Molnar 
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 284f2fe9a293..3fb7be001964 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -307,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 *MB (A)  MB (B)
 *[L] cond[L] tsk
 */
-   smp_rmb(); /* (B) */
+   smp_mb(); /* (B) */
 
/*
 * Avoid using task_rcu_dereference() magic as long as we are careful,


[PATCH] percpu_rwsem: fix missed wakeup due to reordering of load

2018-12-20 Thread Prateek Sood
P1 is releasing the cpu_hotplug_lock and P2 is acquiring the
cpu_hotplug_lock.

P1   P2
percpu_up_read() path  percpu_down_write() path

  rcu_sync_enter() //gp_state=GP_PASSED

rcu_sync_is_idle() //returns falsedown_write(rw_sem)

__percpu_up_read()

[L] task = rcu_dereference(w->task) //NULL

smp_rmb()  [S] w->task = current

smp_mb()

   [L] readers_active_check() //fails
 schedule()

[S] __this_cpu_dec(read_count)

Since the load of task can observe NULL, the wakeup in rcuwait_wake_up()
can be missed. The above sequence violates the following constraint in
rcuwait_wake_up():

        WAIT                  WAKE
    [S] tsk = current     [S] cond = true
        MB (A)                MB (B)
    [L] cond              [L] tsk

This can happen because, on arm64, smp_rmb() in rcuwait_wake_up() only
orders loads before the barrier against loads and stores after it. The
requirement here is to order the store before the barrier against the
load after it.

For the usage of rcuwait_wake_up() in __percpu_up_read(), a full barrier
(smp_mb()) is required to satisfy the constraint of rcuwait_wake_up().
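
As a rough C11 model of the two sides (illustrative only, not the kernel
code; the function names are made up), the fence on the release path has to
order the earlier store against the later load, which is what a full barrier
such as smp_mb() provides and smp_rmb() does not:

#include <stdatomic.h>

static atomic_int read_count = 1;	/* models sem->read_count */
static void *_Atomic waiter_task;	/* models w->task */

/* Reader release path: __percpu_up_read() -> rcuwait_wake_up(). */
static void *reader_release(void)
{
	atomic_fetch_sub_explicit(&read_count, 1, memory_order_relaxed); /* [S] */
	atomic_thread_fence(memory_order_seq_cst);	/* full barrier, as smp_mb();
							 * a read barrier would let the
							 * store above pass the load below */
	return atomic_load_explicit(&waiter_task, memory_order_relaxed); /* [L] task to wake, or NULL */
}

/* Writer wait path: percpu_down_write() -> rcuwait_wait_event(). */
static int writer_sees_no_readers(void *self)
{
	atomic_store_explicit(&waiter_task, self, memory_order_relaxed); /* [S] publish the waiter */
	atomic_thread_fence(memory_order_seq_cst);			  /* MB (A) */
	return atomic_load_explicit(&read_count, memory_order_relaxed) == 0; /* [L] readers_active_check() */
}

With both fences being full barriers, at least one side is guaranteed to
observe the other's store, so either the reader sees the waiter and wakes it,
or the writer sees read_count already at zero.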

Signed-off-by: Prateek Sood 
Acked-by: Davidlohr Bueso 

---
 kernel/exit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index ac1a814..696e0e1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -298,7 +298,7 @@ void rcuwait_wake_up(struct rcuwait *w)
/*
 * Order condition vs @task, such that everything prior to the load
 * of @task is visible. This is the condition as to why the user called
-* rcuwait_trywake() in the first place. Pairs with set_current_state()
+* rcuwait_wake_up() in the first place. Pairs with set_current_state()
 * barrier (A) in rcuwait_wait_event().
 *
 *WAITWAKE
@@ -306,7 +306,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 *MB (A)  MB (B)
 *[L] cond[L] tsk
 */
-   smp_rmb(); /* (B) */
+   smp_mb(); /* (B) */
 
/*
 * Avoid using task_rcu_dereference() magic as long as we are careful,
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH] percpu_rwsem: fix missed wakeup due to reordering of load

2018-12-20 Thread Prateek Sood
On 12/12/2018 08:58 PM, Andrea Parri wrote:
> On Fri, Nov 30, 2018 at 08:40:56PM +0530, Prateek Sood wrote:
>> In a scenario where cpu_hotplug_lock percpu_rw_semaphore is already
>> acquired for read operation by P1 using percpu_down_read().
>>
>> Now we have P1 in the path of releaseing the cpu_hotplug_lock and P2
>> is in the process of acquiring cpu_hotplug_lock.
>>
>> P1   P2
>> percpu_up_read() path  percpu_down_write() path
>>
>>   rcu_sync_enter() 
>> //gp_state=GP_PASSED
>>
>> rcu_sync_is_idle() //returns falsedown_write(rw_sem)
>>
>> __percpu_up_read()
>>
>> [L] task = rcu_dereference(w->task) //NULL
>>
>> smp_rmb()  [S] w->task = current
>>
>> smp_mb()
>>
>>[L] readers_active_check() //fails
>>   schedule()
>>
>> [S] __this_cpu_dec(read_count)
>>
>> Since load of task can result in NULL. This can lead to missed wakeup
>> in rcuwait_wake_up(). Above sequence violated the following constraint
>> in rcuwait_wake_up():
>>
>>   WAITWAKE
>> [S] tsk = current  [S] cond = true
>> MB (A)   MB (B)
>> [L] cond   [L] tsk
>>
>> This can happen as smp_rmb() in rcuwait_wake_up() will provide ordering
>> of load before barrier with load and store after barrier for arm64
>> architecture. Here the requirement is to order store before smp_rmb()
>> with load after the smp_rmb().
>>
>> For the usage of rcuwait_wake_up() in __percpu_up_read() full barrier
>> (smp_mb) is required to complete the constraint of rcuwait_wake_up().
>>
>> Signed-off-by: Prateek Sood 
> 
> I know this is going to sound ridiculous (coming from me or from
> the Italian that I am), but it looks like we could both work on
> our English. ;-)
> 
> But the fix seems correct to me:
> 
> Reviewed-by: Andrea Parri 
> 
> It might be a good idea to integrate this fix with fixes to the
> inline comments/annotations: for example, I see that the comment
> in rcuwait_wake_up() mentions a non-existing rcuwait_trywake();
Ok, I will update the comment in next version of the patch.

> moreover, the memory-barrier annotation "B" is used also for the
> smp_mb() preceding the __this_cpu_dec() in __percpu_up_read().
In this annotation "B" is corresponding to annotation "A" in
rcuwait_wait_event(). So this seems to be correct.

> 
>   Andrea
> 
> 
>> ---
>>  kernel/exit.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/kernel/exit.c b/kernel/exit.c
>> index f1d74f0..a10820d 100644
>> --- a/kernel/exit.c
>> +++ b/kernel/exit.c
>> @@ -306,7 +306,7 @@ void rcuwait_wake_up(struct rcuwait *w)
>>   *MB (A)  MB (B)
>>   *[L] cond[L] tsk
>>   */
>> -smp_rmb(); /* (B) */
>> +smp_mb(); /* (B) */
>>  
>>  /*
>>   * Avoid using task_rcu_dereference() magic as long as we are careful,
>> -- 
>> Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, 
>> Inc., 
>> is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.
>>


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] percpu_rwsem: fix missed wakeup due to reordering of load

2018-12-12 Thread Prateek Sood
On 12/04/2018 01:06 AM, Prateek Sood wrote:
> On 12/03/2018 12:08 PM, Davidlohr Bueso wrote:
>> On 2018-11-30 07:10, Prateek Sood wrote:
>>> In a scenario where cpu_hotplug_lock percpu_rw_semaphore is already
>>> acquired for read operation by P1 using percpu_down_read().
>>>
>>> Now we have P1 in the path of releaseing the cpu_hotplug_lock and P2
>>> is in the process of acquiring cpu_hotplug_lock.
>>>
>>> P1   P2
>>> percpu_up_read() path  percpu_down_write() path
>>>
>>>   rcu_sync_enter() 
>>> //gp_state=GP_PASSED
>>>
>>> rcu_sync_is_idle() //returns false    down_write(rw_sem)
>>>
>>> __percpu_up_read()
>>>
>>> [L] task = rcu_dereference(w->task) //NULL
>>>
>>> smp_rmb()  [S] w->task = current
>>>
>>>     smp_mb()
>>>
>>>    [L] readers_active_check() 
>>> //fails
>>>  schedule()
>>>
>>> [S] __this_cpu_dec(read_count)
>>>
>>> Since load of task can result in NULL. This can lead to missed wakeup
>>> in rcuwait_wake_up(). Above sequence violated the following constraint
>>> in rcuwait_wake_up():
>>>
>>>  WAIT    WAKE
>>> [S] tsk = current  [S] cond = true
>>> MB (A)    MB (B)
>>> [L] cond  [L] tsk
>>>
>>
>> Hmm yeah we don't want rcu_wake_up() to get hoisted over the 
>> __this_cpu_dec(read_count). The smp_rmb() does not make sense to me here in 
>> the first place. Did you run into this scenario by code inspection or you 
>> actually it the issue?
>>
>> Thanks,
>> Davidlohr
> 
> I have checked one issue where it seems that cpu hotplug code
> path is not able to get cpu_hotplug_lock in write mode and there
> is a reader pending for cpu hotplug path to release
> percpu_rw_semaphore->rwsem to acquire cpu_hotplug_lock.
> This caused a deadlock.
> 
> From code inspection also it seems to be not adhering to arm64
> smp_rmb() constraint of load/load-store ordering guarantee.
> 
> 
> Thanks,
> Prateek
> 

Folks,

Please confirm whether the suspicion about smp_rmb() is correct.
IMO, it should be smp_mb(), which translates to 'dmb ish' on arm64.


Thanks
Prateek

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] percpu_rwsem: fix missed wakeup due to reordering of load

2018-12-03 Thread Prateek Sood
On 12/03/2018 12:08 PM, Davidlohr Bueso wrote:
> On 2018-11-30 07:10, Prateek Sood wrote:
>> In a scenario where cpu_hotplug_lock percpu_rw_semaphore is already
>> acquired for read operation by P1 using percpu_down_read().
>>
>> Now we have P1 in the path of releaseing the cpu_hotplug_lock and P2
>> is in the process of acquiring cpu_hotplug_lock.
>>
>> P1   P2
>> percpu_up_read() path  percpu_down_write() path
>>
>>   rcu_sync_enter() 
>> //gp_state=GP_PASSED
>>
>> rcu_sync_is_idle() //returns false    down_write(rw_sem)
>>
>> __percpu_up_read()
>>
>> [L] task = rcu_dereference(w->task) //NULL
>>
>> smp_rmb()  [S] w->task = current
>>
>>     smp_mb()
>>
>>    [L] readers_active_check() //fails
>>  schedule()
>>
>> [S] __this_cpu_dec(read_count)
>>
>> Since load of task can result in NULL. This can lead to missed wakeup
>> in rcuwait_wake_up(). Above sequence violated the following constraint
>> in rcuwait_wake_up():
>>
>>  WAIT    WAKE
>> [S] tsk = current  [S] cond = true
>> MB (A)    MB (B)
>> [L] cond  [L] tsk
>>
> 
> Hmm yeah we don't want rcu_wake_up() to get hoisted over the 
> __this_cpu_dec(read_count). The smp_rmb() does not make sense to me here in 
> the first place. Did you run into this scenario by code inspection or you 
> actually it the issue?
> 
> Thanks,
> Davidlohr

I was debugging an issue where the cpu hotplug path is not able to
acquire cpu_hotplug_lock in write mode, while a reader is waiting for
the cpu hotplug path to release percpu_rw_semaphore->rwsem so that it
can acquire cpu_hotplug_lock. This caused a deadlock.

From code inspection as well, the required ordering does not seem to be
provided by arm64's smp_rmb(), which only guarantees load/load and
load/store ordering.
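
For reference, the pattern at issue is the classic store-buffering
shape: each side stores and then loads, and only a full barrier on both
sides guarantees that at least one side observes the other's store.
Below is a minimal userspace sketch with C11 atomics (illustrative
names only, not the kernel code) of that shape:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

/* stand-ins: read_count for the per-cpu reader count, waiter for w->task */
static atomic_int read_count = ATOMIC_VAR_INIT(1);
static atomic_int waiter = ATOMIC_VAR_INIT(0);
static int waker_saw_waiter, waiter_saw_no_readers;

/* models __percpu_up_read() + rcuwait_wake_up(): [S] cond, MB (B), [L] tsk */
static void *waker(void *arg)
{
	atomic_store_explicit(&read_count, 0, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);	/* needs a full barrier, not a read barrier */
	waker_saw_waiter = atomic_load_explicit(&waiter, memory_order_relaxed);
	return NULL;
}

/* models rcuwait_wait_event(): [S] tsk = current, MB (A), [L] cond */
static void *sleeper(void *arg)
{
	atomic_store_explicit(&waiter, 1, memory_order_relaxed);
	atomic_thread_fence(memory_order_seq_cst);
	waiter_saw_no_readers = !atomic_load_explicit(&read_count, memory_order_relaxed);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, waker, NULL);
	pthread_create(&b, NULL, sleeper, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);

	/* with full fences on both sides, at least one flag below must be 1 */
	printf("waker saw waiter: %d, waiter saw no readers: %d\n",
	       waker_saw_waiter, waiter_saw_no_readers);
	return 0;
}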


Thanks,
Prateek

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH] percpu_rwsem: fix missed wakeup due to reordering of load

2018-11-30 Thread Prateek Sood
In a scenario where the cpu_hotplug_lock percpu_rw_semaphore is already
acquired for read by P1 using percpu_down_read(), we now have P1 in the
path of releasing cpu_hotplug_lock while P2 is in the process of
acquiring it:

P1                                          P2
percpu_up_read() path                       percpu_down_write() path

                                            rcu_sync_enter() //gp_state=GP_PASSED

rcu_sync_is_idle() //returns false          down_write(rw_sem)

__percpu_up_read()

[L] task = rcu_dereference(w->task) //NULL

smp_rmb()                                   [S] w->task = current

                                            smp_mb()

                                            [L] readers_active_check() //fails
                                            schedule()

[S] __this_cpu_dec(read_count)

Since the load of task can observe NULL, the wakeup in rcuwait_wake_up()
can be missed. The sequence above violates the following constraint in
rcuwait_wake_up():

        WAIT                    WAKE
    [S] tsk = current       [S] cond = true
        MB (A)                  MB (B)
    [L] cond                [L] tsk

This can happen because, on arm64, smp_rmb() only orders the load before
the barrier against the loads and stores after it. The requirement here
is to order the store before the smp_rmb() against the load after it.

For the usage of rcuwait_wake_up() in __percpu_up_read(), a full barrier
(smp_mb) is required to satisfy the constraint of rcuwait_wake_up().

Signed-off-by: Prateek Sood 
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index f1d74f0..a10820d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -306,7 +306,7 @@ void rcuwait_wake_up(struct rcuwait *w)
 	 *        MB (A)              MB (B)
 	 *        [L] cond            [L] tsk
 	 */
-	smp_rmb(); /* (B) */
+	smp_mb(); /* (B) */
 
 	/*
 	 * Avoid using task_rcu_dereference() magic as long as we are careful,
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: Query related to usage of cpufreq_suspend() & cpufreq_resume

2018-02-02 Thread Prateek Sood
On 02/02/2018 06:49 PM, Rafael J. Wysocki wrote:
> On Fri, Feb 2, 2018 at 1:53 PM, Prateek Sood <prs...@codeaurora.org> wrote:
>> On 02/02/2018 05:18 PM, Rafael J. Wysocki wrote:
>>> On Friday, February 2, 2018 12:41:58 PM CET Prateek Sood wrote:
>>>> Hi Viresh,
>>>>
>>>> One scenario is there where a kernel panic is observed in
>>>> cpufreq during suspend/resume.
>>>>
>>>> pm_suspend()
>>>>   suspend_devices_and_enter()
>>>> dpm_suspend_start()
>>>>   dpm_prepare()
>>>>
>>>> Failure in dpm_prepare() happend with following dmesg:
>>>>
>>>> [ 3746.316062] PM: Device xyz not prepared for power transition: code -16
>>>> [ 3746.316071] PM: Some devices failed to suspend, or early wake event 
>>>> detected
>>>>
>>>>
>>>> pm_suspend()
>>>>   suspend_devices_and_enter()
>>>> dpm_suspend_start()
>>>>   dpm_prepare() //failed
>>>> dpm_resume_end()
>>>>   dpm_resume()
>>>> cpufreq_resume()
>>>>   cpufreq_start_governor()
>>>> sugov_start()
>>>>   cpufreq_add_update_util_hook()
>>>>
>>>> After failure in dpm_prepare(), dpm_resume() called
>>>> cpufreq_resume(). Corresponding cpufreq_suspend() was not
>>>> called due to failure of dpm_prepare().
>>>>
>>>> This resulted in WARN_ON(per_cpu(cpufreq_update_util_data, cpu))
>>>> in cpufreq_add_update_util_hook() and cpufreq_add_update_util_hook->func
>>>> being inconsistent state. It caused crash in scheduler.
>>>>
>>>> Following are some of the ways to mitigate this issue. Could
>>>> you please provide feedback on below two approaches or suugest
>>>> a better way to fix this problem.
>>>>
>>>> ---8<--
>>>>
>>>> Co-developed-by: Gaurav Kohli <gko...@codeaurora.org>
>>>> Signed-off-by: Gaurav Kohli <gko...@codeaurora.org>
>>>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
>>>>
>>>> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
>>>> index 02a497e..732e5a2 100644
>>>> --- a/drivers/base/power/main.c
>>>> +++ b/drivers/base/power/main.c
>>>> @@ -1038,6 +1038,7 @@ void dpm_resume(pm_message_t state)
>>>>  {
>>>> struct device *dev;
>>>> ktime_t starttime = ktime_get();
>>>> +   bool valid_resume = false;
>>>>
>>>> trace_suspend_resume(TPS("dpm_resume"), state.event, true);
>>>> might_sleep();
>>>> @@ -1055,6 +1056,7 @@ void dpm_resume(pm_message_t state)
>>>> }
>>>>
>>>> while (!list_empty(_suspended_list)) {
>>>> +   valid_resume = true;
>>>> dev = to_device(dpm_suspended_list.next);
>>>> get_device(dev);
>>>> if (!is_async(dev)) {
>>>> @@ -1080,7 +1082,8 @@ void dpm_resume(pm_message_t state)
>>>> async_synchronize_full();
>>>> dpm_show_time(starttime, state, 0, NULL);
>>>>
>>>> -   cpufreq_resume();
>>>> +   if (valid_resume)
>>>> +   cpufreq_resume();
>>>> trace_suspend_resume(TPS("dpm_resume"), state.event, false);
>>>>  }
>>>>
>>>> 8<--
>>>>
>>>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
>>>>
>>>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>>>> index 421f318..439eab8 100644
>>>> --- a/drivers/cpufreq/cpufreq.c
>>>> +++ b/drivers/cpufreq/cpufreq.c
>>>> @@ -1648,7 +1648,7 @@ void cpufreq_suspend(void)
>>>>  {
>>>> struct cpufreq_policy *policy;
>>>>
>>>> -   if (!cpufreq_driver)
>>>> +   if (!cpufreq_driver || cpufreq_suspended)
>>>> return;
>>>>
>>>> if (!has_target() && !cpufreq_driver->suspend)
>>>> @@ -1683,7 +1683,7 @@ void cpufreq_resume(void)
>>>> struct cpufreq_policy *policy;
>>>> int ret;
>>>>
>>>> -   if (!cpufreq_driver)
>>>> +   if (!cpufreq_driver || !cpufreq_suspended)
>>>> return;
>>>>
>>>> cpufreq_suspended = false;
>>>
>>> Since we have cpufreq_suspended already, the second one is better.
>>>
>>
>> Thanks Rafael for the inputs, I will send a formal patch.
> 
> Bo Yan has posted something really similar already, however:
> 
> https://patchwork.kernel.org/patch/10181101/
> 
> so I would prefer to apply a new version of that one with the latest
> comment taken into account:
> 
> https://patchwork.kernel.org/patch/10183075/
> 
> for the credit to go to the first submitter.
> 

Thanks for the information, Rafael.

I believe a safety check in both cpufreq_suspend() and
cpufreq_resume() would be a good thing to have.


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: Query related to usage of cpufreq_suspend() & cpufreq_resume

2018-02-02 Thread Prateek Sood
On 02/02/2018 05:18 PM, Rafael J. Wysocki wrote:
> On Friday, February 2, 2018 12:41:58 PM CET Prateek Sood wrote:
>> Hi Viresh,
>>
>> One scenario is there where a kernel panic is observed in
>> cpufreq during suspend/resume.
>>
>> pm_suspend()
>>   suspend_devices_and_enter()
>> dpm_suspend_start()
>>   dpm_prepare() 
>>
>> Failure in dpm_prepare() happend with following dmesg:
>>
>> [ 3746.316062] PM: Device xyz not prepared for power transition: code -16
>> [ 3746.316071] PM: Some devices failed to suspend, or early wake event 
>> detected
>>
>>
>> pm_suspend()
>>   suspend_devices_and_enter()
>> dpm_suspend_start()
>>   dpm_prepare() //failed
>> dpm_resume_end()
>>   dpm_resume()
>> cpufreq_resume()
>>   cpufreq_start_governor()
>> sugov_start()
>>   cpufreq_add_update_util_hook()
>>
>> After failure in dpm_prepare(), dpm_resume() called
>> cpufreq_resume(). Corresponding cpufreq_suspend() was not
>> called due to failure of dpm_prepare(). 
>>
>> This resulted in WARN_ON(per_cpu(cpufreq_update_util_data, cpu))
>> in cpufreq_add_update_util_hook() and cpufreq_add_update_util_hook->func
>> being inconsistent state. It caused crash in scheduler.
>>
>> Following are some of the ways to mitigate this issue. Could
>> you please provide feedback on below two approaches or suugest
>> a better way to fix this problem.
>>
>> ---8<--
>>
>> Co-developed-by: Gaurav Kohli <gko...@codeaurora.org>
>> Signed-off-by: Gaurav Kohli <gko...@codeaurora.org> 
>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
>>
>> diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
>> index 02a497e..732e5a2 100644
>> --- a/drivers/base/power/main.c
>> +++ b/drivers/base/power/main.c
>> @@ -1038,6 +1038,7 @@ void dpm_resume(pm_message_t state)
>>  {
>> struct device *dev;
>> ktime_t starttime = ktime_get();
>> +   bool valid_resume = false;
>>
>> trace_suspend_resume(TPS("dpm_resume"), state.event, true);
>> might_sleep();
>> @@ -1055,6 +1056,7 @@ void dpm_resume(pm_message_t state)
>> }
>>
>> while (!list_empty(_suspended_list)) {
>> +   valid_resume = true;
>> dev = to_device(dpm_suspended_list.next);
>> get_device(dev);
>> if (!is_async(dev)) {
>> @@ -1080,7 +1082,8 @@ void dpm_resume(pm_message_t state)
>> async_synchronize_full();
>> dpm_show_time(starttime, state, 0, NULL);
>>
>> -   cpufreq_resume();
>> +   if (valid_resume)
>> +   cpufreq_resume();
>> trace_suspend_resume(TPS("dpm_resume"), state.event, false);
>>  }
>>
>> 8<--
>>
>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
>>
>> diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
>> index 421f318..439eab8 100644
>> --- a/drivers/cpufreq/cpufreq.c
>> +++ b/drivers/cpufreq/cpufreq.c
>> @@ -1648,7 +1648,7 @@ void cpufreq_suspend(void)
>>  {
>> struct cpufreq_policy *policy;
>>
>> -   if (!cpufreq_driver)
>> +   if (!cpufreq_driver || cpufreq_suspended)
>> return;
>>
>> if (!has_target() && !cpufreq_driver->suspend)
>> @@ -1683,7 +1683,7 @@ void cpufreq_resume(void)
>> struct cpufreq_policy *policy;
>> int ret;
>>
>> -   if (!cpufreq_driver)
>> +   if (!cpufreq_driver || !cpufreq_suspended)
>> return;
>>
>> cpufreq_suspended = false;
> 
> Since we have cpufreq_suspended already, the second one is better.
> 
> Thanks,
> Rafael
> 

Thanks Rafael for the inputs; I will send a formal patch.


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Query related to usage of cpufreq_suspend() & cpufreq_resume

2018-02-02 Thread Prateek Sood
Hi Viresh,

There is a scenario where a kernel panic is observed in
cpufreq during suspend/resume.

pm_suspend()
  suspend_devices_and_enter()
dpm_suspend_start()
  dpm_prepare() 

Failure in dpm_prepare() happened with the following dmesg:

[ 3746.316062] PM: Device xyz not prepared for power transition: code -16
[ 3746.316071] PM: Some devices failed to suspend, or early wake event detected


pm_suspend()
  suspend_devices_and_enter()
dpm_suspend_start()
  dpm_prepare() //failed
dpm_resume_end()
  dpm_resume()
cpufreq_resume()
  cpufreq_start_governor()
sugov_start()
  cpufreq_add_update_util_hook()

After the failure in dpm_prepare(), dpm_resume() called
cpufreq_resume(). The corresponding cpufreq_suspend() was never
called because dpm_prepare() had failed.

This caused the WARN_ON(per_cpu(cpufreq_update_util_data, cpu)) in
cpufreq_add_update_util_hook() to fire and left the update-util hook
in an inconsistent state, which caused a crash in the scheduler.

Following are some ways to mitigate this issue. Could you please
provide feedback on the two approaches below, or suggest a better
way to fix this problem?

---8<--

Co-developed-by: Gaurav Kohli <gko...@codeaurora.org>
Signed-off-by: Gaurav Kohli <gko...@codeaurora.org> 
Signed-off-by: Prateek Sood <prs...@codeaurora.org>

diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 02a497e..732e5a2 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -1038,6 +1038,7 @@ void dpm_resume(pm_message_t state)
 {
struct device *dev;
ktime_t starttime = ktime_get();
+   bool valid_resume = false;

trace_suspend_resume(TPS("dpm_resume"), state.event, true);
might_sleep();
@@ -1055,6 +1056,7 @@ void dpm_resume(pm_message_t state)
}

while (!list_empty(&dpm_suspended_list)) {
+   valid_resume = true;
dev = to_device(dpm_suspended_list.next);
get_device(dev);
if (!is_async(dev)) {
@@ -1080,7 +1082,8 @@ void dpm_resume(pm_message_t state)
async_synchronize_full();
dpm_show_time(starttime, state, 0, NULL);

-   cpufreq_resume();
+   if (valid_resume)
+   cpufreq_resume();
trace_suspend_resume(TPS("dpm_resume"), state.event, false);
 }

8<------

Signed-off-by: Prateek Sood <prs...@codeaurora.org>

diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 421f318..439eab8 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -1648,7 +1648,7 @@ void cpufreq_suspend(void)
 {
struct cpufreq_policy *policy;

-   if (!cpufreq_driver)
+   if (!cpufreq_driver || cpufreq_suspended)
return;

if (!has_target() && !cpufreq_driver->suspend)
@@ -1683,7 +1683,7 @@ void cpufreq_resume(void)
struct cpufreq_policy *policy;
int ret;

-   if (!cpufreq_driver)
+   if (!cpufreq_driver || !cpufreq_suspended)
return;

cpufreq_suspended = false;
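
For what it is worth, the second diff is essentially the classic
"paired state" guard: suspend and resume each record whether they ran,
and a resume without a matching suspend becomes a no-op. A small
standalone sketch of that pattern (illustrative names, not the cpufreq
code):

#include <stdbool.h>
#include <stdio.h>

static bool suspended;	/* stands in for cpufreq_suspended */

static void fake_suspend(void)
{
	if (suspended)		/* already suspended: ignore repeated calls */
		return;
	suspended = true;
	printf("suspend work done\n");
}

static void fake_resume(void)
{
	if (!suspended)		/* no matching suspend: bail out, avoiding the WARN_ON case */
		return;
	suspended = false;
	printf("resume work done\n");
}

int main(void)
{
	fake_resume();		/* mismatched resume, as after a dpm_prepare() failure: no-op */
	fake_suspend();
	fake_resume();		/* properly paired: runs */
	return 0;
}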




Thanks

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2018-01-15 Thread Prateek Sood
On 01/02/2018 09:46 PM, Tejun Heo wrote:
> Hello,
> 
> On Fri, Dec 29, 2017 at 02:07:16AM +0530, Prateek Sood wrote:
>> task T is waiting for cpuset_mutex acquired
>> by kworker/2:1
>>
>> sh ==> cpuhp/2 ==> kworker/2:1 ==> sh 
>>
>> kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1
>>
>> It seems that my earlier patch set should fix this scenario:
>> 1) Inverting locking order of cpuset_mutex and cpu_hotplug_lock.
>> 2) Make cpuset hotplug work synchronous.
>>
>> Could you please share your feedback.
> 
> Hmm... this can also be resolved by adding WQ_MEM_RECLAIM to the
> synchronize rcu workqueue, right?  Given the wide-spread usages of
> synchronize_rcu and friends, maybe that's the right solution, or at
> least something we also need to do, for this particular deadlock?
> 
> Again, I don't have anything against making the domain rebuliding part
> of cpuset operations synchronous and these tricky deadlock scenarios
> do indicate that doing so would probably be beneficial.  That said,
> tho, these scenarios seem more of manifestations of other problems
> exposed through kthreadd dependency than anything else.
> 
> Thanks.
> 

Hi TJ,

Thanks for suggesting WQ_MEM_RECLAIM solution.

My understanding of WQ_MEM_RECLAIM was that it is meant for cases
where memory pressure could cause deadlocks.

This case does not seem to be a memory-pressure issue. Is overloading
WQ_MEM_RECLAIM as the solution to a different problem the correct
approach?

This scenario can be resolved by using WQ_MEM_RECLAIM and a separate
workqueue for RCU. But there remains a possibility that, in the
future, some cpu hotplug callback uses another predefined workqueue
that does not have the WQ_MEM_RECLAIM flag.

Please let me know your feedback on this.
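
For context, creating such a workqueue is a one-liner; the point of
WQ_MEM_RECLAIM is that the workqueue gets a dedicated rescuer thread,
so queued work can make progress even when new kworkers cannot be
created. A minimal module-style sketch (hypothetical names, not an
actual kernel change being proposed here):

#include <linux/module.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;

static void demo_work_fn(struct work_struct *work)
{
	pr_info("ran on a WQ_MEM_RECLAIM workqueue\n");
}
static DECLARE_WORK(demo_work, demo_work_fn);

static int __init demo_init(void)
{
	/* WQ_MEM_RECLAIM guarantees a rescuer, so execution of demo_work
	 * does not depend on kthreadd being able to spawn new workers. */
	demo_wq = alloc_workqueue("demo_reclaim_wq", WQ_MEM_RECLAIM, 0);
	if (!demo_wq)
		return -ENOMEM;
	queue_work(demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	destroy_workqueue(demo_wq);	/* drains pending work before freeing */
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");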


Thanks


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-28 Thread Prateek Sood
On 12/13/2017 09:36 PM, Tejun Heo wrote:
> Hello, Prateek.
> 
> On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote:
>> This change makes the usage of cpuset_hotplug_workfn() from cpu
>> hotplug path synchronous. For memory hotplug it still remains
>> asynchronous.
> 
> Ah, right.
> 
>> Memory migration happening from cpuset_hotplug_workfn() is
>> already asynchronous by queuing cpuset_migrate_mm_workfn() in
>> cpuset_migrate_mm_wq.
>>
>> cpuset_hotplug_workfn()
>>cpuset_hotplug_workfn(()
>>   cpuset_migrate_mm()
>>  queue_work(cpuset_migrate_mm_wq)
>>
>> It seems that memory migration latency might not have
>> impact with this change.
>>
>> Please let me know if you meant something else by cpuset
>> migration taking time when memory migration is turned on.
> 
> No, I didn't.  I was just confused about which part became
> synchronous.  So, I don't have anything against making the cpu part
> synchronous, but let's not do that as the fix to the deadlocks cuz,
> while we can avoid them by changing cpuset, I don't think cpuset is
> the root cause for them.  If there are benefits to making cpuset cpu
> migration synchronous, let's do that for those benefits.
> 
> Thanks.
> 

TJ,

One more deadlock scenario

Task: sh 
[] wait_for_completion+0x14
[] cpuhp_kick_ap_work+0x80 //waiting for cpuhp/2
[] _cpu_down+0xe0
[] cpu_down+0x38
[] cpu_subsys_offline+0x10

Task: cpuhp/2 
[] schedule+0x38
[] _synchronize_rcu_expedited+0x2ec
[] synchronize_sched_expedited+0x60
[] synchronize_sched+0xb0
[] sugov_stop+0x58
[] cpufreq_stop_governor+0x48
[] cpufreq_offline+0x84
[] cpuhp_cpufreq_offline+0xc
[] cpuhp_invoke_callback+0xac
[] cpuhp_down_callbacks+0x58
[] cpuhp_thread_fun+0xa8

_synchronize_rcu_expedited is waiting for execution of rcu
expedited grace period work item wait_rcu_exp_gp()

Task: kworker/2:1 
[] schedule+0x38
[] schedule_preempt_disabled+0x20
[] __mutex_lock_slowpath+0x158
[] mutex_lock+0x14
[] get_online_cpus+0x34 //waiting for cpu_hotplug_lock
[] rebuild_sched_domains+0x30
[] cpuset_hotplug_workfn+0xb8
[] process_one_work+0x168
[] worker_thread+0x140
[] kthread+0xe0

cpu_hotplug_lock is acquired by task: sh


Task: kworker/2:3
[] schedule+0x38
[] schedule_timeout+0x1d8
[] wait_for_common+0xb4
[] wait_for_completion_killable+0x14 //waiting for kthreadd 
[] __kthread_create_on_node+0xec
[] kthread_create_on_node+0x64
[] create_worker+0xb4
[] worker_thread+0x2e0
[] kthread+0xe0


Task: kthreadd
[] __switch_to+0x94
[] __schedule+0x2a8
[] schedule+0x38
[] rwsem_down_read_failed+0xe8
[] __percpu_down_read+0xfc
[] copy_process.isra.72.part.73+0xf60
[] _do_fork+0xc4
[] kernel_thread+0x34
[] kthreadd+0x144

kthreadd is waiting for cgroup_threadgroup_rwsem acquired
by task T

Task: T
[] schedule+0x38
[] schedule_preempt_disabled+0x20
[] __mutex_lock_slowpath+0x158
[] mutex_lock+0x14 
[] cpuset_can_attach+0x58
[] cgroup_taskset_migrate+0x8c
[] cgroup_migrate+0xa4
[] cgroup_attach_task+0x100
[] __cgroup_procs_write.isra.35+0x228
[] cgroup_tasks_write+0x10
[] cgroup_file_write+0x44
[] kernfs_fop_write+0xc0

task T is waiting for cpuset_mutex acquired
by kworker/2:1

sh ==> cpuhp/2 ==> kworker/2:1 ==> sh 

kworker/2:3 ==> kthreadd ==> Task T ==> kworker/2:1

It seems that my earlier patch set should fix this scenario:
1) Invert the locking order of cpuset_mutex and cpu_hotplug_lock.
2) Make the cpuset hotplug work synchronous.


Could you please share your feedback?


Thanks

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH] cgroup: Fix deadlock in cpu hotplug path

2017-12-18 Thread Prateek Sood
Deadlock during cgroup migration from cpu hotplug path when a task T is
being moved from source to destination cgroup.

kworker/0:0
cpuset_hotplug_workfn()
   cpuset_hotplug_update_tasks()
  hotplug_update_tasks_legacy()
remove_tasks_in_empty_cpuset()
  cgroup_transfer_tasks() // stuck in iterator loop
cgroup_migrate()
  cgroup_migrate_add_task()

cgroup_migrate_add_task() checks the PF_EXITING flag of task T, so
task T will not migrate to the destination cgroup.
css_task_iter_start() will keep returning task T in the loop, waiting
for task T's cg_list node to be removed.

Task T
do_exit()
  exit_signals() // sets PF_EXITING
  exit_task_namespaces()
switch_task_namespaces()
  free_nsproxy()
put_mnt_ns()
  drop_collected_mounts()
namespace_unlock()
  synchronize_rcu()
_synchronize_rcu_expedited()
  schedule_work() // on cpu0 low priority worker pool
  wait_event() // waiting for work item to execute

Task T inserted a work item into the worklist of the cpu0 low-priority
worker pool and is waiting for the expedited grace period work item to
execute. That work item will only be executed once kworker/0:0
completes execution of cpuset_hotplug_workfn().

kworker/0:0 ==> Task T ==> kworker/0:0

When the task being migrated from the source to the destination cgroup
is in PF_EXITING state, migrate the next available task in the source
cgroup instead.

Change-Id: I8874fb04479c136cae4dabd5c168c7749df4
Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 kernel/cgroup/cgroup-v1.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..a2c05d2 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 */
 	do {
 		css_task_iter_start(&from->self, 0, &it);
-		task = css_task_iter_next(&it);
+
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+
 		if (task)
 			get_task_struct(task);
 		css_task_iter_end(&it);
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.
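
The livelock fixed above can be seen in miniature: a loop that restarts
its iterator each pass and always picks the first entry will spin
forever on an entry it can never process; skipping such entries, as the
patch does for PF_EXITING tasks, lets the loop drain the rest. A small
standalone userspace sketch of the fixed loop shape (illustrative
names, not the cgroup code):

#include <stdbool.h>
#include <stdio.h>

struct fake_task {
	int id;
	bool exiting;	/* stands in for PF_EXITING */
	bool migrated;
};

static struct fake_task tasks[] = {
	{ 1, true,  false },	/* can never be migrated; pinned the old loop */
	{ 2, false, false },
	{ 3, false, false },
};

/* "restart the iterator": first unmigrated task, skipping exiting ones */
static struct fake_task *next_migratable(void)
{
	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		if (!tasks[i].migrated && !tasks[i].exiting)
			return &tasks[i];
	return NULL;
}

int main(void)
{
	struct fake_task *t;

	/* same shape as the fixed cgroup_transfer_tasks() loop */
	while ((t = next_migratable())) {
		t->migrated = true;
		printf("migrated task %d\n", t->id);
	}
	printf("task 1 left behind for the exit path, as intended\n");
	return 0;
}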



Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-15 Thread Prateek Sood
On 12/15/2017 06:52 PM, Tejun Heo wrote:
> Hello, Prateek.
> 
> On Fri, Dec 15, 2017 at 02:24:55PM +0530, Prateek Sood wrote:
>> Following are two ways to improve cgroup_transfer_tasks(). In
>> both cases task in PF_EXITING state would be left in source
>> cgroup. It would be removed from cgroup_exit() in exit path.
>>
>> diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
>> index 024085d..e2bdcdb 100644
>> --- a/kernel/cgroup/cgroup-v1.c
>> +++ b/kernel/cgroup/cgroup-v1.c
>> @@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct 
>> cgroup *from)
>>  */
>> do {
>> css_task_iter_start(>self, 0, );
>> -   task = css_task_iter_next();
>> +   do {
>> +   task = css_task_iter_next();
>> +   } while (task && (task & PF_EXITING))
>> +
> 
> Yeah, this looks good to me.  We can't just make a single pass as in
> the other one because we can race aginst fork.  And PF_EXITING being
> left behind is what was happening previously too anyway.  They can't
> be moved.
> 
> Thanks.
> 

Thanks, TJ, for reviewing.

I will send a formal patch with the approach approved above.


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-15 Thread Prateek Sood
On 12/13/2017 09:36 PM, Tejun Heo wrote:
> Hello, Prateek.
> 
> On Wed, Dec 13, 2017 at 01:20:46PM +0530, Prateek Sood wrote:
>> This change makes the usage of cpuset_hotplug_workfn() from cpu
>> hotplug path synchronous. For memory hotplug it still remains
>> asynchronous.
> 
> Ah, right.
> 
>> Memory migration happening from cpuset_hotplug_workfn() is
>> already asynchronous by queuing cpuset_migrate_mm_workfn() in
>> cpuset_migrate_mm_wq.
>>
>> cpuset_hotplug_workfn()
>>cpuset_hotplug_workfn(()
>>   cpuset_migrate_mm()
>>  queue_work(cpuset_migrate_mm_wq)
>>
>> It seems that memory migration latency might not have
>> impact with this change.
>>
>> Please let me know if you meant something else by cpuset
>> migration taking time when memory migration is turned on.
> 
> No, I didn't.  I was just confused about which part became
> synchronous.  So, I don't have anything against making the cpu part
> synchronous, but let's not do that as the fix to the deadlocks cuz,
> while we can avoid them by changing cpuset, I don't think cpuset is
> the root cause for them.  If there are benefits to making cpuset cpu
> migration synchronous, let's do that for those benefits.

Making the CPU part synchronous can only be achieved if we retain the
patch inverting the locking order of cpuset_mutex and
cpu_hotplug_lock.
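
As a reminder of why the inversion matters: a cycle only exists if two
paths take the same pair of locks in opposite orders; once every path
agrees on one order, the ABBA deadlock is impossible. A tiny userspace
sketch of the agreed-order rule (illustrative names only, not the
kernel locks themselves):

#include <pthread.h>
#include <stdio.h>

/* stand-ins for cpu_hotplug_lock (A) and cpuset_mutex (B) */
static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* every path takes A before B; taking B before A anywhere reintroduces the cycle */
static void *hotplug_path(void *arg)
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	puts("hotplug path done");
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

static void *cpuset_path(void *arg)
{
	pthread_mutex_lock(&lock_a);
	pthread_mutex_lock(&lock_b);
	puts("cpuset path done");
	pthread_mutex_unlock(&lock_b);
	pthread_mutex_unlock(&lock_a);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	pthread_create(&t1, NULL, hotplug_path, NULL);
	pthread_create(&t2, NULL, cpuset_path, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}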

> 
> Thanks.
> 

Peter,

Do you suggest taking both of the patches:
1) Invert the locking order of cpuset_mutex and cpu_hotplug_lock.
2) Make the cpuset hotplug work synchronous.

or

https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d

I would leave it for you and TJ to decide on this.

Thanks



-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-15 Thread Prateek Sood
On 12/13/2017 09:10 PM, Tejun Heo wrote:
Hi TJ,

> Hello, Prateek.
> 
> On Wed, Dec 13, 2017 at 07:58:24PM +0530, Prateek Sood wrote:
>> Did you mean something like below. If not then could you
>> please share a patch for this problem in
>> cgroup_transfer_tasks().
> 
> Oh we surely can add a new iterator but we can just count in
> cgroup_transfer_tasks() too, right?
I did not get what you meant by this. Could you please share a
patch for this?
> 
> Thanks.
> 

Following are two ways to improve cgroup_transfer_tasks(). In both
cases a task in PF_EXITING state would be left in the source cgroup;
it would be removed by cgroup_exit() in the exit path.

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..e2bdcdb 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -123,7 +123,10 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 */
 	do {
 		css_task_iter_start(&from->self, 0, &it);
-		task = css_task_iter_next(&it);
+		do {
+			task = css_task_iter_next(&it);
+		} while (task && (task->flags & PF_EXITING));
+
 		if (task)
 			get_task_struct(task);
 		css_task_iter_end(&it);

8<-

diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..843b8bb 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -121,12 +121,11 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 * Migrate tasks one-by-one until @from is empty.  This fails iff
 	 * ->can_attach() fails.
 	 */
+	css_task_iter_start(&from->self, 0, &it);
 	do {
-		css_task_iter_start(&from->self, 0, &it);
 		task = css_task_iter_next(&it);
 		if (task)
 			get_task_struct(task);
-		css_task_iter_end(&it);
 
 		if (task) {
 			ret = cgroup_migrate(task, false, &mgctx);
@@ -135,6 +134,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 			put_task_struct(task);
 		}
 	} while (task && !ret);
+	css_task_iter_end(&it);
 out_err:
 	cgroup_migrate_finish(&mgctx);
 	percpu_up_write(&cgroup_threadgroup_rwsem);


Thanks


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-13 Thread Prateek Sood
On 12/11/2017 09:02 PM, Tejun Heo wrote:
> Hello, Prateek.
> 
> On Fri, Dec 08, 2017 at 05:15:55PM +0530, Prateek Sood wrote:
>> There is one deadlock issue during cgroup migration from cpu
>> hotplug path when a task T is being moved from source to
>> destination cgroup.
>>
>> kworker/0:0
>> cpuset_hotplug_workfn()
>>cpuset_hotplug_update_tasks()
>>   hotplug_update_tasks_legacy()
>> remove_tasks_in_empty_cpuset()
>>   cgroup_transfer_tasks() // stuck in iterator loop
>> cgroup_migrate()
>>   cgroup_migrate_add_task()
>>
>> In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T.
>> Task T will not migrate to destination cgroup. css_task_iter_start()
>> will keep pointing to task T in loop waiting for task T cg_list node
>> to be removed.
> 
> Heh, that's a bug in cgroup_transfer_tasks() which happened because I
> forgot to update when we changed how we handle exiting tasks.  The
> right thing to do here is making cgroup_transfer_tasks() repeat iff
> there were a valid migration target which didn't get transferred.
> 
> Thanks.
> 

Hi TJ,

Did you mean something like below? If not, could you
please share a patch for this problem in
cgroup_transfer_tasks().

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 473e0c0..41de618 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -143,6 +143,8 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset,
 
 void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 			 struct css_task_iter *it);
+void css_task_migrate_iter_start(struct cgroup_subsys_state *css,
+				 unsigned int flags, struct css_task_iter *it);
 struct task_struct *css_task_iter_next(struct css_task_iter *it);
 void css_task_iter_end(struct css_task_iter *it);
 
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 024085d..12279ae 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -122,7 +122,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from)
 	 * ->can_attach() fails.
 	 */
 	do {
-		css_task_iter_start(&from->self, 0, &it);
+		css_task_migrate_iter_start(&from->self, 0, &it);
 		task = css_task_iter_next(&it);
 		if (task)
 			get_task_struct(task);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 0b1ffe1..3c1d2d2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -4189,6 +4189,42 @@ void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags,
 	spin_unlock_irq(&css_set_lock);
 }
 
+void css_task_migrate_iter_start(struct cgroup_subsys_state *css,
+				 unsigned int flags, struct css_task_iter *it)
+{
+	struct task_struct *task = NULL;
+	/* no one should try to iterate before mounting cgroups */
+	WARN_ON_ONCE(!use_task_css_set_links);
+
+	memset(it, 0, sizeof(*it));
+
+	spin_lock_irq(&css_set_lock);
+
+	it->ss = css->ss;
+	it->flags = flags;
+
+	if (it->ss)
+		it->cset_pos = &css->cgroup->e_csets[css->ss->id];
+	else
+		it->cset_pos = &css->cgroup->cset_links;
+
+	it->cset_head = it->cset_pos;
+
+	css_task_iter_advance_css_set(it);
+
+	while (it->task_pos) {
+		task = list_entry(it->task_pos, struct task_struct,
+				  cg_list);
+
+		if (likely(!(task->flags & PF_EXITING)))
+			break;
+
+		css_task_iter_advance(it);
+	}
+
+	spin_unlock_irq(&css_set_lock);
+}
+
 /**
  * css_task_iter_next - return the next task for the iterator
  * @it: the task iterator being iterated




Thanks

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-12 Thread Prateek Sood
On 12/11/2017 08:50 PM, Tejun Heo wrote:
> Hello, Peter.
> 
> On Tue, Dec 05, 2017 at 12:01:17AM +0100, Peter Zijlstra wrote:
>>> AFAICS, this should remove the circular dependency you originally
>>> reported.  I'll revert the two cpuset commits for now.
>>
>> So I liked his patches in that we would be able to go back to
>> synchronous sched_domain building.
> 
> Ah, yeah, that's a separate issue but didn't we intentionally make
> that asynchronous?  IIRC, cpuset migration can take a really long time
> when the memory migration is turned on and doing that synchronously
> could mess up the system.
> 
> Thanks.
> 

Hi TJ,

This change makes the call to cpuset_hotplug_workfn() from the cpu
hotplug path synchronous. For memory hotplug it still remains
asynchronous.

Memory migration happening from cpuset_hotplug_workfn() is
already asynchronous, since it is queued as cpuset_migrate_mm_workfn()
on cpuset_migrate_mm_wq:

cpuset_hotplug_workfn()
   cpuset_migrate_mm()
      queue_work(cpuset_migrate_mm_wq)

It seems that memory migration latency might not be impacted
by this change.

Please let me know if you meant something else by cpuset
migration taking time when memory migration is turned on.
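
For reference, a rough sketch of the deferral pattern in question
(illustrative only; the struct and function names below are
hypothetical, not the actual cpuset code): the expensive page
migration is packaged as a work item and queued on a dedicated
workqueue, so the caller returns without waiting for it.

struct mm_migrate_work {
	struct work_struct	work;
	struct mm_struct	*mm;		/* caller holds a reference (mmget()) */
	nodemask_t		from, to;
};

static struct workqueue_struct *migrate_wq;	/* like cpuset_migrate_mm_wq */

static void mm_migrate_workfn(struct work_struct *w)
{
	struct mm_migrate_work *mw = container_of(w, struct mm_migrate_work, work);

	/* do_migrate_pages(mw->mm, &mw->from, &mw->to, MPOL_MF_MOVE_ALL)
	 * would run here, off the CPU hotplug path */
	mmput(mw->mm);
	kfree(mw);
}

static void queue_mm_migration(struct mm_migrate_work *mw)
{
	INIT_WORK(&mw->work, mm_migrate_workfn);
	queue_work(migrate_wq, &mw->work);	/* asynchronous w.r.t. the caller */
}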


Thanks

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-08 Thread Prateek Sood
On 12/08/2017 03:10 PM, Prateek Sood wrote:
> On 12/05/2017 04:31 AM, Peter Zijlstra wrote:
>> On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote:
>>> Hello, again.
>>>
>>> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote:
>>>> Hello,
>>>>
>>>> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
>>>>> Any feedback/suggestion for this patch?
>>>>
>>>> Sorry about the delay.  I'm a bit worried because it feels like we're
>>>> chasing a squirrel.  I'll think through the recent changes and this
>>>> one and get back to you.
>>>
>>> Can you please take a look at the following pending commit?
>>>
>>>   
>>> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
>>>
>>> AFAICS, this should remove the circular dependency you originally
>>> reported.  I'll revert the two cpuset commits for now.
>>
>> So I liked his patches in that we would be able to go back to
>> synchronous sched_domain building.
>>
> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
> 
> This will fix the original circular locking dependency issue.
> I will let you both (Peter & TJ) to decide on which one to
> pick.
> 
> 
> 

TJ & Peter,

There is one deadlock issue during cgroup migration from cpu
hotplug path when a task T is being moved from source to
destination cgroup.

kworker/0:0
cpuset_hotplug_workfn()
   cpuset_hotplug_update_tasks()
  hotplug_update_tasks_legacy()
remove_tasks_in_empty_cpuset()
  cgroup_transfer_tasks() // stuck in iterator loop
cgroup_migrate()
  cgroup_migrate_add_task()

In cgroup_migrate_add_task() it checks for PF_EXITING flag of task T.
Task T will not migrate to destination cgroup. css_task_iter_start()
will keep pointing to task T in loop waiting for task T cg_list node
to be removed.
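
For clarity, the shape of the loop that gets stuck looks roughly like
this (a simplified sketch based on the description above, not a
verbatim copy of cgroup_transfer_tasks()):

	do {
		css_task_iter_start(&from->self, 0, &it);
		task = css_task_iter_next(&it);	/* keeps returning task T */
		if (task)
			get_task_struct(task);
		css_task_iter_end(&it);

		if (task) {
			/* T has PF_EXITING set, so cgroup_migrate_add_task()
			 * skips it and cgroup_migrate() returns 0 */
			ret = cgroup_migrate(task, false, &mgctx);
			put_task_struct(task);
		}
	} while (task && !ret);	/* task != NULL && ret == 0: loops again
				 * until T's cg_list node goes away */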

Task T
do_exit()
  exit_signals() // sets PF_EXITING
  exit_task_namespaces()
switch_task_namespaces()
  free_nsproxy()
put_mnt_ns()
  drop_collected_mounts()
namespace_unlock()
  synchronize_rcu()
_synchronize_rcu_expedited()
  schedule_work() // on cpu0 low priority worker pool
  wait_event() // waiting for work item to execute

Task T inserted a work item in the worklist of cpu0 low priority
worker pool. It is waiting for expedited grace period work item
to execute. This work item will only be executed once kworker/0:0
complete execution of cpuset_hotplug_workfn().

kworker/0:0 ==> Task T ==>kworker/0:0
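
As a rough illustration of this dependency shape (not the actual
kernel path; the workqueue and work items below are hypothetical):
on an ordered workqueue, a work item that waits for another item
queued behind it can never complete, because at most one item is in
flight at a time.

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* alloc_ordered_workqueue("ex", 0) */

static void work_b_fn(struct work_struct *w) { /* would unblock work A */ }
static DECLARE_WORK(work_b, work_b_fn);

static void work_a_fn(struct work_struct *w)
{
	queue_work(example_wq, &work_b);	/* queued behind us */
	flush_work(&work_b);			/* waits for B, but B cannot start
						 * until A (this function) returns */
}
static DECLARE_WORK(work_a, work_a_fn);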
 
The following suggested patch might not be able to fix the
above-mentioned case:
https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d

A combination of the following patches fixes the above-mentioned
scenario as well:
1) Inverting cpuset_mutex and cpu_hotplug_lock locking sequence
2) Making cpuset hotplug work synchronous 



-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-08 Thread Prateek Sood
On 12/05/2017 04:31 AM, Peter Zijlstra wrote:
> On Mon, Dec 04, 2017 at 02:58:25PM -0800, Tejun Heo wrote:
>> Hello, again.
>>
>> On Mon, Dec 04, 2017 at 12:22:19PM -0800, Tejun Heo wrote:
>>> Hello,
>>>
>>> On Mon, Dec 04, 2017 at 10:44:49AM +0530, Prateek Sood wrote:
>>>> Any feedback/suggestion for this patch?
>>>
>>> Sorry about the delay.  I'm a bit worried because it feels like we're
>>> chasing a squirrel.  I'll think through the recent changes and this
>>> one and get back to you.
>>
>> Can you please take a look at the following pending commit?
>>
>>   
>> https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d
>>
>> AFAICS, this should remove the circular dependency you originally
>> reported.  I'll revert the two cpuset commits for now.
> 
> So I liked his patches in that we would be able to go back to
> synchronous sched_domain building.
> 
https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git/commit/?h=for-4.15-fixes=e8b3f8db7aad99fcc5234fc5b89984ff6620de3d

This will fix the original circular locking dependency issue.
I will let you both (Peter & TJ) decide which one to
pick.



-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: fix circular locking dependency

2017-12-03 Thread Prateek Sood
On 11/28/2017 05:05 PM, Prateek Sood wrote:
> CPU1
> cpus_read_lock+0x3e/0x80
> static_key_slow_inc+0xe/0xa0
> cpuset_css_online+0x62/0x330
> online_css+0x26/0x80
> cgroup_apply_control_enable+0x266/0x3d0
> cgroup_mkdir+0x37d/0x4f0
> kernfs_iop_mkdir+0x53/0x80
> vfs_mkdir+0x10e/0x1a0
> SyS_mkdirat+0xb3/0xe0
> entry_SYSCALL_64_fastpath+0x23/0x9a
> 
> CPU0
> lock_acquire+0xec/0x1e0
> __mutex_lock+0x89/0x920
> cpuset_write_resmask+0x61/0x1100
> cgroup_file_write+0x7b/0x200
> kernfs_fop_write+0x112/0x1a0
> __vfs_write+0x23/0x150
> vfs_write+0xc8/0x1c0
> SyS_write+0x45/0xa0
> entry_SYSCALL_64_fastpath+0x23/0x9a
> 
> CPU0                                CPU1
> ----                                ----
> lock(cpu_hotplug_lock.rw_sem);
>                                     lock(cpuset_mutex);
>                                     lock(cpu_hotplug_lock.rw_sem);
> lock(cpuset_mutex);
> 
> Change locking order of cpu_hotplug_lock.rw_sem and
> cpuset_mutex in cpuset_css_online(). Use _cpuslocked()
> version for static_branch_inc/static_branch_dec in
> cpuset_inc()/cpuset_dec().
> 
> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
> ---
>  include/linux/cpuset.h |  8 
>  include/linux/jump_label.h | 10 --
>  kernel/cgroup/cpuset.c |  4 ++--
>  kernel/jump_label.c| 13 +
>  4 files changed, 27 insertions(+), 8 deletions(-)
> 
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index 2ab910f..5aadc25 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
>  
>  static inline void cpuset_inc(void)
>  {
> - static_branch_inc(_pre_enable_key);
> - static_branch_inc(_enabled_key);
> + static_branch_inc_cpuslocked(_pre_enable_key);
> + static_branch_inc_cpuslocked(_enabled_key);
>  }
>  
>  static inline void cpuset_dec(void)
>  {
> - static_branch_dec(_enabled_key);
> - static_branch_dec(_pre_enable_key);
> + static_branch_dec_cpuslocked(_enabled_key);
> + static_branch_dec_cpuslocked(_pre_enable_key);
>  }
>  
>  extern int cpuset_init(void);
> diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
> index c7b368c..890e21c 100644
> --- a/include/linux/jump_label.h
> +++ b/include/linux/jump_label.h
> @@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct 
> jump_entry *entry,
>  extern int jump_label_text_reserved(void *start, void *end);
>  extern void static_key_slow_inc(struct static_key *key);
>  extern void static_key_slow_dec(struct static_key *key);
> +extern void static_key_slow_incr_cpuslocked(struct static_key *key);
> +extern void static_key_slow_decr_cpuslocked(struct static_key *key);
>  extern void jump_label_apply_nops(struct module *mod);
>  extern int static_key_count(struct static_key *key);
>  extern void static_key_enable(struct static_key *key);
> @@ -259,6 +261,8 @@ static inline void static_key_disable(struct static_key 
> *key)
>  
>  #define static_key_enable_cpuslocked(k)  static_key_enable((k))
>  #define static_key_disable_cpuslocked(k) static_key_disable((k))
> +#define static_key_slow_incr_cpuslocked(k)   static_key_slow_inc((k))
> +#define static_key_slow_decr_cpuslocked(k)   static_key_slow_dec((k))
>  
>  #define STATIC_KEY_INIT_TRUE { .enabled = ATOMIC_INIT(1) }
>  #define STATIC_KEY_INIT_FALSE{ .enabled = ATOMIC_INIT(0) }
> @@ -414,8 +418,10 @@ struct static_key_false {
>   * Advanced usage; refcount, branch is enabled when: count != 0
>   */
>  
> -#define static_branch_inc(x) static_key_slow_inc(&(x)->key)
> -#define static_branch_dec(x) static_key_slow_dec(&(x)->key)
> +#define static_branch_inc(x) static_key_slow_inc(&(x)->key)
> +#define static_branch_dec(x) static_key_slow_dec(&(x)->key)
> +#define static_branch_inc_cpuslocked(x)  
> static_key_slow_incr_cpuslocked(&(x)->key)
> +#define static_branch_dec_cpuslocked(x)  
> static_key_slow_decr_cpuslocked(&(x)->key)
>  
>  /*
>   * Normal usage; boolean enable/disable.
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 227bc25..4ad8bae 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -1985,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state 
> *css)
>   if (!parent)
>   return 0;
>  
> - mutex_lock(_mutex);
> + cpuset_sched_change_begin();
>  
>   set_bit(CS_ONLINE, >flags);
>   if (is_spread_page(parent))
> @@ -2034,7 +2034,7 @@ static int cpuset_css_online(

[PATCH] cgroup/cpuset: fix circular locking dependency

2017-11-28 Thread Prateek Sood
CPU1
cpus_read_lock+0x3e/0x80
static_key_slow_inc+0xe/0xa0
cpuset_css_online+0x62/0x330
online_css+0x26/0x80
cgroup_apply_control_enable+0x266/0x3d0
cgroup_mkdir+0x37d/0x4f0
kernfs_iop_mkdir+0x53/0x80
vfs_mkdir+0x10e/0x1a0
SyS_mkdirat+0xb3/0xe0
entry_SYSCALL_64_fastpath+0x23/0x9a

CPU0
lock_acquire+0xec/0x1e0
__mutex_lock+0x89/0x920
cpuset_write_resmask+0x61/0x1100
cgroup_file_write+0x7b/0x200
kernfs_fop_write+0x112/0x1a0
__vfs_write+0x23/0x150
vfs_write+0xc8/0x1c0
SyS_write+0x45/0xa0
entry_SYSCALL_64_fastpath+0x23/0x9a

CPU0                                CPU1
----                                ----
lock(cpu_hotplug_lock.rw_sem);
                                    lock(cpuset_mutex);
                                    lock(cpu_hotplug_lock.rw_sem);
lock(cpuset_mutex);

Change locking order of cpu_hotplug_lock.rw_sem and
cpuset_mutex in cpuset_css_online(). Use _cpuslocked()
version for static_branch_inc/static_branch_dec in
cpuset_inc()/cpuset_dec().
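
As a side note (illustration only, not part of this patch; the helper
names below are hypothetical): with the order inverted, every path
that needs both locks takes cpu_hotplug_lock (read side) first and
cpuset_mutex second, so the AB-BA cycle shown above can no longer
form.

static void example_sched_change_begin(void)
{
	cpus_read_lock();		/* cpu_hotplug_lock, read side (outer) */
	mutex_lock(&cpuset_mutex);	/* cpuset_mutex (inner) */
}

static void example_sched_change_end(void)
{
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
}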

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 include/linux/cpuset.h |  8 
 include/linux/jump_label.h | 10 --
 kernel/cgroup/cpuset.c |  4 ++--
 kernel/jump_label.c| 13 +
 4 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2ab910f..5aadc25 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -40,14 +40,14 @@ static inline bool cpusets_enabled(void)
 
 static inline void cpuset_inc(void)
 {
-   static_branch_inc(&cpusets_pre_enable_key);
-   static_branch_inc(&cpusets_enabled_key);
+   static_branch_inc_cpuslocked(&cpusets_pre_enable_key);
+   static_branch_inc_cpuslocked(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-   static_branch_dec(&cpusets_enabled_key);
-   static_branch_dec(&cpusets_pre_enable_key);
+   static_branch_dec_cpuslocked(&cpusets_enabled_key);
+   static_branch_dec_cpuslocked(&cpusets_pre_enable_key);
 }
 
 extern int cpuset_init(void);
diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index c7b368c..890e21c 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -160,6 +160,8 @@ extern void arch_jump_label_transform_static(struct 
jump_entry *entry,
 extern int jump_label_text_reserved(void *start, void *end);
 extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
+extern void static_key_slow_incr_cpuslocked(struct static_key *key);
+extern void static_key_slow_decr_cpuslocked(struct static_key *key);
 extern void jump_label_apply_nops(struct module *mod);
 extern int static_key_count(struct static_key *key);
 extern void static_key_enable(struct static_key *key);
@@ -259,6 +261,8 @@ static inline void static_key_disable(struct static_key 
*key)
 
 #define static_key_enable_cpuslocked(k)static_key_enable((k))
 #define static_key_disable_cpuslocked(k)   static_key_disable((k))
+#define static_key_slow_incr_cpuslocked(k) static_key_slow_inc((k))
+#define static_key_slow_decr_cpuslocked(k) static_key_slow_dec((k))
 
 #define STATIC_KEY_INIT_TRUE   { .enabled = ATOMIC_INIT(1) }
 #define STATIC_KEY_INIT_FALSE  { .enabled = ATOMIC_INIT(0) }
@@ -414,8 +418,10 @@ struct static_key_false {
  * Advanced usage; refcount, branch is enabled when: count != 0
  */
 
-#define static_branch_inc(x)   static_key_slow_inc(&(x)->key)
-#define static_branch_dec(x)   static_key_slow_dec(&(x)->key)
+#define static_branch_inc(x)   static_key_slow_inc(&(x)->key)
+#define static_branch_dec(x)   static_key_slow_dec(&(x)->key)
+#define static_branch_inc_cpuslocked(x)	static_key_slow_incr_cpuslocked(&(x)->key)
+#define static_branch_dec_cpuslocked(x)	static_key_slow_decr_cpuslocked(&(x)->key)
 
 /*
  * Normal usage; boolean enable/disable.
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 227bc25..4ad8bae 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1985,7 +1985,7 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
if (!parent)
return 0;
 
-   mutex_lock(&cpuset_mutex);
+   cpuset_sched_change_begin();
 
set_bit(CS_ONLINE, >flags);
if (is_spread_page(parent))
@@ -2034,7 +2034,7 @@ static int cpuset_css_online(struct cgroup_subsys_state 
*css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
 out_unlock:
-   mutex_unlock(&cpuset_mutex);
+   cpuset_sched_change_end();
return 0;
 }
 
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 8594d24..dde0eaa 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -126,6 +126,12 @@ void static_key_slow_inc(struct static_key *key)
 }
 EXPORT_SYMBOL_GPL(static_key_slow_inc);
 
+void static_key_slow_incr_cpuslocked(struct static_key *key)
+{
+   static_key_slow_inc_cpuslocked(ke

Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock

2017-11-15 Thread Prateek Sood
On 11/15/2017 10:35 PM, Tejun Heo wrote:
> On Wed, Nov 15, 2017 at 11:37:42AM +0100, Peter Zijlstra wrote:
>> On Wed, Nov 15, 2017 at 03:56:26PM +0530, Prateek Sood wrote:
>>> Any improvement/suggestion for this patch?
>>
>> I would have done 2 patches, the first one solving the locking issue,
>> the second removing the then redundant async rebuild stuff.
>>
>> Other than that this looks OK I suppose, but I was expecting this to go
>> through the cgroup tree, TJ?
> 
> Will pick them up after -rc1.
> 
> Thanks.
> 

I have made two patches as suggested by Peter and sent for review. 

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH v3 2/2] cpuset: Make cpuset hotplug synchronous

2017-11-15 Thread Prateek Sood
Convert cpuset_hotplug_workfn() into a synchronous call for the cpu
hotplug path. For the memory hotplug path it still gets queued as a
work item.

Since cpuset_hotplug_workfn() can be made synchronous for the cpu
hotplug path, it is no longer required to wait for cpuset hotplug
while thawing processes.

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 include/linux/cpuset.h |  6 --
 kernel/cgroup/cpuset.c | 41 -
 kernel/power/process.c |  2 --
 kernel/sched/core.c|  1 -
 4 files changed, 20 insertions(+), 30 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 1b8e415..2ab910f 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -52,9 +52,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -167,15 +165,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_force_rebuild(void) { }
-
 static inline void cpuset_update_active_cpus(void)
 {
partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void cpuset_cpus_allowed(struct task_struct *p,
   struct cpumask *mask)
 {
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index cab5fd1..227bc25 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -2277,15 +2277,8 @@ static void cpuset_hotplug_update_tasks(struct cpuset 
*cs)
mutex_unlock(&cpuset_mutex);
 }
 
-static bool force_rebuild;
-
-void cpuset_force_rebuild(void)
-{
-   force_rebuild = true;
-}
-
 /**
- * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
+ * cpuset_hotplug - handle CPU/memory hotunplug for a cpuset
  *
  * This function is called after either CPU or memory configuration has
  * changed and updates cpuset accordingly.  The top_cpuset is always
@@ -2300,7 +2293,7 @@ void cpuset_force_rebuild(void)
  * Note that CPU offlining during suspend is ignored.  We don't modify
  * cpusets across suspend/resume cycles at all.
  */
-static void cpuset_hotplug_workfn(struct work_struct *work)
+static void cpuset_hotplug(bool use_cpu_hp_lock)
 {
static cpumask_t new_cpus;
static nodemask_t new_mems;
@@ -2358,25 +2351,31 @@ static void cpuset_hotplug_workfn(struct work_struct 
*work)
}
 
/* rebuild sched domains if cpus_allowed has changed */
-   if (cpus_updated || force_rebuild) {
-   force_rebuild = false;
-   rebuild_sched_domains();
+   if (cpus_updated) {
+   if (use_cpu_hp_lock)
+   rebuild_sched_domains();
+   else {
+   /* Acquiring cpu_hotplug_lock is not required.
+* When cpuset_hotplug() is called in hotplug path,
+* cpu_hotplug_lock is held by the hotplug context
+* which is waiting for cpuhp_thread_fun to indicate
+* completion of callback.
+*/
+   mutex_lock(&cpuset_mutex);
+   rebuild_sched_domains_cpuslocked();
+   mutex_unlock(&cpuset_mutex);
+   }
}
 }
 
-void cpuset_update_active_cpus(void)
+static void cpuset_hotplug_workfn(struct work_struct *work)
 {
-   /*
-* We're inside cpu hotplug critical region which usually nests
-* inside cgroup synchronization.  Bounce actual hotplug processing
-* to a work item to avoid reverse locking order.
-*/
-   schedule_work(&cpuset_hotplug_work);
+   cpuset_hotplug(true);
 }
 
-void cpuset_wait_for_hotplug(void)
+void cpuset_update_active_cpus(void)
 {
-   flush_work(&cpuset_hotplug_work);
+   cpuset_hotplug(false);
 }
 
 /*
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 7381d49..c326d72 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -204,8 +204,6 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
 
-   cpuset_wait_for_hotplug();
-
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b82a00..efcf753 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5600,7 +5600,6 @@ static void cpuset_cpu_active(void)
 * restore the original sched domains by consi

[PATCH v3 1/2] cgroup/cpuset: remove circular dependency deadlock

2017-11-15 Thread Prateek Sood
Remove a circular dependency deadlock in a scenario where a CPU is being
hotplugged while cgroup and cpuset updates are triggered from
userspace.

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
_cpu_down();
  percpu_down_write(&cpu_hotplug_lock); //held
  cpuhp_invoke_callback();
 workqueue_offline_cpu();
queue_work_on(); // unbind_work on system_highpri_wq
   __queue_work();
 insert_work();
wake_up_worker();
flush_work();
   wait_for_completion();

worker_thread();
   manage_workers();
  create_worker();
 kthread_create_on_node();
wake_up_process(kthreadd_task);

kthreadd
kthreadd();
  kernel_thread();
do_fork();
  copy_process();
percpu_down_read(&cgroup_threadgroup_rwsem);
  __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
cgroup_procs_write();
  percpu_down_write(&cgroup_threadgroup_rwsem); //held
  cgroup_attach_task();
cgroup_migrate();
  cgroup_migrate_execute();
cpuset_can_attach();
  mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
cpuset_write_resmask();
  mutex_lock(&cpuset_mutex); //held
  update_cpumask();
update_cpumasks_hier();
  rebuild_sched_domains_locked();
get_online_cpus();
  percpu_down_read(&cpu_hotplug_lock); //waiting

Eliminate the deadlock by reversing the locking order of cpuset_mutex and
cpu_hotplug_lock.

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 53 --
 1 file changed, 30 insertions(+), 23 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index f7efa7b..cab5fd1 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -812,6 +812,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
return ndoms;
 }
 
+static void cpuset_sched_change_begin(void)
+{
+   cpus_read_lock();
+   mutex_lock(&cpuset_mutex);
+}
+
+static void cpuset_sched_change_end(void)
+{
+   mutex_unlock(&cpuset_mutex);
+   cpus_read_unlock();
+}
+
 /*
  * Rebuild scheduler domains.
  *
@@ -821,16 +833,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
 
lockdep_assert_held(&cpuset_mutex);
-   get_online_cpus();
 
/*
 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -838,27 +848,25 @@ static void rebuild_sched_domains_locked(void)
 * Anyways, hotplug work item will rebuild sched domains.
 */
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-   goto out;
+   return;
 
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
 
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
-   put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
-   mutex_lock(&cpuset_mutex);
-   rebuild_sched_domains_locked();
-   mutex_unlock(&cpuset_mutex);
+   cpuset_sched_change_begin();
+   rebuild_sched_domains_cpuslocked();
+   cpuset_sched_change_end();
 }
 
 /**
@@ -944,7 +952,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
rcu_read_unlock();
 
if (need_rebuild_sched_domains)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
 }
 
 /**
@@ -1276,7 +1284,7 @@ static int update_relax_domain_level(struct cpuset *cs, 
s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
}
 
return 0;
@@ -1309,7 +1317,6 @@ static void update_tasks_flags(struct cpuset *cs)
  *
  * Call with cpuset_mutex held.
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
   int turning_on)
 {
@@ -1342,7 +1349,7 @@ static int update_flag(cpuset_flagbits_t bit, struct 
cpuset *cs,
spin_unlock_irq(&callback_lock);
 
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-

[PATCH v3 0/2] Invert cpu_hotplug_lock and cpuset_mutex locking order.

2017-11-15 Thread Prateek Sood
This patch series does the following:
1- Removes the circular dependency deadlock by inverting the order of
cpu_hotplug_lock and cpuset_mutex.
2- Makes cpuset_hotplug_workfn() synchronous for the cpu hotplug path.
For the memory hotplug path it still gets queued as a work item.

Prateek Sood (2):
  cgroup/cpuset: remove circular dependency deadlock
  cpuset: Make cpuset hotplug synchronous

 include/linux/cpuset.h |  6 
 kernel/cgroup/cpuset.c | 94 +++---
 kernel/power/process.c |  2 --
 kernel/sched/core.c|  1 -
 4 files changed, 50 insertions(+), 53 deletions(-)

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.





Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock

2017-11-15 Thread Prateek Sood
On 10/30/2017 12:46 PM, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.
> 
> Process A => kthreadd => Process B => Process C => Process A
> 
> Process A
> cpu_subsys_offline();
>   cpu_down();
> _cpu_down();
>   percpu_down_write(_hotplug_lock); //held
>   cpuhp_invoke_callback();
>workqueue_offline_cpu();
> queue_work_on(); // unbind_work on system_highpri_wq
>__queue_work();
>  insert_work();
> wake_up_worker();
> flush_work();
>wait_for_completion();
> 
> worker_thread();
>manage_workers();
>   create_worker();
>kthread_create_on_node();
>   wake_up_process(kthreadd_task);
> 
> kthreadd
> kthreadd();
>   kernel_thread();
> do_fork();
>   copy_process();
> percpu_down_read(_threadgroup_rwsem);
>   __rwsem_down_read_failed_common(); //waiting
> 
> Process B
> kernfs_fop_write();
>   cgroup_file_write();
> cgroup_procs_write();
>   percpu_down_write(_threadgroup_rwsem); //held
>   cgroup_attach_task();
> cgroup_migrate();
>   cgroup_migrate_execute();
> cpuset_can_attach();
>   mutex_lock(_mutex); //waiting
> 
> Process C
> kernfs_fop_write();
>   cgroup_file_write();
> cpuset_write_resmask();
>   mutex_lock(_mutex); //held
>   update_cpumask();
> update_cpumasks_hier();
>   rebuild_sched_domains_locked();
> get_online_cpus();
>   percpu_down_read(_hotplug_lock); //waiting
> 
> Eliminating deadlock by reversing the locking order for cpuset_mutex and
> cpu_hotplug_lock. After inverting the locking sequence of cpu_hotplug_lock
> and cpuset_mutex, cpuset_hotplug_workfn() related functionality can be
> done synchronously from the context doing cpu hotplug. For memory hotplug
> it still gets queued as a work item.
> 
> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
> ---
>  include/linux/cpuset.h |  6 
>  kernel/cgroup/cpuset.c | 94 
> +++---
>  kernel/power/process.c |  2 --
>  kernel/sched/core.c|  1 -
>  4 files changed, 50 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a1e6a33..e74655d 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
>  
>  extern int cpuset_init(void);
>  extern void cpuset_init_smp(void);
> -extern void cpuset_force_rebuild(void);
>  extern void cpuset_update_active_cpus(void);
> -extern void cpuset_wait_for_hotplug(void);
>  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
>  extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
>  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
> @@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
>  static inline int cpuset_init(void) { return 0; }
>  static inline void cpuset_init_smp(void) {}
>  
> -static inline void cpuset_force_rebuild(void) { }
> -
>  static inline void cpuset_update_active_cpus(void)
>  {
>   partition_sched_domains(1, NULL, NULL);
>  }
>  
> -static inline void cpuset_wait_for_hotplug(void) { }
> -
>  static inline void cpuset_cpus_allowed(struct task_struct *p,
>  struct cpumask *mask)
>  {
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 4657e29..ec44aaa 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -817,6 +817,18 @@ static int generate_sched_domains(cpumask_var_t 
> **domains,
>   return ndoms;
>  }
>  
> +static void cpuset_sched_change_begin(void)
> +{
> + cpus_read_lock();
> + mutex_lock(_mutex);
> +}
> +
> +static void cpuset_sched_change_end(void)
> +{
> + mutex_unlock(_mutex);
> + cpus_read_unlock();
> +}
> +
>  /*
>   * Rebuild scheduler domains.
>   *
> @@ -826,16 +838,14 @@ static int generate_sched_domains(cpumask_var_t 
> **domains,
>   * 'cpus' is removed, then call this routine to rebuild the
>   * scheduler's dynamic sched domains.
>   *
> - * Call with cpuset_mutex held.  Takes get_online_cpus().
>   */
> -static void rebuild_sched_domains_locked(void)
> +static void rebuild_sched_domains_cpuslocked(void)
>  {
>   struct sched_domain_attr *attr;
>   cpumask_var_t *doms;
>   i


Re: [PATCH v2] cgroup/cpuset: remove circular dependency deadlock

2017-11-05 Thread Prateek Sood
On 10/30/2017 12:46 PM, Prateek Sood wrote:
> Remove circular dependency deadlock in a scenario where hotplug of CPU is
> being done while there is updation in cgroup and cpuset triggered from
> userspace.
> 
> Process A => kthreadd => Process B => Process C => Process A
> 
> Process A
> cpu_subsys_offline();
>   cpu_down();
> _cpu_down();
>   percpu_down_write(_hotplug_lock); //held
>   cpuhp_invoke_callback();
>workqueue_offline_cpu();
> queue_work_on(); // unbind_work on system_highpri_wq
>__queue_work();
>  insert_work();
> wake_up_worker();
> flush_work();
>wait_for_completion();
> 
> worker_thread();
>manage_workers();
>   create_worker();
>kthread_create_on_node();
>   wake_up_process(kthreadd_task);
> 
> kthreadd
> kthreadd();
>   kernel_thread();
> do_fork();
>   copy_process();
> percpu_down_read(_threadgroup_rwsem);
>   __rwsem_down_read_failed_common(); //waiting
> 
> Process B
> kernfs_fop_write();
>   cgroup_file_write();
> cgroup_procs_write();
>   percpu_down_write(_threadgroup_rwsem); //held
>   cgroup_attach_task();
> cgroup_migrate();
>   cgroup_migrate_execute();
> cpuset_can_attach();
>   mutex_lock(_mutex); //waiting
> 
> Process C
> kernfs_fop_write();
>   cgroup_file_write();
> cpuset_write_resmask();
>   mutex_lock(_mutex); //held
>   update_cpumask();
> update_cpumasks_hier();
>   rebuild_sched_domains_locked();
> get_online_cpus();
>   percpu_down_read(_hotplug_lock); //waiting
> 
> Eliminating deadlock by reversing the locking order for cpuset_mutex and
> cpu_hotplug_lock. After inverting the locking sequence of cpu_hotplug_lock
> and cpuset_mutex, cpuset_hotplug_workfn() related functionality can be
> done synchronously from the context doing cpu hotplug. For memory hotplug
> it still gets queued as a work item.
> 
> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
> ---
>  include/linux/cpuset.h |  6 
>  kernel/cgroup/cpuset.c | 94 
> +++---
>  kernel/power/process.c |  2 --
>  kernel/sched/core.c|  1 -
>  4 files changed, 50 insertions(+), 53 deletions(-)
> 
> diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
> index a1e6a33..e74655d 100644
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
>  
>  extern int cpuset_init(void);
>  extern void cpuset_init_smp(void);
> -extern void cpuset_force_rebuild(void);
>  extern void cpuset_update_active_cpus(void);
> -extern void cpuset_wait_for_hotplug(void);
>  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
>  extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
>  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
> @@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
>  static inline int cpuset_init(void) { return 0; }
>  static inline void cpuset_init_smp(void) {}
>  
> -static inline void cpuset_force_rebuild(void) { }
> -
>  static inline void cpuset_update_active_cpus(void)
>  {
>   partition_sched_domains(1, NULL, NULL);
>  }
>  
> -static inline void cpuset_wait_for_hotplug(void) { }
> -
>  static inline void cpuset_cpus_allowed(struct task_struct *p,
>  struct cpumask *mask)
>  {
> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
> index 4657e29..ec44aaa 100644
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -817,6 +817,18 @@ static int generate_sched_domains(cpumask_var_t 
> **domains,
>   return ndoms;
>  }
>  
> +static void cpuset_sched_change_begin(void)
> +{
> + cpus_read_lock();
> + mutex_lock(_mutex);
> +}
> +
> +static void cpuset_sched_change_end(void)
> +{
> + mutex_unlock(_mutex);
> + cpus_read_unlock();
> +}
> +
>  /*
>   * Rebuild scheduler domains.
>   *
> @@ -826,16 +838,14 @@ static int generate_sched_domains(cpumask_var_t 
> **domains,
>   * 'cpus' is removed, then call this routine to rebuild the
>   * scheduler's dynamic sched domains.
>   *
> - * Call with cpuset_mutex held.  Takes get_online_cpus().
>   */
> -static void rebuild_sched_domains_locked(void)
> +static void rebuild_sched_domains_cpuslocked(void)
>  {
>   struct sched_domain_attr *attr;
>   cpumask_var_t *doms;
>   i


[PATCH v2] cgroup/cpuset: remove circular dependency deadlock

2017-10-30 Thread Prateek Sood
Remove a circular dependency deadlock in a scenario where hotplug of a CPU
is being done while an update of cgroup and cpuset triggered from userspace
is in progress.

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
_cpu_down();
  percpu_down_write(&cpu_hotplug_lock); //held
  cpuhp_invoke_callback();
 workqueue_offline_cpu();
queue_work_on(); // unbind_work on system_highpri_wq
   __queue_work();
 insert_work();
wake_up_worker();
flush_work();
   wait_for_completion();

worker_thread();
   manage_workers();
  create_worker();
 kthread_create_on_node();
wake_up_process(kthreadd_task);

kthreadd
kthreadd();
  kernel_thread();
do_fork();
  copy_process();
    percpu_down_read(&cgroup_threadgroup_rwsem);
  __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
cgroup_procs_write();
  percpu_down_write(&cgroup_threadgroup_rwsem); //held
  cgroup_attach_task();
cgroup_migrate();
  cgroup_migrate_execute();
cpuset_can_attach();
      mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
cpuset_write_resmask();
  mutex_lock(&cpuset_mutex); //held
  update_cpumask();
update_cpumasks_hier();
  rebuild_sched_domains_locked();
get_online_cpus();
      percpu_down_read(&cpu_hotplug_lock); //waiting

Eliminate the deadlock by reversing the locking order of cpuset_mutex and
cpu_hotplug_lock. After inverting the locking sequence of cpu_hotplug_lock
and cpuset_mutex, the cpuset_hotplug_workfn() related functionality can be
done synchronously from the context doing the cpu hotplug. For memory
hotplug it still gets queued as a work item.

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 include/linux/cpuset.h |  6 
 kernel/cgroup/cpuset.c | 94 +++---
 kernel/power/process.c |  2 --
 kernel/sched/core.c|  1 -
 4 files changed, 50 insertions(+), 53 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a1e6a33..e74655d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_force_rebuild(void) { }
-
 static inline void cpuset_update_active_cpus(void)
 {
partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void cpuset_cpus_allowed(struct task_struct *p,
   struct cpumask *mask)
 {
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e29..ec44aaa 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -817,6 +817,18 @@ static int generate_sched_domains(cpumask_var_t **domains,
return ndoms;
 }
 
+static void cpuset_sched_change_begin(void)
+{
+   cpus_read_lock();
+   mutex_lock(&cpuset_mutex);
+}
+
+static void cpuset_sched_change_end(void)
+{
+   mutex_unlock(&cpuset_mutex);
+   cpus_read_unlock();
+}
+
 /*
  * Rebuild scheduler domains.
  *
@@ -826,16 +838,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
 
	lockdep_assert_held(&cpuset_mutex);
-   get_online_cpus();
 
/*
 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -843,27 +853,25 @@ static void rebuild_sched_domains_locked(void)
 * Anyways, hotplug work item will rebuild sched domains.
 */
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-   goto out;
+   return;
 
/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);
 
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
-   put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(


Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-10-27 Thread Prateek Sood
On 10/26/2017 07:35 PM, Waiman Long wrote:
> On 10/26/2017 07:52 AM, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Process A => kthreadd => Process B => Process C => Process A
>>
>> Process A
>> cpu_subsys_offline();
>>   cpu_down();
>> _cpu_down();
>>   percpu_down_write(_hotplug_lock); //held
>>   cpuhp_invoke_callback();
>> workqueue_offline_cpu();
>>   wq_update_unbound_numa();
>> kthread_create_on_node();
>>   wake_up_process();  //wakeup kthreadd
>>   flush_work();
>>   wait_for_completion();
>>
>> kthreadd
>> kthreadd();
>>   kernel_thread();
>> do_fork();
>>   copy_process();
>> percpu_down_read(_threadgroup_rwsem);
>>   __rwsem_down_read_failed_common(); //waiting
>>
>> Process B
>> kernfs_fop_write();
>>   cgroup_file_write();
>> cgroup_procs_write();
>>   percpu_down_write(_threadgroup_rwsem); //held
>>   cgroup_attach_task();
>> cgroup_migrate();
>>   cgroup_migrate_execute();
>> cpuset_can_attach();
>>   mutex_lock(_mutex); //waiting
>>
>> Process C
>> kernfs_fop_write();
>>   cgroup_file_write();
>> cpuset_write_resmask();
>>   mutex_lock(_mutex); //held
>>   update_cpumask();
>> update_cpumasks_hier();
>>   rebuild_sched_domains_locked();
>> get_online_cpus();
>>   percpu_down_read(_hotplug_lock); //waiting
>>
>> Eliminating deadlock by reversing the locking order for cpuset_mutex and
>> cpu_hotplug_lock.
> 
> General comments:
> 
> Please add a version number of your patch. I have seen multiple versions
> of this patch and have lost track how many are there as there is no
> version number information.  In addition, there are changes beyond just
> swapping the lock order and they are not documented in this change log.
> I would like to see you discuss about those additional changes here as well.
Thanks for the comments, Longman. I will introduce patch versioning and update
the commit text to document the extra changes.

Explanation of the extra changes in this patch:
After inverting the locking sequence of cpu_hotplug_lock and cpuset_mutex,
the cpuset_hotplug_workfn() related functionality can be done synchronously
from the context doing the cpu hotplug. The extra changes in this patch remove
the queuing of cpuset_hotplug_workfn() as a work item for the cpu hotplug
path. For memory hotplug it still gets queued as a work item.


This suggestion came from Peter.
Peter, could you please elaborate if I have missed anything?
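
A rough sketch of that split (the function bodies below are an assumption
based on this thread, not the literal patch): the CPU hotplug path handles
the cpuset update synchronously, since cpu_hotplug_lock is already held
there, while the memory hotplug notifier keeps deferring to the work item.

void cpuset_update_active_cpus(void)
{
	/* CPU hotplug path: cpu_hotplug_lock is already held by the caller */
	cpuset_hotplug(false);		/* handle it synchronously, no work item */
}

static int cpuset_track_online_nodes(struct notifier_block *self,
				     unsigned long action, void *arg)
{
	/* Memory hotplug: still bounced to cpuset_hotplug_workfn() */
	schedule_work(&cpuset_hotplug_work);
	return NOTIFY_OK;
}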

> 
>>  void rebuild_sched_domains(void)
>>  {
>> +cpus_read_lock();
>>  mutex_lock(_mutex);
>> -rebuild_sched_domains_locked();
>> +rebuild_sched_domains_cpuslocked();
>>  mutex_unlock(_mutex);
>> +cpus_read_unlock();
>>  }
> 
> I saw a lot of instances where cpus_read_lock() and mutex_lock() come
> together. Maybe some new lock/unlock helper functions may help.
Ok, I will introduce a single pair of wrappers for locking and unlocking
both locks together.
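
Something along these lines, matching what the v3 patch in this thread ends
up adding; the point is that every sched-domain rebuild path then takes
cpu_hotplug_lock (read side) before cpuset_mutex:

static void cpuset_sched_change_begin(void)
{
	cpus_read_lock();		/* cpu_hotplug_lock, read side, taken first */
	mutex_lock(&cpuset_mutex);	/* cpuset_mutex nests inside it */
}

static void cpuset_sched_change_end(void)
{
	mutex_unlock(&cpuset_mutex);
	cpus_read_unlock();
}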
 
> 
>> @@ -2356,25 +2354,29 @@ static void cpuset_hotplug_workfn(struct work_struct 
>> *work)
>>  }
>>  
>>  /* rebuild sched domains if cpus_allowed has changed */
>> -if (cpus_updated || force_rebuild) {
>> -force_rebuild = false;
>> -rebuild_sched_domains();
>> +if (cpus_updated) {
>> +if (use_cpu_hp_lock)
>> +rebuild_sched_domains();
>> +else {
>> +/* When called during cpu hotplug cpu_hotplug_lock
>> + * is held by the calling thread, not
>> + * not cpuhp_thread_fun
>> + */
> 
> ??? The comment is not clear.

The following is the scenario described by the comment:

Process A
_cpu_down()
   cpus_write_lock() //cpu_hotplug_lock held
  cpuhp_kick_ap_work()
 cpuhp_kick_ap()
wake_up_process() // wake up cpuhp_thread_fun
wait_for_ap_thread() //wait for hotplug thread to signal completion


cpuhp_thread_fun()
   cpuhp_invoke_callback()
 sched_cpu_deactivate()
cpuset_cpu_inactive()
   cpuset_update_active_cpus()
     cpuset_hotplug(false) // do not use cpu_hotplug_lock from the _cpu_down() path

I will update the comment in the next version of the patch to elaborate more.
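
For reference, a sketch of how the hunk quoted above is meant to behave once
the comment is reworded (the else-branch body here is an assumption based on
the rest of this thread, not the exact patch text):

	/* rebuild sched domains if cpus_allowed has changed */
	if (cpus_updated) {
		if (use_cpu_hp_lock) {
			/* normal path: take cpu_hotplug_lock ourselves */
			rebuild_sched_domains();
		} else {
			/*
			 * CPU hotplug path: the thread driving _cpu_down()
			 * already holds cpu_hotplug_lock, so take only
			 * cpuset_mutex here.
			 */
			mutex_lock(&cpuset_mutex);
			rebuild_sched_domains_cpuslocked();
			mutex_unlock(&cpuset_mutex);
		}
	}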

> 
> Cheers,
> Longman
> 


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project




[PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-10-26 Thread Prateek Sood
Remove a circular dependency deadlock in a scenario where hotplug of a CPU
is being done while an update of cgroup and cpuset triggered from userspace
is in progress.

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
_cpu_down();
  percpu_down_write(&cpu_hotplug_lock); //held
  cpuhp_invoke_callback();
workqueue_offline_cpu();
  wq_update_unbound_numa();
kthread_create_on_node();
  wake_up_process();  //wakeup kthreadd
  flush_work();
  wait_for_completion();

kthreadd
kthreadd();
  kernel_thread();
do_fork();
  copy_process();
    percpu_down_read(&cgroup_threadgroup_rwsem);
  __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
cgroup_procs_write();
  percpu_down_write(&cgroup_threadgroup_rwsem); //held
  cgroup_attach_task();
cgroup_migrate();
  cgroup_migrate_execute();
cpuset_can_attach();
      mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
cpuset_write_resmask();
  mutex_lock(&cpuset_mutex); //held
  update_cpumask();
update_cpumasks_hier();
  rebuild_sched_domains_locked();
get_online_cpus();
      percpu_down_read(&cpu_hotplug_lock); //waiting

Eliminate the deadlock by reversing the locking order of cpuset_mutex and
cpu_hotplug_lock.

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 include/linux/cpuset.h |  6 -
 kernel/cgroup/cpuset.c | 70 ++
 kernel/power/process.c |  2 --
 kernel/sched/core.c|  1 -
 4 files changed, 36 insertions(+), 43 deletions(-)

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index a1e6a33..e74655d 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
 
 extern int cpuset_init(void);
 extern void cpuset_init_smp(void);
-extern void cpuset_force_rebuild(void);
 extern void cpuset_update_active_cpus(void);
-extern void cpuset_wait_for_hotplug(void);
 extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
 extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
 extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -166,15 +164,11 @@ static inline void set_mems_allowed(nodemask_t nodemask)
 static inline int cpuset_init(void) { return 0; }
 static inline void cpuset_init_smp(void) {}
 
-static inline void cpuset_force_rebuild(void) { }
-
 static inline void cpuset_update_active_cpus(void)
 {
partition_sched_domains(1, NULL, NULL);
 }
 
-static inline void cpuset_wait_for_hotplug(void) { }
-
 static inline void cpuset_cpus_allowed(struct task_struct *p,
   struct cpumask *mask)
 {
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 4657e29..a8213c2 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -826,16 +826,14 @@ static int generate_sched_domains(cpumask_var_t **domains,
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
 
	lockdep_assert_held(&cpuset_mutex);
-   get_online_cpus();
 
/*
 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -843,27 +841,27 @@ static void rebuild_sched_domains_locked(void)
 * Anyways, hotplug work item will rebuild sched domains.
 */
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-   goto out;
+   return;
 
/* Generate domain masks and attrs */
	ndoms = generate_sched_domains(&doms, &attr);
 
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
-   put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
+   cpus_read_lock();
 	mutex_lock(&cpuset_mutex);
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
 	mutex_unlock(&cpuset_mutex);
+   cpus_read_unlock();
 }
 
 /**
@@ -949,7 +947,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
rcu_read_unlock();
 
if (need_rebuild_sched_domains)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
 }
 
 /**
@@ -1281,7 +1279,7 @@ static int update_relax_domain_level(struct cpuset *cs, 
s64 val)
cs->relax_domain_level = val;
if (!cpumask_e


Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-10-25 Thread Prateek Sood
On 10/11/2017 03:18 PM, Peter Zijlstra wrote:
> On Mon, Oct 09, 2017 at 06:57:46PM +0530, Prateek Sood wrote:
>> On 09/07/2017 11:21 PM, Peter Zijlstra wrote:
> 
>>> But if you invert these locks, the need for cpuset_hotplug_workfn() goes
>>> away, at least for the CPU part, and we can make in synchronous again.
>>> Yay!!
> 
>> The callback making a call to cpuset_hotplug_workfn()in hotplug path are
>> [CPUHP_AP_ACTIVE] = {
>> .name   = "sched:active",
>> .startup.single = sched_cpu_activate,
>> .teardown.single= sched_cpu_deactivate,
>> },
>>
>> if we make cpuset_hotplug_workfn() synchronous, deadlock might happen:
>> _cpu_down()
>>cpus_write_lock()  //held
>>   cpuhp_kick_ap_work()
>> cpuhp_kick_ap()
>>__cpuhp_kick_ap()
>>   wake_up_process() //cpuhp_thread_fun
>> wait_for_ap_thread() //wait for complete from 
>> cpuhp_thread_fun()
>>
>> cpuhp_thread_fun()
>>cpuhp_invoke_callback()
>>  sched_cpu_deactivate()
>>cpuset_cpu_inactive()
>>   cpuset_update_active_cpus()
>>  cpuset_hotplug_work()
>> rebuild_sched_domains()
>>cpus_read_lock() //waiting as acquired in _cpu_down()
> 
> Well, duh, don't use rebuild_sched_domains() 'obviously' :-) use
> rebuild_sched_domains_cpuslocked() instead and it works just fine.
> 
> After applying your patch, the below boots and survives a hotplug.
> 
> ---
>  include/linux/cpuset.h |6 --
>  kernel/cgroup/cpuset.c |   30 +-
>  kernel/power/process.c |2 --
>  kernel/sched/core.c|1 -
>  4 files changed, 9 insertions(+), 30 deletions(-)
> 
> --- a/include/linux/cpuset.h
> +++ b/include/linux/cpuset.h
> @@ -51,9 +51,7 @@ static inline void cpuset_dec(void)
>  
>  extern int cpuset_init(void);
>  extern void cpuset_init_smp(void);
> -extern void cpuset_force_rebuild(void);
>  extern void cpuset_update_active_cpus(void);
> -extern void cpuset_wait_for_hotplug(void);
>  extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
>  extern void cpuset_cpus_allowed_fallback(struct task_struct *p);
>  extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
> @@ -166,15 +164,11 @@ static inline bool cpusets_enabled(void)
>  static inline int cpuset_init(void) { return 0; }
>  static inline void cpuset_init_smp(void) {}
>  
> -static inline void cpuset_force_rebuild(void) { }
> -
>  static inline void cpuset_update_active_cpus(void)
>  {
>   partition_sched_domains(1, NULL, NULL);
>  }
>  
> -static inline void cpuset_wait_for_hotplug(void) { }
> -
>  static inline void cpuset_cpus_allowed(struct task_struct *p,
>  struct cpumask *mask)
>  {
> --- a/kernel/cgroup/cpuset.c
> +++ b/kernel/cgroup/cpuset.c
> @@ -833,7 +833,12 @@ static void rebuild_sched_domains_cpuslo
>   cpumask_var_t *doms;
>   int ndoms;
>  
> + /*
> +  * When called during hotplug, this lock is held by the calling
> +  * thread, not cpuhp_thread_fun :/
> +  *
>   lockdep_assert_cpus_held();
> +  */
>   lockdep_assert_held(_mutex);
>  
>   /*
> @@ -2281,13 +2286,6 @@ static void cpuset_hotplug_update_tasks(
>   mutex_unlock(_mutex);
>  }
>  
> -static bool force_rebuild;
> -
> -void cpuset_force_rebuild(void)
> -{
> - force_rebuild = true;
> -}
> -
>  /**
>   * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
>   *
> @@ -2362,25 +2360,15 @@ static void cpuset_hotplug_workfn(struct
>   }
>  
>   /* rebuild sched domains if cpus_allowed has changed */
> - if (cpus_updated || force_rebuild) {
> - force_rebuild = false;
> + if (cpus_updated)
>   rebuild_sched_domains();
> - }
>  }
>  
>  void cpuset_update_active_cpus(void)
>  {
> - /*
> -  * We're inside cpu hotplug critical region which usually nests
> -  * inside cgroup synchronization.  Bounce actual hotplug processing
> -  * to a work item to avoid reverse locking order.
> -  */
> - schedule_work(_hotplug_work);
> -}
> -
> -void cpuset_wait_for_hotplug(void)
> -{
> - flush_work(_hotplug_work);
> + mutex_lock(_mutex);
> + rebuild_sched_domains_cpuslocked();
> + mutex_unlock(_mutex);
>  }
>  
>  /*
> --- a/kernel/power/process.c
> +++ b/kernel/power/process.c
> @@ -203,8 +203,6 @@ void thaw


Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-10-09 Thread Prateek Sood
On 09/07/2017 11:21 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
> 
> You've forgotten to mention your solution to the deadlock, namely
> inverting cpuset_mutex and cpu_hotplug_lock.
> 
>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
>> ---
>>  kernel/cgroup/cpuset.c | 32 +++-
>>  1 file changed, 19 insertions(+), 13 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 2f4039b..60dc0ac 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t 
>> **domains,
>>   * 'cpus' is removed, then call this routine to rebuild the
>>   * scheduler's dynamic sched domains.
>>   *
>> - * Call with cpuset_mutex held.  Takes get_online_cpus().
>>   */
>> -static void rebuild_sched_domains_locked(void)
>> +static void rebuild_sched_domains_cpuslocked(void)
>>  {
>>  struct sched_domain_attr *attr;
>>  cpumask_var_t *doms;
>>  int ndoms;
>>  
>> +lockdep_assert_cpus_held();
>>  lockdep_assert_held(_mutex);
>> -get_online_cpus();
>>  
>>  /*
>>   * We have raced with CPU hotplug. Don't do anything to avoid
>> @@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void)
>>   * Anyways, hotplug work item will rebuild sched domains.
>>   */
>>  if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
>> -goto out;
>> +return;
>>  
>>  /* Generate domain masks and attrs */
>>  ndoms = generate_sched_domains(, );
>>  
>>  /* Have scheduler rebuild the domains */
>>  partition_sched_domains(ndoms, doms, attr);
>> -out:
>> -put_online_cpus();
>>  }
>>  #else /* !CONFIG_SMP */
>> -static void rebuild_sched_domains_locked(void)
>> +static void rebuild_sched_domains_cpuslocked(void)
>>  {
>>  }
>>  #endif /* CONFIG_SMP */
>>  
>>  void rebuild_sched_domains(void)
>>  {
>> +get_online_cpus();
>>  mutex_lock(_mutex);
>> -rebuild_sched_domains_locked();
>> +rebuild_sched_domains_cpuslocked();
>>  mutex_unlock(_mutex);
>> +put_online_cpus();
>>  }
> 
> But if you invert these locks, the need for cpuset_hotplug_workfn() goes
> away, at least for the CPU part, and we can make in synchronous again.
> Yay!!
> 
> Also, I think new code should use cpus_read_lock() instead of
> get_online_cpus().
> 

Thanks for the review comments Peter.
For patch related to circular deadlock, I will send an updated version.

The callback making a call to cpuset_hotplug_workfn() in the hotplug path is:
[CPUHP_AP_ACTIVE] = {
.name   = "sched:active",
.startup.single = sched_cpu_activate,
.teardown.single= sched_cpu_deactivate,
},

If we make cpuset_hotplug_workfn() synchronous, a deadlock might happen:
_cpu_down()
   cpus_write_lock()  //held
  cpuhp_kick_ap_work()
cpuhp_kick_ap()
   __cpuhp_kick_ap()
  wake_up_process() //cpuhp_thread_fun
wait_for_ap_thread() //wait for complete from cpuhp_thread_fun()

cpuhp_thread_fun()
   cpuhp_invoke_callback()
 sched_cpu_deactivate()
   cpuset_cpu_inactive()
  cpuset_update_active_cpus()
 cpuset_hotplug_work()
rebuild_sched_domains()
   cpus_read_lock() //waiting as acquired in _cpu_down()
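
For illustration, the same shape can be reproduced in userspace with plain
pthreads (a sketch, not the kernel primitives; hotplug_lock and
hotplug_thread() are made-up names): the caller takes the write side of a
rwlock and then waits, still holding it, for a thread that has to take the
read side before it can complete. In the kernel the corresponding wait has no
timeout, so the real scenario simply hangs.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Stand-in for cpuhp_thread_fun(): it must take the read side before it can finish. */
static void *hotplug_thread(void *arg)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 2;	/* give up after 2s instead of hanging forever */

	if (pthread_rwlock_timedrdlock(&hotplug_lock, &deadline) == ETIMEDOUT) {
		printf("hotplug_thread: read lock unavailable -> deadlock, the writer is waiting for us\n");
		return NULL;
	}

	/* This is where the sched-domain rebuild would run. */
	pthread_rwlock_unlock(&hotplug_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_rwlock_wrlock(&hotplug_lock);	/* like cpus_write_lock() in _cpu_down() */
	pthread_create(&t, NULL, hotplug_thread, NULL);
	pthread_join(t, NULL);			/* like wait_for_ap_thread(): wait while still holding it */
	pthread_rwlock_unlock(&hotplug_lock);
	return 0;
}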
  

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[tip:locking/urgent] locking/rwsem-xadd: Fix missed wakeup due to reordering of load

2017-09-29 Thread tip-bot for Prateek Sood
Commit-ID:  9c29c31830a4eca724e137a9339137204bbb31be
Gitweb: https://git.kernel.org/tip/9c29c31830a4eca724e137a9339137204bbb31be
Author: Prateek Sood <prs...@codeaurora.org>
AuthorDate: Thu, 7 Sep 2017 20:00:58 +0530
Committer:  Ingo Molnar <mi...@kernel.org>
CommitDate: Fri, 29 Sep 2017 10:10:20 +0200

locking/rwsem-xadd: Fix missed wakeup due to reordering of load

If a spinner is present, there is a chance that the load of
rwsem_has_spinner() in rwsem_wake() can be reordered with
respect to decrement of rwsem count in __up_write() leading
to wakeup being missed:

 spinning writer  up_write caller
 ---  ---
 [S] osq_unlock() [L] osq
  spin_lock(wait_lock)
  sem->count=0x0001
+0x
  count=sem->count
  MB
   sem->count=0xFFFE0001
 -0x0001
   spin_trylock(wait_lock)
   return
 rwsem_try_write_lock(count)
 spin_unlock(wait_lock)
 schedule()

Reordering of atomic_long_sub_return_release() in __up_write()
and rwsem_has_spinner() in rwsem_wake() can cause missing of
wakeup in up_write() context. In spinning writer, sem->count
and local variable count is 0XFFFE0001. It would result
in rwsem_try_write_lock() failing to acquire rwsem and spinning
writer going to sleep in rwsem_down_write_failed().

The smp_rmb() will make sure that the spinner state is
consulted after sem->count is updated in up_write context.
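
As a userspace analogy, the ordering requirement can be encoded with C11
atomics (a sketch with invented names X, Y, spinner() and waker(); not the
kernel code): Y stands for sem->count, X for "the spinner has released the
OSQ", and the acquire fence plays the role of the smp_rmb() added below. With
the fence, the outcome r0 == 1 && r1 == 0 is forbidden; drop the fence and it
becomes legal, even though a strongly ordered machine may never exhibit it.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int X;   /* "spinner has released the OSQ" (sem->osq)   */
static atomic_int Y;   /* "rwsem count has been updated" (sem->count) */
static int r0, r1;     /* written by waker(), read back after pthread_join() */

/* Models the spinning writer: osq_unlock(), then the full barrier implied by add_return(). */
static void *spinner(void *arg)
{
	atomic_store_explicit(&X, 1, memory_order_release);	/* [S.rel] X = 1 */
	atomic_fetch_add(&Y, 1);				/* [RmW] Y += 1, seq_cst */
	return NULL;
}

/* Models __up_write()/rwsem_wake(): release RmW on the count, then the new smp_rmb(). */
static void *waker(void *arg)
{
	r0 = atomic_fetch_add_explicit(&Y, 0, memory_order_release);	/* [RmW] r0 = (Y += 0) */
	atomic_thread_fence(memory_order_acquire);			/* the added smp_rmb() */
	r1 = atomic_load_explicit(&X, memory_order_relaxed);		/* [L] r1 = X, rwsem_has_spinner() */
	return NULL;
}

int main(void)
{
	for (int i = 0; i < 100000; i++) {
		pthread_t a, b;

		atomic_store(&X, 0);
		atomic_store(&Y, 0);
		pthread_create(&a, NULL, spinner, NULL);
		pthread_create(&b, NULL, waker, NULL);
		pthread_join(a, NULL);
		pthread_join(b, NULL);
		if (r0 == 1 && r1 == 0) {
			printf("forbidden outcome seen at iteration %d\n", i);
			return 1;
		}
	}
	printf("r0 == 1 && r1 == 0 never observed, as the fence requires\n");
	return 0;
}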

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Thomas Gleixner <t...@linutronix.de>
Cc: d...@stgolabs.net
Cc: long...@redhat.com
Cc: parri.and...@gmail.com
Cc: sram...@codeaurora.org
Link: 
http://lkml.kernel.org/r/1504794658-15397-1-git-send-email-prs...@codeaurora.org
Signed-off-by: Ingo Molnar <mi...@kernel.org>
---
 kernel/locking/rwsem-xadd.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f6606..1fefe6d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
 
/*
+   * __rwsem_down_write_failed_common(sem)
+   *   rwsem_optimistic_spin(sem)
+   * osq_unlock(sem->osq)
+   *   ...
+   *   atomic_long_add_return(>count)
+   *
+   *  - VS -
+   *
+   *  __up_write()
+   *if (atomic_long_sub_return_release(>count) < 0)
+   *  rwsem_wake(sem)
+   *osq_is_locked(>osq)
+   *
+   * And __up_write() must observe !osq_is_locked() when it observes the
+   * atomic_long_add_return() in order to not miss a wakeup.
+   *
+   * This boils down to:
+   *
+   * [S.rel] X = 1[RmW] r0 = (Y += 0)
+   * MB RMB
+   * [RmW]   Y += 1   [L]   r1 = X
+   *
+   * exists (r0=1 /\ r1=0)
+   */
+   smp_rmb();
+
+   /*
 * If a spinner is present, it is not necessary to do the wakeup.
 * Try to do wakeup only if the trylock succeeds to minimize
 * spinlock contention which may introduce too much delay in the


Re: [PATCH] rwsem: fix missed wakeup due to reordering of load

2017-09-26 Thread Prateek Sood
On 09/07/2017 08:00 PM, Prateek Sood wrote:
> If a spinner is present, there is a chance that the load of
> rwsem_has_spinner() in rwsem_wake() can be reordered with
> respect to decrement of rwsem count in __up_write() leading
> to wakeup being missed.
> 
>  spinning writer  up_write caller
>  ---  ---
>  [S] osq_unlock() [L] osq
>   spin_lock(wait_lock)
>   sem->count=0x0001
> +0x
>   count=sem->count
>   MB
>sem->count=0xFFFE0001
>  -0x0001
>spin_trylock(wait_lock)
>return
>  rwsem_try_write_lock(count)
>  spin_unlock(wait_lock)
>  schedule()
> 
> Reordering of atomic_long_sub_return_release() in __up_write()
> and rwsem_has_spinner() in rwsem_wake() can cause missing of
> wakeup in up_write() context. In spinning writer, sem->count
> and local variable count is 0XFFFE0001. It would result
> in rwsem_try_write_lock() failing to acquire rwsem and spinning
> writer going to sleep in rwsem_down_write_failed().
> 
> The smp_rmb() will make sure that the spinner state is
> consulted after sem->count is updated in up_write context.
> 
> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
> ---
>  kernel/locking/rwsem-xadd.c | 27 +++
>  1 file changed, 27 insertions(+)
> 
> diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
> index 02f6606..1fefe6d 100644
> --- a/kernel/locking/rwsem-xadd.c
> +++ b/kernel/locking/rwsem-xadd.c
> @@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
>   DEFINE_WAKE_Q(wake_q);
>  
>   /*
> + * __rwsem_down_write_failed_common(sem)
> + *   rwsem_optimistic_spin(sem)
> + * osq_unlock(sem->osq)
> + *   ...
> + *   atomic_long_add_return(>count)
> + *
> + *  - VS -
> + *
> + *  __up_write()
> + *if (atomic_long_sub_return_release(>count) < 0)
> + *  rwsem_wake(sem)
> + *osq_is_locked(>osq)
> + *
> + * And __up_write() must observe !osq_is_locked() when it observes the
> + * atomic_long_add_return() in order to not miss a wakeup.
> + *
> + * This boils down to:
> + *
> + * [S.rel] X = 1[RmW] r0 = (Y += 0)
> + * MB RMB
> + * [RmW]   Y += 1   [L]   r1 = X
> + *
> + * exists (r0=1 /\ r1=0)
> + */
> + smp_rmb();
> +
> + /*
>* If a spinner is present, it is not necessary to do the wakeup.
>* Try to do wakeup only if the trylock succeeds to minimize
>* spinlock contention which may introduce too much delay in the
> 

Hi Folks,

Do you have any more suggestions/feedback on this patch?


Regards
Prateek

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-07 Thread Prateek Sood
On 09/07/2017 11:15 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 07:26:23PM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Process A => kthreadd => Process B => Process C => Process A
> 
>> Process A
>> cpu_subsys_offline();
>>   cpu_down();
>> _cpu_down();
>>   percpu_down_write(_hotplug_lock); //held
>>   cpuhp_invoke_callback();
>> workqueue_offline_cpu();
>>   wq_update_unbound_numa();
>> kthread_create_on_node();
>>   wake_up_process();  //wakeup kthreadd
> 
> TJ, I'm puzzled, why would we need to spawn new threads to update NUMA
> affinity when taking a CPU out? That doesn't make sense to me, we can
> either shrink the affinity of an existing thread or completely kill of a
> thread if the mask becomes empty. But why spawn a new thread?
> 
>>   flush_work();
>>   wait_for_completion();
>>

> 
> Yes, inverting cpuset and hotplug would break that chain, but I'm still
> wondering why workqueue needs to spawn threads on CPU down.
> 

Thanks for the comments Peter

You rightly mentioned that a new thread will not be spawned
while updating NUMA affinity when taking a CPU out.

While a CPU is being made offline, an attempt is made to unbind the
per-cpu worker for the CPU going down. This is done by queueing the
unbind work on system_highpri_wq, which results in an attempt to
create one bound worker thread as none exists.

wait_for_completion() in flush_work() waits for that unbinding to
finish for the CPU going down.

Process A
cpu_subsys_offline();
   cpu_down();
 _cpu_down();
   percpu_down_write(&cpu_hotplug_lock); //held
   cpuhp_invoke_callback();
 workqueue_offline_cpu();
   queue_work_on(system_highpri_wq);
 __queue_work();
   insert_work();
 wake_up_worker(); //pool->nr_running = 0
   flush_work();
   wait_for_completion();
   
   
worker_thread();
  need_more_worker(); // returns true
  manage_workers();
maybe_create_worker();
  create_worker();
kthread_create_on_node();
  wake_up_process(kthreadd_task);



-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH] rwsem: fix missed wakeup due to reordering of load

2017-09-07 Thread Prateek Sood
If a spinner is present, there is a chance that the load of
rwsem_has_spinner() in rwsem_wake() can be reordered with
respect to decrement of rwsem count in __up_write() leading
to wakeup being missed.

 spinning writer  up_write caller
 ---  ---
 [S] osq_unlock() [L] osq
  spin_lock(wait_lock)
  sem->count=0x0001
+0x
  count=sem->count
  MB
   sem->count=0xFFFE0001
 -0x0001
   spin_trylock(wait_lock)
   return
 rwsem_try_write_lock(count)
 spin_unlock(wait_lock)
 schedule()

Reordering of atomic_long_sub_return_release() in __up_write()
and rwsem_has_spinner() in rwsem_wake() can cause missing of
wakeup in up_write() context. In spinning writer, sem->count
and local variable count is 0XFFFE0001. It would result
in rwsem_try_write_lock() failing to acquire rwsem and spinning
writer going to sleep in rwsem_down_write_failed().

The smp_rmb() will make sure that the spinner state is
consulted after sem->count is updated in up_write context.

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 kernel/locking/rwsem-xadd.c | 27 +++
 1 file changed, 27 insertions(+)

diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index 02f6606..1fefe6d 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
DEFINE_WAKE_Q(wake_q);
 
/*
+   * __rwsem_down_write_failed_common(sem)
+   *   rwsem_optimistic_spin(sem)
+   * osq_unlock(sem->osq)
+   *   ...
+   *   atomic_long_add_return(>count)
+   *
+   *  - VS -
+   *
+   *  __up_write()
+   *if (atomic_long_sub_return_release(>count) < 0)
+   *  rwsem_wake(sem)
+   *osq_is_locked(>osq)
+   *
+   * And __up_write() must observe !osq_is_locked() when it observes the
+   * atomic_long_add_return() in order to not miss a wakeup.
+   *
+   * This boils down to:
+   *
+   * [S.rel] X = 1[RmW] r0 = (Y += 0)
+   * MB RMB
+   * [RmW]   Y += 1   [L]   r1 = X
+   *
+   * exists (r0=1 /\ r1=0)
+   */
+   smp_rmb();
+
+   /*
 * If a spinner is present, it is not necessary to do the wakeup.
 * Try to do wakeup only if the trylock succeeds to minimize
 * spinlock contention which may introduce too much delay in the
-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation Center, Inc., 
is a member of Code Aurora Forum, a Linux Foundation Collaborative Project.



Re: [PATCH] rwsem: fix missed wakeup due to reordering of load

2017-09-07 Thread Prateek Sood
On 08/24/2017 06:22 PM, Peter Zijlstra wrote:
> On Thu, Aug 24, 2017 at 02:33:04PM +0200, Peter Zijlstra wrote:
>> On Thu, Aug 24, 2017 at 01:29:27PM +0200, Peter Zijlstra wrote:
>>>
>>> WTH did you not Cc the people that commented on your patch last time?
>>>
>>> On Wed, Aug 23, 2017 at 04:58:55PM +0530, Prateek Sood wrote:
>>>> If a spinner is present, there is a chance that the load of
>>>> rwsem_has_spinner() in rwsem_wake() can be reordered with
>>>> respect to decrement of rwsem count in __up_write() leading
>>>> to wakeup being missed.
>>>
>>>>  spinning writer  up_write caller
>>>>  ---  ---
>>>>  [S] osq_unlock() [L] osq
>>>>   spin_lock(wait_lock)
>>>>   sem->count=0x0001
>>>> +0x
>>>>   count=sem->count
>>>>   MB
>>>>sem->count=0xFFFE0001
>>>>  -0x0001
>>>>RMB
>>>
>>> This doesn't make sense, it appears to order a STORE against something
>>> else.
>>>
>>>>spin_trylock(wait_lock)
>>>>return
>>>>  rwsem_try_write_lock(count)
>>>>  spin_unlock(wait_lock)
>>>>  schedule()
>>
>> Is this what you wanted to write?
> 
> And ideally there should be a comment near the atomic_long_add_return()
> in __rwsem_down_write_failed_common() to indicate we rely on the implied
> smp_mb() before it -- just in case someone goes and makes it
> atomic_long_add_return_relaxed().
> 
> And I suppose someone should look at the waiting branch of that thing
> too.. because I'm not sure what happens if waiting is true but count
> isn't big enough.
> 
> I bloody hate the rwsem code, that BIAS stuff forever confuses me. I
> have a start at rewriting the thing to put the owner in the lock word
> just like we now do for mutex, but never seem to get around to finishing
> it.
> 
>> ---
>>  kernel/locking/rwsem-xadd.c | 27 +++
>>  1 file changed, 27 insertions(+)
>>
>> diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
>> index 02f660666ab8..813b5d3654ce 100644
>> --- a/kernel/locking/rwsem-xadd.c
>> +++ b/kernel/locking/rwsem-xadd.c
>> @@ -613,6 +613,33 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore 
>> *sem)
>>  DEFINE_WAKE_Q(wake_q);
>>  
>>  /*
>> + * __rwsem_down_write_failed_common(sem)
>> + *   rwsem_optimistic_spin(sem)
>> + * osq_unlock(sem->osq)
>> + *   ...
>> + *   atomic_long_add_return(>count)
>> + *
>> + *  - VS -
>> + *
>> + *  __up_write()
>> + *if 
>> (atomic_long_sub_return_release(>count) < 0)
>> + *  rwsem_wake(sem)
>> + *osq_is_locked(>osq)
>> + *
>> + * And __up_write() must observe !osq_is_locked() when it observes the
>> + * atomic_long_add_return() in order to not miss a wakeup.
>> + *
>> + * This boils down to:
>> + *
>> + * [S.rel] X = 1[RmW] r0 = (Y += 0)
>> + * MB RMB
>> + * [RmW]   Y += 1   [L]   r1 = X
>> + *
>> + * exists (r0=1 /\ r1=0)
>> + */
>> +smp_rmb();
>> +
>> +/*
>>   * If a spinner is present, it is not necessary to do the wakeup.
>>   * Try to do wakeup only if the trylock succeeds to minimize
>>   * spinlock contention which may introduce too much delay in the

Thanks Peter for your suggestions on the comments.
I will resend the patch with updated comments.

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-07 Thread Prateek Sood
Remove a circular dependency deadlock in a scenario where CPU hotplug is
being done while cgroup and cpuset updates are triggered from
userspace.

Process A => kthreadd => Process B => Process C => Process A

Process A
cpu_subsys_offline();
  cpu_down();
_cpu_down();
  percpu_down_write(&cpu_hotplug_lock); //held
  cpuhp_invoke_callback();
workqueue_offline_cpu();
  wq_update_unbound_numa();
kthread_create_on_node();
  wake_up_process();  //wakeup kthreadd
  flush_work();
  wait_for_completion();

kthreadd
kthreadd();
  kernel_thread();
do_fork();
  copy_process();
percpu_down_read(&cgroup_threadgroup_rwsem);
  __rwsem_down_read_failed_common(); //waiting

Process B
kernfs_fop_write();
  cgroup_file_write();
cgroup_procs_write();
  percpu_down_write(&cgroup_threadgroup_rwsem); //held
  cgroup_attach_task();
cgroup_migrate();
  cgroup_migrate_execute();
cpuset_can_attach();
  mutex_lock(&cpuset_mutex); //waiting

Process C
kernfs_fop_write();
  cgroup_file_write();
cpuset_write_resmask();
  mutex_lock(&cpuset_mutex); //held
  update_cpumask();
update_cpumasks_hier();
  rebuild_sched_domains_locked();
get_online_cpus();
  percpu_down_read(&cpu_hotplug_lock); //waiting
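
For illustration, the cycle above can be collapsed into a userspace sketch
with three plain mutexes standing in for cpu_hotplug_lock,
cgroup_threadgroup_rwsem and cpuset_mutex (the kthreadd/completion hop is
folded into the first edge, and all names below are made up): each thread
takes its own lock, waits at a barrier so the cycle is guaranteed to form,
and then times out trying to take the next lock in the ring.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t hotplug     = PTHREAD_MUTEX_INITIALIZER; /* cpu_hotplug_lock         */
static pthread_mutex_t threadgroup = PTHREAD_MUTEX_INITIALIZER; /* cgroup_threadgroup_rwsem */
static pthread_mutex_t cpuset      = PTHREAD_MUTEX_INITIALIZER; /* cpuset_mutex             */
static pthread_barrier_t holding;                               /* everyone holds its first lock */

static void hold_then_wait_for(const char *who, pthread_mutex_t *mine, pthread_mutex_t *next)
{
	struct timespec deadline;

	pthread_mutex_lock(mine);
	pthread_barrier_wait(&holding);		/* the cycle is now guaranteed to form */

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += 2;			/* detect instead of hanging forever */
	if (pthread_mutex_timedlock(next, &deadline) == ETIMEDOUT)
		printf("%s: still waiting after 2s -> circular wait\n", who);
	else
		pthread_mutex_unlock(next);

	pthread_mutex_unlock(mine);
}

static void *proc_a(void *arg) { hold_then_wait_for("A (hotplug)", &hotplug, &threadgroup); return NULL; }
static void *proc_b(void *arg) { hold_then_wait_for("B (cgroup write)", &threadgroup, &cpuset); return NULL; }
static void *proc_c(void *arg) { hold_then_wait_for("C (cpuset write)", &cpuset, &hotplug); return NULL; }

int main(void)
{
	pthread_t a, b, c;

	pthread_barrier_init(&holding, NULL, 3);
	pthread_create(&a, NULL, proc_a, NULL);
	pthread_create(&b, NULL, proc_b, NULL);
	pthread_create(&c, NULL, proc_c, NULL);
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	pthread_join(c, NULL);
	pthread_barrier_destroy(&holding);
	return 0;
}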

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 32 +++-
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..60dc0ac 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -816,16 +816,15 @@ static int generate_sched_domains(cpumask_var_t **domains,
  * 'cpus' is removed, then call this routine to rebuild the
  * scheduler's dynamic sched domains.
  *
- * Call with cpuset_mutex held.  Takes get_online_cpus().
  */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
struct sched_domain_attr *attr;
cpumask_var_t *doms;
int ndoms;
 
+   lockdep_assert_cpus_held();
lockdep_assert_held(_mutex);
-   get_online_cpus();
 
/*
 * We have raced with CPU hotplug. Don't do anything to avoid
@@ -833,27 +832,27 @@ static void rebuild_sched_domains_locked(void)
 * Anyways, hotplug work item will rebuild sched domains.
 */
if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
-   goto out;
+   return;
 
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(, );
 
/* Have scheduler rebuild the domains */
partition_sched_domains(ndoms, doms, attr);
-out:
-   put_online_cpus();
 }
 #else /* !CONFIG_SMP */
-static void rebuild_sched_domains_locked(void)
+static void rebuild_sched_domains_cpuslocked(void)
 {
 }
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
 {
+   get_online_cpus();
mutex_lock(_mutex);
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
mutex_unlock(_mutex);
+   put_online_cpus();
 }
 
 /**
@@ -940,7 +939,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
rcu_read_unlock();
 
if (need_rebuild_sched_domains)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
 }
 
 /**
@@ -1273,7 +1272,7 @@ static int update_relax_domain_level(struct cpuset *cs, 
s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
}
 
return 0;
@@ -1306,7 +1305,6 @@ static void update_tasks_flags(struct cpuset *cs)
  *
  * Call with cpuset_mutex held.
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
   int turning_on)
 {
@@ -1339,7 +1337,7 @@ static int update_flag(cpuset_flagbits_t bit, struct 
cpuset *cs,
spin_unlock_irq(_lock);
 
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_cpuslocked();
 
if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1607,6 +1605,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state 
*css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
 
+   get_online_cpus();
mutex_lock(_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1644,6 +1643,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state 
*css, struct cftype *cft,
}
 out_unlock:
mutex_unlock(_mutex);
+   put_online_cpus();


Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-07 Thread Prateek Sood
On 09/07/2017 02:26 PM, Boqun Feng wrote:
> On Thu, Sep 07, 2017 at 09:28:48AM +0200, Peter Zijlstra wrote:
>> On Thu, Sep 07, 2017 at 11:34:12AM +0530, Prateek Sood wrote:
>>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>>> being done while there is updation in cgroup and cpuset triggered from
>>> userspace.
>>>
>>> Example scenario:
>>> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>>>
>>> kworker/0:0 - percpu_down_write(_hotplug_lock)  [held]
>>>   flush(work)   [no high prio workqueue available on CPU]
>>>   wait_for_completion()
> 
> Hi Prateek,
> 
> so this is:
> 
>   _cpu_down():
> cpus_write_lock(); // percpu_down_write(_hotlug_lock)
> cpuhp_invoke_callbacks():
>   workqueue_offine_cpu():
> wq_update_unbound_numa():
>   alloc_unbound_pool():
> get_unbound_pool():
>   create_worker():
> kthread_create_on_node():
>   wake_up_process(kthreadd_task);
>   wait_for_completion(); // create->done
> 
> , right?
> 
> Wonder running in a kworker is necessary to trigger this, I mean running
> a cpu_down() in a normal process context could also trigger this, no?
> Just ask out of curiosity.
> 
> Regards,
> Boqun

Hi Boqun,

cpu_down() in a normal process context can also trigger this.



Regards
Prateek
> 
>>>
>>> kthreadd- percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>>>
>>> init:729- percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>>>   lock(cpuset_mutex)   [waiting]
>>>
>>> init:1  - lock(cpuset_mutex)   [held]
>>>   percpu_down_read(_hotplug_lock)   [waiting]
>>
>> That's both unreadable and useless :/ You want to tell what code paths
>> that were, not which random tasks happened to run them.
>>
>>
> [...]
> 


-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-07 Thread Prateek Sood
On 09/07/2017 12:58 PM, Peter Zijlstra wrote:
> On Thu, Sep 07, 2017 at 11:34:12AM +0530, Prateek Sood wrote:
>> Remove circular dependency deadlock in a scenario where hotplug of CPU is
>> being done while there is updation in cgroup and cpuset triggered from
>> userspace.
>>
>> Example scenario:
>> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>>
>> kworker/0:0 - percpu_down_write(_hotplug_lock)  [held]
>>   flush(work)   [no high prio workqueue available on CPU]
>>   wait_for_completion()
>>
>> kthreadd- percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>>
>> init:729- percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>>   lock(cpuset_mutex)   [waiting]
>>
>> init:1  - lock(cpuset_mutex)   [held]
>>   percpu_down_read(_hotplug_lock)   [waiting]
> 
> That's both unreadable and useless :/ You want to tell what code paths
> that were, not which random tasks happened to run them.
> 
> 
> 
>> Eliminate this dependecy by reordering locking of cpuset_mutex
>> and cpu_hotplug_lock in following order
>> 1. Acquire cpu_hotplug_lock (read)
>> 2. Acquire cpuset_mutex
>>
>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
>> ---
>>  kernel/cgroup/cpuset.c | 70 
>> +++---
>>  1 file changed, 61 insertions(+), 9 deletions(-)
>>
>> diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
>> index 2f4039b..687be57 100644
>> --- a/kernel/cgroup/cpuset.c
>> +++ b/kernel/cgroup/cpuset.c
>> @@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
>>  out:
>>  put_online_cpus();
>>  }
>> +
>> +/*
>> + * Rebuild scheduler domains.
>> + * Call with following lock held in the order
>> + * 1. cpu_hotplug_lock (read)
>> + * 2. cpuset_mutex
> 
> Do not put that in comments, nobody ever reads comments.
> 
>> + */
>> +static void rebuild_sched_domains_unlocked(void)
> 
> The common postfix for a function called with the cpuhotplug lock held
> is: _cpuslocked()
> 
>> +{
>> +struct sched_domain_attr *attr;
>> +cpumask_var_t *doms;
>> +int ndoms;
> 
>   lockdep_assert_cpus_held();
>   lockdep_assert_held(_mutex);
> 
>> +
>> +/*
>> + * We have raced with CPU hotplug. Don't do anything to avoid
>> + * passing doms with offlined cpu to partition_sched_domains().
>> + * Anyways, hotplug work item will rebuild sched domains.
>> + */
>> +if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
>> +return;
>> +
>> +/* Generate domain masks and attrs */
>> +ndoms = generate_sched_domains(, );
>> +
>> +/* Have scheduler rebuild the domains */
>> +partition_sched_domains(ndoms, doms, attr);
>> +}
> 
> And you couldn't come up with a way to share _anything_ with the
> existing rebuild_sched_domains_locked() function?
> 
> *sigh*.. please try again.
> 

Thanks for the suggestions, Peter. I will resend the patch.

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-07 Thread Prateek Sood
Remove a circular dependency deadlock in a scenario where CPU hotplug is
being done while cgroup and cpuset updates are triggered from
userspace.

Example scenario:
kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0

kworker/0:0 - percpu_down_write(&cpu_hotplug_lock)  [held]
  flush(work)   [no high prio workqueue available on CPU]
  wait_for_completion()

kthreadd- percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]

init:729- percpu_down_write(cgroup_threadgroup_rwsem)   [held]
  lock(cpuset_mutex)   [waiting]

init:1  - lock(cpuset_mutex)   [held]
  percpu_down_read(&cpu_hotplug_lock)   [waiting]

Eliminate this dependency by reordering the locking of cpuset_mutex
and cpu_hotplug_lock in the following order:
1. Acquire cpu_hotplug_lock (read)
2. Acquire cpuset_mutex

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 70 +++---
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..687be57 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
 out:
put_online_cpus();
 }
+
+/*
+ * Rebuild scheduler domains.
+ * Call with following lock held in the order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
+ */
+static void rebuild_sched_domains_unlocked(void)
+{
+struct sched_domain_attr *attr;
+cpumask_var_t *doms;
+int ndoms;
+
+/*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+return;
+
+/* Generate domain masks and attrs */
+ndoms = generate_sched_domains(, );
+
+/* Have scheduler rebuild the domains */
+partition_sched_domains(ndoms, doms, attr);
+}
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
 {
 }
+
+static void rebuild_sched_domains_unlocked(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
@@ -885,7 +916,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  *
  * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
@@ -940,7 +973,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct 
cpumask *new_cpus)
rcu_read_unlock();
 
if (need_rebuild_sched_domains)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
 }
 
 /**
@@ -1262,6 +1295,11 @@ int current_cpuset_is_being_rebound(void)
return ret;
 }
 
+/*
+ * Call with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
+ */
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
@@ -1273,7 +1311,7 @@ static int update_relax_domain_level(struct cpuset *cs, 
s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
}
 
return 0;
@@ -1304,9 +1342,10 @@ static void update_tasks_flags(struct cpuset *cs)
  * cs: the cpuset to update
  * turning_on: whether the flag is being set or cleared
  *
- * Call with cpuset_mutex held.
+ * Call with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
   int turning_on)
 {
@@ -1339,7 +1378,7 @@ static int update_flag(cpuset_flagbits_t bit, struct 
cpuset *cs,
spin_unlock_irq(_lock);
 
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
 
if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1607,6 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state 
*css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
 
+   get_online_cpus();
mutex_lock(_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1644,6 +1684,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state 
*css, struct cftype *cft,
}
 out_unlock:
mutex_unlock(_mutex);
+   put_online_cpus();
return retval;
 }
 
@@ -1654,6 +1695,7 @@ static i

[PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-07 Thread Prateek Sood
Remove circular dependency deadlock in a scenario where hotplug of CPU is
being done while there is updation in cgroup and cpuset triggered from
userspace.

Example scenario:
kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0

kworker/0:0 - percpu_down_write(_hotplug_lock)  [held]
  flush(work)   [no high prio workqueue available on CPU]
  wait_for_completion()

kthreadd- percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]

init:729- percpu_down_write(cgroup_threadgroup_rwsem)   [held]
  lock(cpuset_mutex)   [waiting]

init:1  - lock(cpuset_mutex)   [held]
  percpu_down_read(_hotplug_lock)   [waiting]

Eliminate this dependecy by reordering locking of cpuset_mutex
and cpu_hotplug_lock in following order
1. Acquire cpu_hotplug_lock (read)
2. Acquire cpuset_mutex
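
For illustration, a write-side cpuset handler then follows the pattern
below. This is only a sketch: cpuset_write_example() is a hypothetical
stand-in for the real handlers, such as cpuset_write_u64() updated in
the diff, with everything but the locking and the online check elided.

static int cpuset_write_example(struct cpuset *cs, u64 val)
{
        int retval = 0;

        get_online_cpus();              /* 1. cpu_hotplug_lock (read) */
        mutex_lock(&cpuset_mutex);      /* 2. cpuset_mutex */

        if (!is_cpuset_online(cs)) {
                retval = -ENODEV;
                goto out_unlock;
        }

        /* apply val to the cpuset; may end up in rebuild_sched_domains_unlocked() */

out_unlock:
        mutex_unlock(&cpuset_mutex);    /* release in reverse order */
        put_online_cpus();
        return retval;
}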

Signed-off-by: Prateek Sood 
---
 kernel/cgroup/cpuset.c | 70 +++---
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..687be57 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
 out:
put_online_cpus();
 }
+
+/*
+ * Rebuild scheduler domains.
+ * Call with following lock held in the order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
+ */
+static void rebuild_sched_domains_unlocked(void)
+{
+struct sched_domain_attr *attr;
+cpumask_var_t *doms;
+int ndoms;
+
+/*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+return;
+
+/* Generate domain masks and attrs */
+ndoms = generate_sched_domains(&doms, &attr);
+
+/* Have scheduler rebuild the domains */
+partition_sched_domains(ndoms, doms, attr);
+}
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
 {
 }
+
+static void rebuild_sched_domains_unlocked(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
@@ -885,7 +916,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  *
  * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
@@ -940,7 +973,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
rcu_read_unlock();
 
if (need_rebuild_sched_domains)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
 }
 
 /**
@@ -1262,6 +1295,11 @@ int current_cpuset_is_being_rebound(void)
return ret;
 }
 
+/*
+ * Call with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
+ */
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
@@ -1273,7 +1311,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
}
 
return 0;
@@ -1304,9 +1342,10 @@ static void update_tasks_flags(struct cpuset *cs)
  * cs: the cpuset to update
  * turning_on: whether the flag is being set or cleared
  *
- * Call with cpuset_mutex held.
+ * Call with following lock held in order
+ * 1. cpu_hotplug_lock (read)
+ * 2. cpuset_mutex
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
   int turning_on)
 {
@@ -1339,7 +1378,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spin_unlock_irq(&callback_lock);
 
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
 
if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1607,6 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
 
+   get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1644,6 +1684,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
 out_unlock:
mutex_unlock(&cpuset_mutex);
+   put_online_cpus();
return retval;
 }
 
@@ -1654,6 +1695,7 @@ static int cpuset_write_s64(struct 

Re: [PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-06 Thread Prateek Sood
On 09/06/2017 06:26 PM, Waiman Long wrote:
> On 09/06/2017 07:48 AM, Prateek Sood wrote:
>> Remove a circular dependency deadlock in a scenario where CPU hotplug
>> is in progress while cgroup and cpuset updates are triggered from
>> userspace.
>>
>> Example scenario:
>> kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0
>>
>> kworker/0:0 - lock(cpuhotplug.mutex)  [held]
>>   flush(work)   [no high prio workqueue available on CPU]
>>   wait_for_completion()
>>
>> kthreadd- percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]
>>
>> init:729- percpu_down_write(cgroup_threadgroup_rwsem)   [held]
>>   lock(cpuset_mutex)   [waiting]
>>
>> init:1  - lock(cpuset_mutex)   [held]
>>   lock(cpuhotplug.mutex)   [waiting]
>>
>> Eliminate this dependency by reordering the locking of cpuset_mutex
>> and cpuhotplug.mutex in the following order
>> 1. Acquire cpuhotplug.mutex
>> 2. Acquire cpuset_mutex
>>
>> Signed-off-by: Prateek Sood <prs...@codeaurora.org>
> 
> Is this patch for the latest upstream kernel or 4.4? There is no
> cpuhotplug.mutex anymore in upstream kernel. It is a per-cpu rwsem
> cpu_hotplug_lock.
> 
> Cheers,
> Longman
> 

Thanks for the inputs; I will check the latest kernel for details.

Regards
Prateek

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project


[PATCH] cgroup/cpuset: remove circular dependency deadlock

2017-09-06 Thread Prateek Sood
Remove a circular dependency deadlock in a scenario where CPU hotplug
is in progress while cgroup and cpuset updates are triggered from
userspace.

Example scenario:
kworker/0:0 => kthreadd => init:729 => init:1 => kworker/0:0

kworker/0:0 - lock(cpuhotplug.mutex)  [held]
  flush(work)   [no high prio workqueue available on CPU]
  wait_for_completion()

kthreadd- percpu_down_read(cgroup_threadgroup_rwsem)  [waiting]

init:729- percpu_down_write(cgroup_threadgroup_rwsem)   [held]
  lock(cpuset_mutex)   [waiting]

init:1  - lock(cpuset_mutex)   [held]
  lock(cpuhotplug.mutex)   [waiting]

Eliminate this dependency by reordering the locking of cpuset_mutex
and cpuhotplug.mutex into the following order (see the sketch after
this list):
1. Acquire cpuhotplug.mutex
2. Acquire cpuset_mutex
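
For illustration only (flag_update_old()/flag_update_new() below are
hypothetical helpers, not part of the patch): the old path took
cpuset_mutex first and acquired the hotplug lock only inside
rebuild_sched_domains_locked(), while the new path takes the hotplug
lock up front and calls rebuild_sched_domains_unlocked() with both
locks already held.

/* Old, deadlock-prone ordering */
static void flag_update_old(struct cpuset *cs)
{
        mutex_lock(&cpuset_mutex);
        /* ... update cs ... */
        rebuild_sched_domains_locked();   /* takes cpuhotplug.mutex inside */
        mutex_unlock(&cpuset_mutex);
}

/* New ordering introduced by this patch */
static void flag_update_new(struct cpuset *cs)
{
        get_online_cpus();                /* 1. cpuhotplug.mutex */
        mutex_lock(&cpuset_mutex);        /* 2. cpuset_mutex */
        /* ... update cs ... */
        rebuild_sched_domains_unlocked(); /* expects both locks held */
        mutex_unlock(&cpuset_mutex);
        put_online_cpus();
}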

Signed-off-by: Prateek Sood <prs...@codeaurora.org>
---
 kernel/cgroup/cpuset.c | 70 +++---
 1 file changed, 61 insertions(+), 9 deletions(-)

diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 2f4039b..c7a3901 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -843,10 +843,41 @@ static void rebuild_sched_domains_locked(void)
 out:
put_online_cpus();
 }
+
+/*
+ * Rebuild scheduler domains.
+ * Call with following mutex held in the order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
+ */
+static void rebuild_sched_domains_unlocked(void)
+{
+struct sched_domain_attr *attr;
+cpumask_var_t *doms;
+int ndoms;
+
+/*
+ * We have raced with CPU hotplug. Don't do anything to avoid
+ * passing doms with offlined cpu to partition_sched_domains().
+ * Anyways, hotplug work item will rebuild sched domains.
+ */
+if (!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
+return;
+
+/* Generate domain masks and attrs */
+ndoms = generate_sched_domains(&doms, &attr);
+
+/* Have scheduler rebuild the domains */
+partition_sched_domains(ndoms, doms, attr);
+}
 #else /* !CONFIG_SMP */
 static void rebuild_sched_domains_locked(void)
 {
 }
+
+static void rebuild_sched_domains_unlocked(void)
+{
+}
 #endif /* CONFIG_SMP */
 
 void rebuild_sched_domains(void)
@@ -885,7 +916,9 @@ static void update_tasks_cpumask(struct cpuset *cs)
  *
  * On legacy hierachy, effective_cpus will be the same with cpu_allowed.
  *
- * Called with cpuset_mutex held
+ * Called with following mutex held in order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
  */
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
@@ -940,7 +973,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
rcu_read_unlock();
 
if (need_rebuild_sched_domains)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
 }
 
 /**
@@ -1262,6 +1295,11 @@ int current_cpuset_is_being_rebound(void)
return ret;
 }
 
+/*
+ * Call with following mutex held in order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
+ */
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
 #ifdef CONFIG_SMP
@@ -1273,7 +1311,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
cs->relax_domain_level = val;
if (!cpumask_empty(cs->cpus_allowed) &&
is_sched_load_balance(cs))
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
}
 
return 0;
@@ -1304,9 +1342,10 @@ static void update_tasks_flags(struct cpuset *cs)
  * cs: the cpuset to update
  * turning_on: whether the flag is being set or cleared
  *
- * Call with cpuset_mutex held.
+ * Call with following mutex held in order
+ * 1. cpuhotplug.mutex
+ * 2. cpuset_mutex
  */
-
 static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
   int turning_on)
 {
@@ -1339,7 +1378,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
spin_unlock_irq(&callback_lock);
 
if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
-   rebuild_sched_domains_locked();
+   rebuild_sched_domains_unlocked();
 
if (spread_flag_changed)
update_tasks_flags(cs);
@@ -1607,6 +1646,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
cpuset_filetype_t type = cft->private;
int retval = 0;
 
+   get_online_cpus();
mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
@@ -1644,6 +1684,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
}
 out_unlock:
mutex_unlock(&cpuset_mutex);
+   put_online_cpus();
return retval;
 }
 
@@ -1654,6 +1695,7 @@ static int cpuset_write_s64(struct

Re: [PATCH] Workqueue lockup: Circular dependency in threads

2017-09-06 Thread Prateek Sood
On 09/05/2017 06:52 PM, Tejun Heo wrote:
> Hello,
> 
> On Thu, Aug 31, 2017 at 06:43:56PM +0530, Prateek Sood wrote:
>>> 6) cpuset_mutex is acquired by task init:1 and is waiting for cpuhotplug 
>>> lock.
> 
> Yeah, this is the problematic one.
> 
>>> We can reorder the sequence of locks as in the below diff to avoid this
>>> deadlock. But I am looking for inputs/better solution to fix this deadlock.
>>>
>>> ---
>>> diff --git a/kernel/cpuset.c b/kernel/cpuset.c
>>>  /**
>>>   * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
>>>   * @cs: the cpuset in which each task's cpus_allowed mask needs to be 
>>> changed
>>> @@ -930,7 +946,7 @@ static void update_cpumasks_hier(struct cpuset *cs, 
>>> struct cpumask *new_cpus)
>>> rcu_read_unlock();
>>>  
>>> if (need_rebuild_sched_domains)
>>> -   rebuild_sched_domains_locked();
>>> +   rebuild_sched_domains_unlocked()(without taking cpuhotplug.lock)
>>>  }
>>>  
>>>  /**
>>> @@ -1719,6 +1735,7 @@ static ssize_t cpuset_write_resmask(struct 
>>> kernfs_open_file *of,
>>> +   get_online_cpus();
>>> mutex_lock(&cpuset_mutex);
>>> if (!is_cpuset_online(cs))
>>> goto out_unlock;
>>> @@ -1744,6 +1761,7 @@ static ssize_t cpuset_write_resmask(struct 
>>> kernfs_open_file *of,
>>> mutex_unlock(&cpuset_mutex);
>>> +   put_online_cpus();
>>> kernfs_unbreak_active_protection(of->kn);
>>> css_put(&cs->css);
>>> flush_workqueue(cpuset_migrate_mm_wq);
>>>
> 
> And the patch looks good to me.  Can you please format the patch with
> proper description and sob?
> 
> Thanks.
> 

Thanks for the review, Tejun.

I will send an updated patch.

-- 
Qualcomm India Private Limited, on behalf of Qualcomm Innovation
Center, Inc., is a member of Code Aurora Forum, a Linux Foundation
Collaborative Project

