[PATCH 2/8] fs/aio: Use RCU accessors for kioctx_table->table[]

2018-03-14 Thread Tejun Heo
While converting ioctx index from a list to a table, db446a08c23d
("aio: convert the ioctx list to table lookup v3") missed tagging
kioctx_table->table[] as an array of RCU pointers and using the
appropriate RCU accessors.  This introduces a small window in the
lookup path where init and access may race.

Mark kioctx_table->table[] with __rcu and use the appropriate RCU
accessors when accessing the field.
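
For illustration, the publish/lookup pattern the patch switches to looks
roughly like the minimal sketch below ("foo" and the helpers are hypothetical
names for illustration, not the fs/aio code itself):

#include <linux/rcupdate.h>

struct foo;

struct foo_table {
        unsigned                nr;
        struct foo __rcu        *slot[];        /* array of RCU-protected pointers */
};

/* writer side, called with the table lock held */
static void foo_publish(struct foo_table *t, unsigned int i, struct foo *p)
{
        /* orders initialization of *p before the pointer becomes visible */
        rcu_assign_pointer(t->slot[i], p);
}

/* reader side, called under rcu_read_lock() */
static struct foo *foo_lookup(struct foo_table *t, unsigned int i)
{
        /* pairs with the rcu_assign_pointer() above */
        return rcu_dereference(t->slot[i]);
}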

Signed-off-by: Tejun Heo 
Reported-by: Jann Horn 
Fixes: db446a08c23d ("aio: convert the ioctx list to table lookup v3")
Cc: Benjamin LaHaise 
Cc: Linus Torvalds 
Cc: sta...@vger.kernel.org
---
 fs/aio.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index eb2e0cf..6bcd3fb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,9 +68,9 @@ struct aio_ring {
 #define AIO_RING_PAGES 8
 
 struct kioctx_table {
-   struct rcu_head rcu;
-   unsigned nr;
-   struct kioctx   *table[];
+   struct rcu_head rcu;
+   unsigned nr;
+   struct kioctx __rcu *table[];
 };
 
 struct kioctx_cpu {
@@ -330,7 +330,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
for (i = 0; i < table->nr; i++) {
struct kioctx *ctx;
 
-   ctx = table->table[i];
+   ctx = rcu_dereference(table->table[i]);
if (ctx && ctx->aio_ring_file == file) {
if (!atomic_read(&ctx->dead)) {
ctx->user_id = ctx->mmap_base = vma->vm_start;
@@ -666,9 +666,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
while (1) {
if (table)
for (i = 0; i < table->nr; i++)
-   if (!table->table[i]) {
+   if (!rcu_access_pointer(table->table[i])) {
ctx->id = i;
-   table->table[i] = ctx;
+   rcu_assign_pointer(table->table[i], ctx);
spin_unlock(&mm->ioctx_lock);
 
/* While kioctx setup is in progress,
@@ -849,8 +849,8 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
}
 
table = rcu_dereference_raw(mm->ioctx_table);
-   WARN_ON(ctx != table->table[ctx->id]);
-   table->table[ctx->id] = NULL;
+   WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
+   RCU_INIT_POINTER(table->table[ctx->id], NULL);
spin_unlock(&mm->ioctx_lock);
 
/* free_ioctx_reqs() will do the necessary RCU synchronization */
@@ -895,7 +895,8 @@ void exit_aio(struct mm_struct *mm)
 
skipped = 0;
for (i = 0; i < table->nr; ++i) {
-   struct kioctx *ctx = table->table[i];
+   struct kioctx *ctx =
+   rcu_dereference_protected(table->table[i], true);
 
if (!ctx) {
skipped++;
@@ -1084,7 +1085,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
if (!table || id >= table->nr)
goto out;
 
-   ctx = table->table[id];
+   ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
percpu_ref_get(&ctx->users);
ret = ctx;
-- 
2.9.5



[PATCH 1/8] fs/aio: Add explicit RCU grace period when freeing kioctx

2018-03-14 Thread Tejun Heo
While fixing refcounting, e34ecee2ae79 ("aio: Fix a trinity splat")
incorrectly removed explicit RCU grace period before freeing kioctx.
The intention seems to have been to depend on the internal RCU grace periods
of percpu_ref; however, percpu_ref uses a different flavor of RCU,
sched-RCU.  This can lead to kioctx being freed while RCU read
protected dereferences are still in progress.

Fix it by updating free_ioctx() to go through call_rcu() explicitly.

v2: Comment added to explain double bouncing.
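
For reference, the generic shape of the double bounce is sketched below;
call_rcu() callbacks run in softirq context and cannot sleep, so the actual
freeing is punted to a workqueue.  The "obj" type and helpers are made-up
names for illustration, not the kioctx code:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
        struct rcu_head         rcu;    /* first bounce: RCU grace period */
        struct work_struct      work;   /* second bounce: process context */
};

static void obj_free_workfn(struct work_struct *work)
{
        struct obj *o = container_of(work, struct obj, work);

        /* process context: sleeping operations are fine here */
        kfree(o);
}

static void obj_free_rcu(struct rcu_head *head)
{
        struct obj *o = container_of(head, struct obj, rcu);

        /* softirq context: just punt to a workqueue */
        INIT_WORK(&o->work, obj_free_workfn);
        schedule_work(&o->work);
}

static void obj_release(struct obj *o)
{
        /* all rcu_read_lock() sections which saw @o finish before the free */
        call_rcu(&o->rcu, obj_free_rcu);
}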

Signed-off-by: Tejun Heo <t...@kernel.org>
Reported-by: Jann Horn <ja...@google.com>
Fixes: e34ecee2ae79 ("aio: Fix a trinity splat")
Cc: Kent Overstreet <kent.overstr...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: sta...@vger.kernel.org
---
 fs/aio.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index a062d75..eb2e0cf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -115,7 +115,8 @@ struct kioctx {
struct page **ring_pages;
long nr_pages;
 
-   struct work_struct  free_work;
+   struct rcu_head free_rcu;
+   struct work_struct  free_work;  /* see free_ioctx() */
 
/*
 * signals when all in-flight requests are done
@@ -588,6 +589,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
return cancel(&kiocb->common);
 }
 
+/*
+ * free_ioctx() should be RCU delayed to synchronize against the RCU
+ * protected lookup_ioctx() and also needs process context to call
+ * aio_free_ring(), so the double bouncing through kioctx->free_rcu and
+ * ->free_work.
+ */
 static void free_ioctx(struct work_struct *work)
 {
struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -601,6 +608,14 @@ static void free_ioctx(struct work_struct *work)
kmem_cache_free(kioctx_cachep, ctx);
 }
 
+static void free_ioctx_rcufn(struct rcu_head *head)
+{
+   struct kioctx *ctx = container_of(head, struct kioctx, free_rcu);
+
+   INIT_WORK(&ctx->free_work, free_ioctx);
+   schedule_work(&ctx->free_work);
+}
+
 static void free_ioctx_reqs(struct percpu_ref *ref)
 {
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
@@ -609,8 +624,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
complete(&ctx->rq_wait->comp);
 
-   INIT_WORK(&ctx->free_work, free_ioctx);
-   schedule_work(&ctx->free_work);
+   /* Synchronize against RCU protected table->table[] dereferences */
+   call_rcu(&ctx->free_rcu, free_ioctx_rcufn);
 }
 
 /*
@@ -838,7 +853,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
table->table[ctx->id] = NULL;
spin_unlock(&mm->ioctx_lock);
 
-   /* percpu_ref_kill() will do the necessary call_rcu() */
+   /* free_ioctx_reqs() will do the necessary RCU synchronization */
wake_up_all(&ctx->wait);
 
/*
-- 
2.9.5



[PATCHSET v2] percpu_ref, RCU: Audit RCU usages in percpu_ref users

2018-03-14 Thread Tejun Heo
Hello,

Linus, if you want to pick up the first three fix patches, please pull
from the following branch.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git percpu_ref-rcu-audit-fixes

Changes from v1[L] are

* blk_queue_enter() patch dropped.  Needs further investigation.

* queue_rcu_work_on() dropped and documentation improved.

* rcu_work conversion patches separated out.

Original patchset description follows.

Jann Horn found that aio was depending on the internal RCU grace
periods of percpu-ref and that it's broken because aio uses regular
RCU while percpu_ref uses sched-RCU.

Depending on percpu_ref's internal grace periods isn't a good idea
because

 * The RCU type might not match.

 * percpu_ref's grace periods are used to switch to atomic mode.  They
   aren't between the last put and the invocation of the last release.
   This is easy to get confused about and can lead to subtle bugs.

 * percpu_ref might not have grace periods at all depending on its
   current operation mode.

This patchset audits all percpu_ref users for their RCU usages,
clarifies percpu_ref documentation that the internal grace periods
must not be depended upon, and introduces rcu_work to simplify
bouncing to a workqueue after an RCU grace period.

This patchset contains the following eight patches.

 0001-fs-aio-Add-explicit-RCU-grace-period-when-freeing-ki.patch
 0002-fs-aio-Use-RCU-accessors-for-kioctx_table-table.patch
 0003-RDMAVT-Fix-synchronization-around-percpu_ref.patch
 0004-HMM-Remove-superflous-RCU-protection-around-radix-tr.patch
 0005-percpu_ref-Update-doc-to-dissuade-users-from-dependi.patch
 0006-RCU-workqueue-Implement-rcu_work.patch
 0007-cgroup-Use-rcu_work-instead-of-explicit-rcu-and-work.patch
 0008-fs-aio-Use-rcu_work-instead-of-explicit-rcu-and-work.patch

0001-0003 are fixes and tagged -stable.

0004 removes (seemingly) superfluous RCU read lock usages.

0005 updates the doc and 0006-0008 introduce rcu_work and use it
instead of open-coded rcu+work.

This patchset is also available in the following git tree.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git percpu_ref-rcu-audit-v2

diffstat follows.  Thanks.

 drivers/infiniband/sw/rdmavt/mr.c |   10 ---
 fs/aio.c  |   39 ---
 include/linux/cgroup-defs.h   |2 -
 include/linux/percpu-refcount.h   |   18 
 include/linux/workqueue.h |   23 
 kernel/cgroup/cgroup.c|   21 --
 kernel/workqueue.c|   54 ++
 lib/percpu-refcount.c |2 +
 mm/hmm.c  |   12 +---
 9 files changed, 130 insertions(+), 51 deletions(-)

--
tejun

[L] http://lkml.kernel.org/r/20180306172657.3060270-1...@kernel.org
http://lkml.kernel.org/r/20180306173316.3088458-1...@kernel.org


Re: [PATCH 26/47] ata: remove bf54x driver

2018-03-14 Thread Tejun Heo
On Wed, Mar 14, 2018 at 04:35:39PM +0100, Arnd Bergmann wrote:
> The blackfin architecture is getting removed, so this driver
> is obsolete as well.
> 
> Signed-off-by: Arnd Bergmann <a...@arndb.de>

Acked-by: Tejun Heo <t...@kernel.org>

Please feel free to route with the rest of the series.

Thanks.

-- 
tejun


Re: blk_mq_freeze_queue hang and possible race in percpu-refcount

2018-03-14 Thread Tejun Heo
Hello, David.

On Tue, Mar 13, 2018 at 03:50:47PM -0700, David Chen wrote:
> 
> CPU A                                 CPU B
> -----                                 -----
> percpu_ref_kill()                     percpu_ref_tryget_live()
>                                       {
>                                           if (__ref_is_percpu())
>   set __PERCPU_REF_DEAD;
>   __percpu_ref_switch_mode();
>     ^ sum up current percpu_count
>                                               this_cpu_inc(*percpu_count); <- this
>                                                   increment got leaked.
> 
> 
> 
> So if CPU B later does percpu_ref_put, it will cause ref->count
> to drop to -1.
> And thus causing the above hung task issue.
> 
> Do you think this theory is correct, or am I missing something?
> Please tell me what do you think.

The switching to atomic mode does something like the following.

1. Mark the refcnt so that __ref_is_percpu() is false.

2. Wait for RCU grace period so that everyone including
   percpu_ref_tryget_live() which has seen true __ref_is_percpu() is
   done with its operation.

3. Now that it knows nobody is operating on the assumption that the
   counter is in percpu mode, it adds up all the percpu counters.

So, provided there aren't some silly bugs, what you described
shouldn't happen.  Can you force the refcnt into atomic mode w/
PERCPU_REF_INIT_ATOMIC and see whether the problem persists?
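
(For reference, forcing atomic mode at init time looks roughly like the
snippet below; "my_ref" and "my_release" are placeholders, not the actual
block layer code.)

        /* keep the ref in atomic mode so the percpu summing path is
         * never exercised while chasing this */
        ret = percpu_ref_init(&my_ref, my_release,
                              PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);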

Thanks.

-- 
tejun


Re: [PATCH] percpu: Allow to kill tasks doing pcpu_alloc() and waiting for pcpu_balance_workfn()

2018-03-14 Thread Tejun Heo
On Wed, Mar 14, 2018 at 02:51:48PM +0300, Kirill Tkhai wrote:
> In case of memory deficit and low percpu memory pages,
> pcpu_balance_workfn() takes pcpu_alloc_mutex for a long
> time (as it makes memory allocations itself and waits
> for memory reclaim). If tasks doing pcpu_alloc() are
> chosen by the OOM killer, they can't exit, because they
> are waiting for the mutex.
> 
> The patch makes pcpu_alloc() care about the killing signal
> and use mutex_lock_killable() when it's allowed by the GFP
> flags. This guarantees a task does not miss SIGKILL
> from the OOM killer.
> 
> Signed-off-by: Kirill Tkhai 

Applied to percpu/for-4.16-fixes.

Thanks, Kirill.

-- 
tejun


Re: [PATCH] libata: add refcounting to ata_host

2018-03-13 Thread Tejun Heo
On Fri, Mar 09, 2018 at 08:34:41AM +, Taras Kondratiuk wrote:
> After commit 9a6d6a2ddabb ("ata: make ata port as parent device of scsi
> host") manual driver unbind/remove causes use-after-free.
> 
> Unbind unconditionally invokes devres_release_all() which calls
> ata_host_release() and frees ata_host/ata_port memory while it is still
> being referenced as a parent of SCSI host. When SCSI host is finally
> released scsi_host_dev_release() calls put_device(parent) and accesses
> freed ata_port memory.
> 
> Add reference counting to make sure that ata_host lives long enough.
> 
> Bug report: https://lkml.org/lkml/2017/11/1/945
> Fixes: 9a6d6a2ddabb ("ata: make ata port as parent device of scsi host")
> Cc: Tejun Heo <t...@kernel.org>
> Cc: Lin Ming <min...@gmail.com>
> Cc: linux-...@vger.kernel.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Taras Kondratiuk <takon...@cisco.com>

Applied to libata/for-4.17.

Thanks.

-- 
tejun


Re: [PATCH v2] workqueue: use put_device() instead of kfree()

2018-03-13 Thread Tejun Heo
On Tue, Mar 06, 2018 at 03:35:43PM +0530, Arvind Yadav wrote:
> Never directly free @dev after calling device_register(), even
> if it returned an error! Always use put_device() to give up the
> reference initialized in this function instead.
> 
> Signed-off-by: Arvind Yadav 

Applied to wq/for-4.16-fixes.

Thanks.

-- 
tejun


Re: [PATCH 7/7] RCU, workqueue: Implement rcu_work

2018-03-09 Thread Tejun Heo
Hello,

On Thu, Mar 08, 2018 at 08:29:53AM +0800, Lai Jiangshan wrote:
> I mentioned a subtle use case that user would think it is supported
> since the comment doesn't disallow it.
> 
> It is clear that the user expects
>the work must be called at least once after the API returns
>the work must be called after an RCU grace period
> 
> But in the case when the user expects the work must be called
> at least once again after "queue_rcu_work() + an RCU grace period",
> the API is not competent to it if the work is queued.
> Although the user can detect it by the return value of
> queue_rcu_work(), the user hardly makes his expectation
> happen by adding appropriate code.

We should definitely document it better but it isn't any different
from delayed_work, and I don't see a reason to deviate.
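
(For reference, a caller that cares can check the return value just as with
delayed_work; "obj" and "rwork" below are hypothetical names.)

        if (!queue_rcu_work(system_wq, &obj->rwork)) {
                /*
                 * Already pending: the earlier queueing will run the work
                 * after its own RCU grace period, same as an already-pending
                 * delayed_work.
                 */
        }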

Thanks.

-- 
tejun


Re: [PATCH 7/7] RCU, workqueue: Implement rcu_work

2018-03-09 Thread Tejun Heo
Hello, Linus.

On Tue, Mar 06, 2018 at 10:30:29AM -0800, Linus Torvalds wrote:
>  - can we split this patch up, so that if somebody bisects a problem
> to it, we'll see if it's cgroup or aio that triggers it?

Will do.

> So I'd like to either just make the thing always just use
> WORK_CPU_UNBOUND, or hear some kind of (handwaving ok) explanation for
> why something else would ever make sense. If the action is
> fundamentally delayed by RCU, why would it make a difference which CPU
> it runs on?

It was mostly for consistency with other interfaces.  Let's drop
queue_work_on() for now.  If ever necessary, we can easily add it back
later.

Thanks.

-- 
tejun


[PATCH 7/7] RCU, workqueue: Implement rcu_work

2018-03-06 Thread Tejun Heo
There are cases where RCU callback needs to be bounced to a sleepable
context.  This is currently done by the RCU callback queueing a work
item, which can be cumbersome to write and confusing to read.

This patch introduces rcu_work, a workqueue work variant which gets
executed after a RCU grace period, and converts the open coded
bouncing in fs/aio and kernel/cgroup.

v2: Use rcu_barrier() instead of synchronize_rcu() to wait for
completion of previously queued rcu callback as per Paul.
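
With this, a user that previously open-coded an rcu_head plus a work_struct
only needs something like the following sketch ("obj" is a hypothetical
example, not one of the converted users):

#include <linux/slab.h>
#include <linux/workqueue.h>

struct obj {
        struct rcu_work free_rwork;
};

static void obj_free(struct work_struct *work)
{
        struct obj *o = container_of(to_rcu_work(work), struct obj, free_rwork);

        /* runs in process context, after an RCU grace period */
        kfree(o);
}

static void obj_release(struct obj *o)
{
        INIT_RCU_WORK(&o->free_rwork, obj_free);
        queue_rcu_work(system_wq, &o->free_rwork);
}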

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: "Paul E. McKenney" <paul...@linux.vnet.ibm.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
---
 fs/aio.c| 21 +-
 include/linux/cgroup-defs.h |  2 +-
 include/linux/workqueue.h   | 38 +++
 kernel/cgroup/cgroup.c  | 21 ++
 kernel/workqueue.c  | 54 +
 5 files changed, 106 insertions(+), 30 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 6bcd3fb..88d7927 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -115,8 +115,7 @@ struct kioctx {
struct page **ring_pages;
long nr_pages;
 
-   struct rcu_head free_rcu;
-   struct work_struct  free_work;  /* see free_ioctx() */
+   struct rcu_work free_rwork; /* see free_ioctx() */
 
/*
 * signals when all in-flight requests are done
@@ -592,13 +591,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
 /*
  * free_ioctx() should be RCU delayed to synchronize against the RCU
  * protected lookup_ioctx() and also needs process context to call
- * aio_free_ring(), so the double bouncing through kioctx->free_rcu and
- * ->free_work.
+ * aio_free_ring().  Use rcu_work.
  */
 static void free_ioctx(struct work_struct *work)
 {
-   struct kioctx *ctx = container_of(work, struct kioctx, free_work);
-
+   struct kioctx *ctx = container_of(to_rcu_work(work), struct kioctx,
+ free_rwork);
pr_debug("freeing %p\n", ctx);
 
aio_free_ring(ctx);
@@ -608,14 +606,6 @@ static void free_ioctx(struct work_struct *work)
kmem_cache_free(kioctx_cachep, ctx);
 }
 
-static void free_ioctx_rcufn(struct rcu_head *head)
-{
-   struct kioctx *ctx = container_of(head, struct kioctx, free_rcu);
-
-   INIT_WORK(&ctx->free_work, free_ioctx);
-   schedule_work(&ctx->free_work);
-}
-
 static void free_ioctx_reqs(struct percpu_ref *ref)
 {
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
@@ -625,7 +615,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
complete(&ctx->rq_wait->comp);
 
/* Synchronize against RCU protected table->table[] dereferences */
-   call_rcu(&ctx->free_rcu, free_ioctx_rcufn);
+   INIT_RCU_WORK(&ctx->free_rwork, free_ioctx);
+   queue_rcu_work(system_wq, &ctx->free_rwork);
 }
 
 /*
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index 9f242b8..92d7640 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -151,8 +151,8 @@ struct cgroup_subsys_state {
atomic_t online_cnt;
 
/* percpu_ref killing and RCU release */
-   struct rcu_head rcu_head;
struct work_struct destroy_work;
+   struct rcu_work destroy_rwork;
 
/*
 * PI: the parent css.  Placed here for cache proximity to following
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index bc0cda1..b39f3a4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct workqueue_struct;
 
@@ -120,6 +121,15 @@ struct delayed_work {
int cpu;
 };
 
+struct rcu_work {
+   struct work_struct work;
+   struct rcu_head rcu;
+
+   /* target workqueue and CPU ->rcu uses to queue ->work */
+   struct workqueue_struct *wq;
+   int cpu;
+};
+
 /**
  * struct workqueue_attrs - A struct for workqueue attributes.
  *
@@ -151,6 +161,11 @@ static inline struct delayed_work *to_delayed_work(struct work_struct *work)
return container_of(work, struct delayed_work, work);
 }
 
+static inline struct rcu_work *to_rcu_work(struct work_struct *work)
+{
+   return container_of(work, struct rcu_work, work);
+}
+
 struct execute_work {
struct work_struct work;
 };
@@ -266,6 +281,12 @@ static inline unsigned int work_static(struct work_struct *work) { return 0; }
 #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \
__INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE)
 
+#define INIT_RCU_WORK(_work, _func)\
+   INIT_WORK(&(_work)->work, (_func))
+
+#define INIT_RCU_WORK_ONSTACK(_work, _func)\
+   INIT_WORK_ONSTACK(&(_work)->work, (_func))
+
 /**
  * w

[PATCH 6/7] percpu_ref: Update doc to dissuade users from depending on internal RCU grace periods

2018-03-06 Thread Tejun Heo
percpu_ref internally uses sched-RCU to implement the percpu -> atomic
mode switching and the documentation suggested that this could be
depended upon.  This doesn't seem like a good idea.

* percpu_ref uses sched-RCU, which has different grace periods than regular
  RCU.  Users may combine percpu_ref with regular RCU usage and
  incorrectly believe that regular RCU grace periods are performed by
  percpu_ref.  This can lead to, for example, use-after-free due to
  premature freeing.

* percpu_ref has a grace period when switching from percpu to atomic
  mode.  It doesn't have one between the last put and release.  This
  distinction is subtle and can lead to surprising bugs.

* percpu_ref allows starting in and switching to atomic mode manually
  for debugging and other purposes.  This means that there may not be
  any grace periods from kill to release.

This patch makes it clear that the grace periods are percpu_ref's
internal implementation detail and can't be depended upon by the
users.
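
In other words, a user that combines percpu_ref with regular-RCU lookups has
to provide the grace period itself on the release side; a minimal sketch with
a hypothetical "obj" (not any of the audited users):

#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
        struct percpu_ref       ref;
        struct rcu_head         rcu;
};

static void obj_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct obj, rcu));
}

static void obj_ref_release(struct percpu_ref *ref)
{
        struct obj *o = container_of(ref, struct obj, ref);

        /* explicit grace period; don't rely on percpu_ref's internal RCU */
        call_rcu(&o->rcu, obj_free_rcu);
}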

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Kent Overstreet <kent.overstr...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
---
 include/linux/percpu-refcount.h | 18 --
 lib/percpu-refcount.c   |  2 ++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 864d167..009cdf3 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -30,10 +30,14 @@
  * calls io_destroy() or the process exits.
  *
  * In the aio code, kill_ioctx() is called when we wish to destroy a kioctx; it
- * calls percpu_ref_kill(), then hlist_del_rcu() and synchronize_rcu() to remove
- * the kioctx from the proccess's list of kioctxs - after that, there can't be
- * any new users of the kioctx (from lookup_ioctx()) and it's then safe to drop
- * the initial ref with percpu_ref_put().
+ * removes the kioctx from the proccess's table of kioctxs and kills percpu_ref.
+ * After that, there can't be any new users of the kioctx (from lookup_ioctx())
+ * and it's then safe to drop the initial ref with percpu_ref_put().
+ *
+ * Note that the free path, free_ioctx(), needs to go through explicit call_rcu()
+ * to synchronize with RCU protected lookup_ioctx().  percpu_ref operations don't
+ * imply RCU grace periods of any kind and if a user wants to combine percpu_ref
+ * with RCU protection, it must be done explicitly.
  *
  * Code that does a two stage shutdown like this often needs some kind of
  * explicit synchronization to ensure the initial refcount can only be dropped
@@ -113,8 +117,10 @@ void percpu_ref_reinit(struct percpu_ref *ref);
  * Must be used to drop the initial ref on a percpu refcount; must be called
  * precisely once before shutdown.
  *
- * Puts @ref in non percpu mode, then does a call_rcu() before gathering up the
- * percpu counters and dropping the initial ref.
+ * Switches @ref into atomic mode before gathering up the percpu counters
+ * and dropping the initial ref.
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 static inline void percpu_ref_kill(struct percpu_ref *ref)
 {
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index 30e7dd8..9f96fa7 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -322,6 +322,8 @@ EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu);
  * This function normally doesn't block and can be called from any context
  * but it may block if @confirm_kill is specified and @ref is in the
  * process of switching to atomic mode by percpu_ref_switch_to_atomic().
+ *
+ * There are no implied RCU grace periods between kill and release.
  */
 void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
 percpu_ref_func_t *confirm_kill)
-- 
2.9.5



[PATCH 5/7] block: Remove superflous rcu_read_[un]lock_sched() in blk_queue_enter()

2018-03-06 Thread Tejun Heo
3a0a529971ec ("block, scsi: Make SCSI quiesce and resume work
reliably") added rcu_read_[un]lock_sched() to blk_queue_enter() along
with other changes but it doesn't seem to be doing anything.

blk_queue_enter() is called with @q - the pointer to the target
request_queue, so the caller obviously has to guarantee that @q can be
dereferenced, and inside the RCU-sched protected area, there's nothing
which needs further protection.

Let's remove the superfluous rcu_read_[un]lock_sched().

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Bart Van Assche <bart.vanass...@wdc.com>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Linus Torvalds <torva...@linux-foundation.org>
---
Hello, Bart.

This came up while auditing percpu_ref users for problematic RCU
usages.  I couldn't understand what the RCU protection was doing.
It'd be great if you can take a look and tell me whether I missed
something.

Thanks.

 block/blk-core.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 6d82c4f..e3b4d1849 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -827,7 +827,6 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
bool success = false;
int ret;
 
-   rcu_read_lock_sched();
if (percpu_ref_tryget_live(&q->q_usage_counter)) {
/*
 * The code that sets the PREEMPT_ONLY flag is
@@ -840,7 +839,6 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
percpu_ref_put(&q->q_usage_counter);
}
}
-   rcu_read_unlock_sched();
 
if (success)
return 0;
-- 
2.9.5



[PATCH 4/7] HMM: Remove superflous RCU protection around radix tree lookup

2018-03-06 Thread Tejun Heo
hmm_devmem_find() requires rcu_read_lock_held() but there's nothing
which actually uses the RCU protection.  The only caller is
hmm_devmem_pages_create() which already grabs the mutex and does
superfluous rcu_read_lock/unlock() around the function.

This doesn't add anything and just adds to confusion.  Remove the RCU
protection and open-code the radix tree lookup.  If this needs to
become more sophisticated in the future, let's add them back when
necessary.

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Jérôme Glisse <jgli...@redhat.com>
Cc: linux...@kvack.org
Cc: Linus Torvalds <torva...@linux-foundation.org>
---
Hello, Jérôme.

This came up while auditing percpu_ref users for missing explicit RCU
grace periods.  HMM doesn't seem to depend on RCU protection at all,
so I thought it'd be better to remove it for now.  It's only compile
tested.

Thanks.

 mm/hmm.c | 12 ++--
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/mm/hmm.c b/mm/hmm.c
index 320545b98..d4627c5 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -845,13 +845,6 @@ static void hmm_devmem_release(struct device *dev, void *data)
hmm_devmem_radix_release(resource);
 }
 
-static struct hmm_devmem *hmm_devmem_find(resource_size_t phys)
-{
-   WARN_ON_ONCE(!rcu_read_lock_held());
-
-   return radix_tree_lookup(&hmm_devmem_radix, phys >> PA_SECTION_SHIFT);
-}
-
 static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
 {
resource_size_t key, align_start, align_size, align_end;
@@ -892,9 +885,8 @@ static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
struct hmm_devmem *dup;
 
-   rcu_read_lock();
-   dup = hmm_devmem_find(key);
-   rcu_read_unlock();
+   dup = radix_tree_lookup(&hmm_devmem_radix,
+   key >> PA_SECTION_SHIFT);
if (dup) {
dev_err(device, "%s: collides with mapping for %s\n",
__func__, dev_name(dup->device));
-- 
2.9.5



[PATCH 3/7] RDMAVT: Fix synchronization around percpu_ref

2018-03-06 Thread Tejun Heo
rvt_mregion uses percpu_ref for reference counting and RCU to protect
accesses from lkey_table.  When a rvt_mregion needs to be freed, it
first gets unregistered from lkey_table and then rvt_check_refs() is
called to wait for in-flight usages before the rvt_mregion is freed.

rvt_check_refs() seems to have a couple issues.

* It has a fast exit path which tests percpu_ref_is_zero().  However,
  a percpu_ref reading zero doesn't mean that the object can be
  released.  In fact, the ->release() callback might not even have
  started executing yet.  Proceeding with freeing can lead to
  use-after-free.

* lkey_table is RCU protected but there is no RCU grace period in the
  free path.  percpu_ref uses RCU internally but it's sched-RCU whose
  grace periods are different from regular RCU.  Also, it generally
  isn't a good idea to depend on internal behaviors like this.

To address the above issues, this patch removes the fast exit and
adds an explicit synchronize_rcu().
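
The general shape of a teardown that is safe on both counts looks roughly
like the sketch below (hypothetical "obj", not the rdmavt code):

#include <linux/completion.h>
#include <linux/percpu-refcount.h>
#include <linux/rcupdate.h>

struct obj {
        struct percpu_ref       ref;
        struct completion       released;
};

static void obj_ref_release(struct percpu_ref *ref)
{
        complete(&container_of(ref, struct obj, ref)->released);
}

static void obj_teardown(struct obj *o)
{
        /* unpublish @o from the RCU protected lookup structure first */

        /* wait for readers which may still hold a pointer to @o */
        synchronize_rcu();

        /* drop the initial ref and wait for ->release() to actually run;
         * a zero reading from percpu_ref_is_zero() alone doesn't imply that */
        percpu_ref_kill(&o->ref);
        wait_for_completion(&o->released);

        /* now @o can be freed */
}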

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Dennis Dalessandro <dennis.dalessan...@intel.com>
Cc: Mike Marciniszyn <mike.marcinis...@intel.com>
Cc: linux-r...@vger.kernel.org
Cc: Linus Torvalds <torva...@linux-foundation.org>
---
Hello, Dennis, Mike.

I don't know RDMA at all and this patch is only compile tested.  Can
you please take a careful look?

Thanks.

 drivers/infiniband/sw/rdmavt/mr.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/drivers/infiniband/sw/rdmavt/mr.c b/drivers/infiniband/sw/rdmavt/mr.c
index 1b2e536..cc429b5 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -489,11 +489,13 @@ static int rvt_check_refs(struct rvt_mregion *mr, const char *t)
unsigned long timeout;
struct rvt_dev_info *rdi = ib_to_rvt(mr->pd->device);
 
-   if (percpu_ref_is_zero(&mr->refcount))
-   return 0;
-   /* avoid dma mr */
-   if (mr->lkey)
+   if (mr->lkey) {
+   /* avoid dma mr */
rvt_dereg_clean_qps(mr);
+   /* @mr was indexed on rcu protected @lkey_table */
+   synchronize_rcu();
+   }
+
timeout = wait_for_completion_timeout(&mr->comp, 5 * HZ);
if (!timeout) {
rvt_pr_err(rdi,
-- 
2.9.5



[PATCH 2/7] fs/aio: Use RCU accessors for kioctx_table->table[]

2018-03-06 Thread Tejun Heo
While converting ioctx index from a list to a table, db446a08c23d
("aio: convert the ioctx list to table lookup v3") missed tagging
kioctx_table->table[] as an array of RCU pointers and using the
appropriate RCU accessors.  This introduces a small window in the
lookup path where init and access may race.

Mark kioctx_table->table[] with __rcu and use the appropriate RCU
accessors when accessing the field.
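
As a quick reference, the accessors used below map onto contexts roughly as
follows (illustrative snippet; "foo", "slot" and "foo_lock" are hypothetical
names, not the fs/aio code):

#include <linux/rcupdate.h>
#include <linux/spinlock.h>

struct foo;

static struct foo __rcu *slot;
static DEFINE_SPINLOCK(foo_lock);

/* reader: call under rcu_read_lock() */
static struct foo *slot_lookup(void)
{
        return rcu_dereference(slot);
}

/* writer: holding the lock which serializes updates */
static void slot_publish(struct foo *new)
{
        spin_lock(&foo_lock);
        if (!rcu_access_pointer(slot))          /* value test only, no dereference */
                rcu_assign_pointer(slot, new);  /* publish with the needed barrier */
        spin_unlock(&foo_lock);
}

static void slot_clear(void)
{
        spin_lock(&foo_lock);
        RCU_INIT_POINTER(slot, NULL);           /* storing NULL needs no barrier */
        spin_unlock(&foo_lock);
}

/* writer-side read, protection provided by the lock instead of RCU */
static struct foo *slot_peek(void)
{
        return rcu_dereference_protected(slot, lockdep_is_held(&foo_lock));
}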

Signed-off-by: Tejun Heo <t...@kernel.org>
Reported-by: Jann Horn <ja...@google.com>
Fixes: db446a08c23d ("aio: convert the ioctx list to table lookup v3")
Cc: Benjamin LaHaise <b...@kvack.org>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: sta...@vger.kernel.org
---
 fs/aio.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index eb2e0cf..6bcd3fb 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -68,9 +68,9 @@ struct aio_ring {
 #define AIO_RING_PAGES 8
 
 struct kioctx_table {
-   struct rcu_head rcu;
-   unsigned nr;
-   struct kioctx   *table[];
+   struct rcu_head rcu;
+   unsigned nr;
+   struct kioctx __rcu *table[];
 };
 
 struct kioctx_cpu {
@@ -330,7 +330,7 @@ static int aio_ring_mremap(struct vm_area_struct *vma)
for (i = 0; i < table->nr; i++) {
struct kioctx *ctx;
 
-   ctx = table->table[i];
+   ctx = rcu_dereference(table->table[i]);
if (ctx && ctx->aio_ring_file == file) {
if (!atomic_read(&ctx->dead)) {
ctx->user_id = ctx->mmap_base = vma->vm_start;
@@ -666,9 +666,9 @@ static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
while (1) {
if (table)
for (i = 0; i < table->nr; i++)
-   if (!table->table[i]) {
+   if (!rcu_access_pointer(table->table[i])) {
ctx->id = i;
-   table->table[i] = ctx;
+   rcu_assign_pointer(table->table[i], ctx);
spin_unlock(&mm->ioctx_lock);
 
/* While kioctx setup is in progress,
@@ -849,8 +849,8 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
}
 
table = rcu_dereference_raw(mm->ioctx_table);
-   WARN_ON(ctx != table->table[ctx->id]);
-   table->table[ctx->id] = NULL;
+   WARN_ON(ctx != rcu_access_pointer(table->table[ctx->id]));
+   RCU_INIT_POINTER(table->table[ctx->id], NULL);
spin_unlock(&mm->ioctx_lock);
 
/* free_ioctx_reqs() will do the necessary RCU synchronization */
@@ -895,7 +895,8 @@ void exit_aio(struct mm_struct *mm)
 
skipped = 0;
for (i = 0; i < table->nr; ++i) {
-   struct kioctx *ctx = table->table[i];
+   struct kioctx *ctx =
+   rcu_dereference_protected(table->table[i], true);
 
if (!ctx) {
skipped++;
@@ -1084,7 +1085,7 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id)
if (!table || id >= table->nr)
goto out;
 
-   ctx = table->table[id];
+   ctx = rcu_dereference(table->table[id]);
if (ctx && ctx->user_id == ctx_id) {
percpu_ref_get(&ctx->users);
ret = ctx;
-- 
2.9.5



[PATCH 1/7] fs/aio: Add explicit RCU grace period when freeing kioctx

2018-03-06 Thread Tejun Heo
While fixing refcounting, e34ecee2ae79 ("aio: Fix a trinity splat")
incorrectly removed explicit RCU grace period before freeing kioctx.
The intention seems to be depending on the internal RCU grace periods
of percpu_ref; however, percpu_ref uses a different flavor of RCU,
sched-RCU.  This can lead to kioctx being freed while RCU read
protected dereferences are still in progress.

Fix it by updating free_ioctx() to go through call_rcu() explicitly.

v2: Comment added to explain double bouncing.

Signed-off-by: Tejun Heo <t...@kernel.org>
Reported-by: Jann Horn <ja...@google.com>
Fixes: e34ecee2ae79 ("aio: Fix a trinity splat")
Cc: Kent Overstreet <kent.overstr...@gmail.com>
Cc: Linus Torvalds <torva...@linux-foundation.org>
Cc: sta...@vger.kernel.org
---
 fs/aio.c | 23 +++
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index a062d75..eb2e0cf 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -115,7 +115,8 @@ struct kioctx {
struct page **ring_pages;
longnr_pages;
 
-   struct work_struct  free_work;
+   struct rcu_head free_rcu;
+   struct work_struct  free_work;  /* see free_ioctx() */
 
/*
 * signals when all in-flight requests are done
@@ -588,6 +589,12 @@ static int kiocb_cancel(struct aio_kiocb *kiocb)
return cancel(&kiocb->common);
 }
 
+/*
+ * free_ioctx() should be RCU delayed to synchronize against the RCU
+ * protected lookup_ioctx() and also needs process context to call
+ * aio_free_ring(), so the double bouncing through kioctx->free_rcu and
+ * ->free_work.
+ */
 static void free_ioctx(struct work_struct *work)
 {
struct kioctx *ctx = container_of(work, struct kioctx, free_work);
@@ -601,6 +608,14 @@ static void free_ioctx(struct work_struct *work)
kmem_cache_free(kioctx_cachep, ctx);
 }
 
+static void free_ioctx_rcufn(struct rcu_head *head)
+{
+   struct kioctx *ctx = container_of(head, struct kioctx, free_rcu);
+
+   INIT_WORK(&ctx->free_work, free_ioctx);
+   schedule_work(&ctx->free_work);
+}
+
 static void free_ioctx_reqs(struct percpu_ref *ref)
 {
struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
@@ -609,8 +624,8 @@ static void free_ioctx_reqs(struct percpu_ref *ref)
if (ctx->rq_wait && atomic_dec_and_test(&ctx->rq_wait->count))
complete(&ctx->rq_wait->comp);
 
-   INIT_WORK(&ctx->free_work, free_ioctx);
-   schedule_work(&ctx->free_work);
+   /* Synchronize against RCU protected table->table[] dereferences */
+   call_rcu(&ctx->free_rcu, free_ioctx_rcufn);
 }
 
 /*
@@ -838,7 +853,7 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
table->table[ctx->id] = NULL;
spin_unlock(>ioctx_lock);
 
-   /* percpu_ref_kill() will do the necessary call_rcu() */
+   /* free_ioctx_reqs() will do the necessary RCU synchronization */
wake_up_all(&ctx->wait);
 
/*
-- 
2.9.5



[PATCHSET] percpu_ref, RCU: Audit RCU usages in percpu_ref users

2018-03-06 Thread Tejun Heo
Hello,

Jann Horn found that aio was depending on the internal RCU grace
periods of percpu-ref and that it's broken because aio uses regular
RCU while percpu_ref uses sched-RCU.

Depending on percpu_ref's internal grace periods isn't a good idea
because

 * The RCU type might not match.

 * percpu_ref's grace periods are used to switch to atomic mode.  They
   aren't between the last put and the invocation of the last release.
   This is easy to get confused about and can lead to subtle bugs.

 * percpu_ref might not have grace periods at all depending on its
   current operation mode.

This patchset audits all percpu_ref users for their RCU usages,
clarifies percpu_ref documentation that the internal grace periods
must not be depended upon, and introduces rcu_work to simplify
bouncing to a workqueue after an RCU grace period.
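
As a rough sketch of what the new rcu_work interface is meant to enable (the
struct rcu_work / INIT_RCU_WORK() / queue_rcu_work() / to_rcu_work() names are taken
from patch 0007 and may still change), a free path that needs both an RCU grace period
and process context could look like this, with "foo" standing in for any such user:

#include <linux/workqueue.h>
#include <linux/slab.h>

struct foo {
	struct rcu_work free_rwork;	/* replaces a separate rcu_head + work_struct */
	/* ... payload looked up under rcu_read_lock() ... */
};

static void foo_free_workfn(struct work_struct *work)
{
	struct foo *f = container_of(to_rcu_work(work), struct foo, free_rwork);

	/* process context: may sleep */
	kfree(f);
}

static void foo_release(struct foo *f)
{
	INIT_RCU_WORK(&f->free_rwork, foo_free_workfn);
	/* the work item runs from a workqueue only after an RCU grace period */
	queue_rcu_work(system_wq, &f->free_rwork);
}

This collapses the open-coded call_rcu() plus schedule_work() double bounce that 0001
adds to fs/aio.c into a single helper.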

This patchset contains the following seven patches.

 0001-fs-aio-Add-explicit-RCU-grace-period-when-freeing-ki.patch
 0002-fs-aio-Use-RCU-accessors-for-kioctx_table-table.patch
 0003-RDMAVT-Fix-synchronization-around-percpu_ref.patch
 0004-HMM-Remove-superflous-RCU-protection-around-radix-tr.patch
 0005-block-Remove-superflous-rcu_read_-un-lock_sched-in-b.patch
 0006-percpu_ref-Update-doc-to-dissuade-users-from-dependi.patch
 0007-RCU-workqueue-Implement-rcu_work.patch

0001-0003 are fixes and tagged -stable.

0004-0005 remove (seemingly) superfluous RCU read lock usages.

0006 updates the doc and 0007 introduces rcu_work.

This patchset is also available in the following git tree.

 git://git.kernel.org/pub/scm/linux/kernel/git/tj/misc.git percpu_ref-rcu-audit

diffstat follows.  Thanks.

 block/blk-core.c  |2 -
 drivers/infiniband/sw/rdmavt/mr.c |   10 ---
 fs/aio.c  |   39 ---
 include/linux/cgroup-defs.h   |2 -
 include/linux/percpu-refcount.h   |   18 
 include/linux/workqueue.h |   38 ++
 kernel/cgroup/cgroup.c|   21 --
 kernel/workqueue.c|   54 ++
 lib/percpu-refcount.c |2 +
 mm/hmm.c  |   12 +---
 10 files changed, 145 insertions(+), 53 deletions(-)

--
tejun


Re: [PATCH 0/2] ahci/pci-quirks: Add PCI-id for the Highpoint Rocketraid 644L card

2018-03-04 Thread Tejun Heo
On Fri, Mar 02, 2018 at 01:29:07PM -0600, Bjorn Helgaas wrote:
> On Fri, Mar 02, 2018 at 11:36:31AM +0100, Hans de Goede wrote:
> > Hi All,
> > 
> > Here are 2 patches adding support for Highpoint Rocketraid 644L cards,
> > they can be merged independently from each-other into their resp. subsys,
> > but since they belong together I'm sending them out as a series.
> 
> Seems like it'd be nice to keep these together, so Tejun, feel free to
> take both patches with my
> 
> Acked-by: Bjorn Helgaas 
> 
> on the pci/quirks.c patch.  If you'd prefer me to take them, just let
> me know.

Applied both to libata/for-4.16-fixes with ack added.

Thanks.

-- 
tejun


Re: [PATCH] pata_bk3710: clarify license version and use SPDX header

2018-03-01 Thread Tejun Heo
On Thu, Mar 01, 2018 at 12:16:00PM +0100, Bartlomiej Zolnierkiewicz wrote:
> - clarify license version (it should be GPL 2.0)
> - use SPDX header
> 
> Cc: Sekhar Nori 
> Signed-off-by: Bartlomiej Zolnierkiewicz 

Applied to libata/for-4.17.

Thanks.

-- 
tejun


Re: [PATCH] pata_falcon: clarify license version and use SPDX header

2018-03-01 Thread Tejun Heo
On Thu, Mar 01, 2018 at 12:13:35PM +0100, Bartlomiej Zolnierkiewicz wrote:
> - clarify license version (it should be GPL 2.0)
> - use SPDX header
> 
> Cc: Michael Schmitz 
> Cc: Geert Uytterhoeven 
> Signed-off-by: Bartlomiej Zolnierkiewicz 

Applied to libata/for-4.17.

Thanks.

-- 
tejun


Re: [PATCH] percpu: add a schedule point in pcpu_balance_workfn()

2018-02-23 Thread Tejun Heo
On Fri, Feb 23, 2018 at 08:12:42AM -0800, Eric Dumazet wrote:
> From: Eric Dumazet 
> 
> When a large BPF percpu map is destroyed, I have seen
> pcpu_balance_workfn() holding cpu for hundreds of milliseconds.
> 
> On KASAN config and 112 hyperthreads, average time to destroy a chunk
> is ~4 ms.
> 
> [ 2489.841376] destroy chunk 1 in 4148689 ns
> ...
> [ 2490.093428] destroy chunk 32 in 4072718 ns
> 
> Signed-off-by: Eric Dumazet 

Applied to percpu/for-4.16-fixes.

Thanks, Eric.

-- 
tejun


[PATCH cgroup/for-4.16-fixes] cgroup: fix rule checking for threaded mode switching

2018-02-21 Thread Tejun Heo
From d1897c9538edafd4ae6bbd03cc075962ddde2c21 Mon Sep 17 00:00:00 2001
From: Tejun Heo <t...@kernel.org>
Date: Wed, 21 Feb 2018 11:39:22 -0800

A domain cgroup isn't allowed to be turned threaded if its subtree is
populated or domain controllers are enabled.  cgroup_enable_threaded()
depended on cgroup_can_be_thread_root() test to enforce this rule.  A
parent which has populated domain descendants or has domain
controllers enabled can't become a thread root, so the above rules are
enforced automatically.

However, for the root cgroup which can host mixed domain and threaded
children, cgroup_can_be_thread_root() doesn't check any of those
conditions and thus first-level cgroups end up escaping those rules.

This patch fixes the bug by adding explicit checks for those rules in
cgroup_enable_threaded().

Reported-by: Michael Kerrisk (man-pages) <mtk.manpa...@gmail.com>
Signed-off-by: Tejun Heo <t...@kernel.org>
Fixes: 8cfd8147df67 ("cgroup: implement cgroup v2 thread support")
Cc: sta...@vger.kernel.org # v4.14+
---
 kernel/cgroup/cgroup.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 8cda3bc..4bfb290 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3183,6 +3183,16 @@ static int cgroup_enable_threaded(struct cgroup *cgrp)
if (cgroup_is_threaded(cgrp))
return 0;
 
+   /*
+* If @cgroup is populated or has domain controllers enabled, it
+* can't be switched.  While the below cgroup_can_be_thread_root()
+* test can catch the same conditions, that's only when @parent is
+* not mixable, so let's check it explicitly.
+*/
+   if (cgroup_is_populated(cgrp) ||
+   cgrp->subtree_control & ~cgrp_dfl_threaded_ss_mask)
+   return -EOPNOTSUPP;
+
/* we're joining the parent's domain, ensure its validity */
if (!cgroup_is_valid_domain(dom_cgrp) ||
!cgroup_can_be_thread_root(dom_cgrp))
-- 
2.9.5
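
With the check in place, the reported first-level scenario (a root-level child whose
descendant already has a member task) should now fail at the cgroup.type write itself.
A rough transcript of the expected behavior; the paths and exact shell error text are
illustrative:

  # mkdir -p /sys/fs/cgroup/unified/grp0/grp1
  # sleep 1000 &
  # echo $! > /sys/fs/cgroup/unified/grp0/grp1/cgroup.procs
  # echo threaded > /sys/fs/cgroup/unified/grp0/cgroup.type
  sh: echo: write error: Operation not supported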



Re: A "domain invalid" cgroup *can* sometimes have member tasks

2018-02-20 Thread Tejun Heo
On Tue, Feb 20, 2018 at 11:35:47AM -0800, Tejun Heo wrote:
> Hmm... nr_populated_domain_children check should have caught that
> condition and rejected it.  Will look into what's going on.

Ah, okay, I was special-casing the first level children case too
early.  If you nest the test case by one more level, it fails as
intended.  I'll fix the checks.

Thanks.

-- 
tejun


Re: [PATCH v2] libata: disable LPM for Crucial BX100 SSD 500GB drive

2018-02-20 Thread Tejun Heo
On Sun, Feb 18, 2018 at 10:17:09PM +0800, Kai-Heng Feng wrote:
> After Laptop Mode Tools starts to use min_power for LPM, a user found
> out Crucial BX100 SSD can't get mounted.
> 
> Crucial BX100 SSD 500GB drive don't work well with min_power. This also
> happens to med_power_with_dipm.
> 
> So let's disable LPM for Crucial BX100 SSD 500GB drive.
> 
> BugLink: https://bugs.launchpad.net/bugs/1726930
> Signed-off-by: Kai-Heng Feng 

Applied to libata/for-4.16-fixes.

Thanks.

-- 
tejun


Re: A "domain invalid" cgroup *can* sometimes have member tasks

2018-02-20 Thread Tejun Heo
On Tue, Feb 20, 2018 at 08:26:59PM +0100, Michael Kerrisk (man-pages) wrote:
> Hello Tejun
> 
> According to Documentation/cgroup-v2.txt, a "domain invalid" cgroup
> can't have member tasks. And indeed this is generally not permitted.
> 
> However, someone recently showed me a scenario where a "domain
> invalid" cgroup can have member processes. See the following example:
> 
> # mkdir -p /sys/fs/cgroup/unified/grp0/grp1
> # sleep 1000 &
> [1] 10549
> # echo 10549 > /sys/fs/cgroup/unified/grp0/grp1/cgroup.procs
> # echo threaded > /sys/fs/cgroup/unified/grp0/cgroup.type
> # cat /sys/fs/cgroup/unified/grp0/cgroup.type
> threaded
> # cat /sys/fs/cgroup/unified/grp0/grp1/cgroup.type
> domain invalid
> # cat /sys/fs/cgroup/unified/grp0/grp1/cgroup.threads
> 10549
> 
> From the above, we see that the cgroup grp0/grp1 is of type "domain
> invalid" and has a member thread. This seems to be a violation of the
> documented rules, and is I assume a bug, since in the above scenario,
> we are denied from adding further tasks to the grp0/grp1 cgroup:
> 
> # sleep 2000 &
> [2] 10553
> # echo 10553 > /sys/fs/cgroup/unified/grp0/grp1/cgroup.procs
> sh: echo: write error: Operation not supported
> 
> Could you comment please?

Hmm... nr_populated_domain_children check should have caught that
condition and rejected it.  Will look into what's going on.

Thanks.

-- 
tejun


Re: [PATCH v2 3/3] percpu: allow select gfp to be passed to underlying allocators

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 12:09:58PM -0600, Dennis Zhou wrote:
> The prior patch added support for passing gfp flags through to the
> underlying allocators. This patch allows users to pass along gfp flags
> (currently only __GFP_NORETRY and __GFP_NOWARN) to the underlying
> allocators. This should allow users to decide if they are ok with
> failing allocations recovering in a more graceful way.
> 
> Additionally, gfp passing was done as additional flags in the previous
> patch. Instead, change this to caller passed semantics. GFP_KERNEL is
> also removed as the default flag. It continues to be used for internally
> caused underlying percpu allocations.
> 
> V2:
> Removed gfp_percpu_mask in favor of doing it inline.
> Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp.
> 
> Signed-off-by: Dennis Zhou 
> Suggested-by: Daniel Borkmann 

Applied 1-3 to percpu/for-4.16-fixes.

Thanks, Dennis.

-- 
tejun


Re: [PATCH] libata: Apply NOLPM quirk to Crucial MX100 512GB SSDs

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 10:48:20AM +0100, Hans de Goede wrote:
> Various people have reported the Crucial MX100 512GB model not working
> with LPM set to min_power. I've now received a report that it also does
> not work with the new med_power_with_dipm level.
> 
> It does work with medium_power, but that has no measurable power-savings
> and given the amount of people being bitten by the other levels not
> working, this commit just disables LPM altogether.
> 
> Note all reporters of this have either the 512GB model (max capacity), or
> are not specifying their SSD's size. So for now this quirk assumes this is
> a problem with the 512GB model only.
> 
> Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=89261
> Buglink: https://github.com/linrunner/TLP/issues/84
> Cc: sta...@vger.kernel.org
> Signed-off-by: Hans de Goede 

Applied to libata/for-4.16-fixes.

Thanks.

-- 
tejun


Re: [PATCH 1/2] pata_it821x: Delete an error message for a failed memory allocation in it821x_firmware_command()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 03:22:18PM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 14:04:49 +0100
> 
> Omit an extra message for a memory allocation failure in this function.
> 
> This issue was detected by using the Coccinelle software.
> 
> Signed-off-by: Markus Elfring 

Applying this one to libata/for-4.17 but skipping the next one.
There's no point in making these trivial changes.  It's just churn
without actual benefits but only risks.

Thanks.

-- 
tejun


Re: [PATCH 1/3] pata_macio: Delete an error message for a failed memory allocation in two functions

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 01:38:20PM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 13:01:45 +0100
> 
> Omit an extra message for a memory allocation failure in these functions.
> 
> This issue was detected by using the Coccinelle software.
> 
> Signed-off-by: Markus Elfring 

Applying this one but not the other two.

Thanks.

-- 
tejun


Re: [PATCH] pata_mpc52xx: Delete an error message for a failed memory allocation in mpc52xx_ata_probe()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 11:44:13AM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 11:34:53 +0100
> 
> Omit an extra message for a memory allocation failure in this function.
> 
> This issue was detected by using the Coccinelle software.
> 
> Signed-off-by: Markus Elfring 

Applied to libata/for-4.17.

Thanks.

-- 
tejun


Re: [PATCH] sata_dwc_460ex: Delete an error message for a failed memory allocation in sata_dwc_port_start()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 11:39:13AM +0200, Andy Shevchenko wrote:
> On Thu, 2018-02-15 at 22:22 +0100, SF Markus Elfring wrote:
> > From: Markus Elfring 
> > Date: Thu, 15 Feb 2018 22:15:25 +0100
> > 
> > Omit an extra message for a memory allocation failure in this
> > function.
> > 
> > This issue was detected by using the Coccinelle software.
> 
> Tejun, it's up to you. In case you are fine with the change,

Yeah, these are marginal at best and one may be able to argue that
emitting a message with device identifier is still useful but these
allocations are order-zero and can't fail to begin with, so...

Thanks.

-- 
tejun


Re: [PATCH] pata_samsung_cf: Delete an error message for a failed memory allocation in pata_s3c_probe()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 11:08:18AM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 10:55:51 +0100
> 
> Omit an extra message for a memory allocation failure in this function.
> 
> This issue was detected by using the Coccinelle software.
> 
> Signed-off-by: Markus Elfring 

Applied to libata/for-4.17.

Thanks.

-- 
tejun


Re: [PATCH 3/3] pata_arasan_cf: Move two variable assignments in arasan_cf_probe()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 05:00:11PM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 16:42:26 +0100
> 
> Move assignments for the local variables "irq_handler" and "pdata"
> so that their setting will only be performed after a call
> of the function "devm_kzalloc" succeeded by this function.
> Thus adjust a corresponding if statement.

I'm not gonna apply this.  If there's something preventing the
compiler from optimizing it, the right thing to do is fixing that.

Thanks.

-- 
tejun


Re: [PATCH 2/3] pata_arasan_cf: Delete an unnecessary variable initialisation in arasan_cf_probe()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 04:59:01PM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 16:28:23 +0100
> 
> The local variable "ret" will eventually be set to an appropriate value
> a bit later. Thus omit the explicit initialisation at the beginning.
> 
> Signed-off-by: Markus Elfring 

Applied 1-2 to libata/for-4.17.

Thanks.

-- 
tejun


Re: [PATCH] libata-scsi: Delete an unnecessary variable in ata_scsi_slave_config()

2018-02-18 Thread Tejun Heo
On Fri, Feb 16, 2018 at 06:26:30PM +0100, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Fri, 16 Feb 2018 18:19:13 +0100
> 
> * Return a result code without storing it in an intermediate variable.
> 
> * Reduce the needed source code.
> 
> Signed-off-by: Markus Elfring 
> ---
>  drivers/ata/libata-scsi.c | 7 +--
>  1 file changed, 1 insertion(+), 6 deletions(-)
> 
> diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
> index 66be961c93a4..93456cff051b 100644
> --- a/drivers/ata/libata-scsi.c
> +++ b/drivers/ata/libata-scsi.c
> @@ -1345,14 +1345,9 @@ int ata_scsi_slave_config(struct scsi_device *sdev)
>  {
>   struct ata_port *ap = ata_shost_to_port(sdev->host);
>   struct ata_device *dev = __ata_scsi_find_dev(ap, sdev);
> - int rc = 0;
>  
>   ata_scsi_sdev_config(sdev);
> -
> - if (dev)
> - rc = ata_scsi_dev_config(sdev, dev);
> -
> - return rc;
> + return dev ? ata_scsi_dev_config(sdev, dev) : 0;

Let's do the following instead.

if (dev)
return ata_scsi..;
else
return 0;
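
Spelled out, the function with that shape applied would read roughly as follows (a
reconstruction from the quoted hunk, not a tested patch):

int ata_scsi_slave_config(struct scsi_device *sdev)
{
	struct ata_port *ap = ata_shost_to_port(sdev->host);
	struct ata_device *dev = __ata_scsi_find_dev(ap, sdev);

	ata_scsi_sdev_config(sdev);

	if (dev)
		return ata_scsi_dev_config(sdev, dev);
	else
		return 0;
}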

Thanks.

-- 
tejun


Re: [PATCH 3/3] percpu: allow select gfp to be passed to underlying allocators

2018-02-15 Thread Tejun Heo
Hello,

On Thu, Feb 15, 2018 at 10:08:16AM -0600, Dennis Zhou wrote:
> +/* the whitelisted flags that can be passed to the backing allocators */
> +#define gfp_percpu_mask(gfp) (((gfp) & (__GFP_NORETRY | __GFP_NOWARN)) | \
> +   GFP_KERNEL)

Isn't there just one place where gfp comes in from outside?  If so,
this looks like a bit of overkill.  Can't we just filter there?

Thanks.

-- 
tejun


Re: [PATCH 2/3] percpu: add __GFP_NORETRY semantics to the percpu balancing path

2018-02-15 Thread Tejun Heo
Hello,

On Thu, Feb 15, 2018 at 10:08:15AM -0600, Dennis Zhou wrote:
> -static struct pcpu_chunk *pcpu_create_chunk(void)
> +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
>  {
>   const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
>   struct pcpu_chunk *chunk;
>   struct page *pages;
>   int i;
>  
> - chunk = pcpu_alloc_chunk();
> + chunk = pcpu_alloc_chunk(gfp);
>   if (!chunk)
>   return NULL;
>  
> - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
> + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));

Is there a reason to set GFP_KERNEL in this function?  I'd prefer
pushing this to the callers.

> diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
> index 9158e5a..ea9906a 100644
> --- a/mm/percpu-vm.c
> +++ b/mm/percpu-vm.c
> @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void)
>   lockdep_assert_held(&pcpu_alloc_mutex);
>  
>   if (!pages)
> - pages = pcpu_mem_zalloc(pages_size);
> + pages = pcpu_mem_zalloc(pages_size, 0);
  
  because this is confusing
>  static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
> - struct page **pages, int page_start, int page_end)
> + struct page **pages, int page_start, int page_end,
> + gfp_t gfp)
>  {
> - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
>   unsigned int cpu, tcpu;
>   int i;
>  
> + gfp |=  GFP_KERNEL | __GFP_HIGHMEM;
  ^^
  double space

So, setting __GFP_HIGHMEM unconditionally here makes sense because
it's indicating the types of pages we can use (we also accept high
pages); however, I'm not sure GFP_KERNEL makes sense.  That's about
"how to allocate" and looks like it should be left to the caller.

Thanks.

-- 
tejun
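
To make the "what pages" versus "how to allocate" distinction above concrete, a
hypothetical wrapper (not code proposed in this thread) would hard-code only the
page-type constraint and take the allocation-behavior flags from its caller:

#include <linux/gfp.h>

/* gfp_how carries caller policy such as GFP_KERNEL | __GFP_NORETRY */
static struct page *example_alloc(gfp_t gfp_how)
{
	/* __GFP_HIGHMEM only widens which pages are acceptable */
	return alloc_page(gfp_how | __GFP_HIGHMEM);
}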


Re: [PATCH 3/3] percpu: Remove inert tracepoint in __init code

2018-02-15 Thread Tejun Heo
On Wed, Feb 14, 2018 at 10:40:43AM -0600, Josh Poimboeuf wrote:
> The jump_label code doesn't patch init code, so this tracepoint can
> never be enabled.  Remove it.
> 
> Cc: Tejun Heo <t...@kernel.org>
> Signed-off-by: Josh Poimboeuf <jpoim...@redhat.com>

Applied to percpu/for-4.17.

Thanks.

-- 
tejun


Re: x86/stack protector: X86_32_LAZY_GS=n hangs kernel on old processors

2018-02-13 Thread Tejun Heo
Hello,

(cc'ing lkml as requested)

On Tue, Feb 13, 2018 at 11:40:17AM -0500, tedheadster wrote:
>   in your patch "x86: make lazy %gs optional on x86_32" were you able
> to test it on really old processors? In 4.16.0-rc1, X86_32_LAZY_GS got
> toggled from 'y' to 'n' in my default config because of changes to the
> stack protector code. It hangs my ancient i486 test machine right
> after 'Booting the kernel'.

I didn't have access to an i486 at the time or ever since, so it
wasn't tested there.  If this is specific to i486, flagging it so in
the config probably isn't the end of the world at this point.

Thanks.

-- 
tejun


[PATCH] tty: make n_tty_read() always abort if hangup is in progress

2018-02-13 Thread Tejun Heo
lled.
   */
  sigaction(SIGHUP, , NULL);
  printf("Child reading tty\n");
  while (1) {
  char buf[1024];

  if (read(fd, buf, sizeof(buf)) < 0) {
      perror("read");
  return 1;
  }
  }

  return 0;
  }


Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Alan Cox <alan@llwyncelyn.cymru>
Cc: sta...@vger.kernel.org
---
Hello,

This patch was initially posted several months ago as a sort of RFC.

 http://lkml.kernel.org/r/20171101020651.gk3252...@devbig577.frc2.facebook.com

IIUC Alan's explanation correctly, this workaround actually seems like
the right thing to do, so I refreshed the patch, added some comments
and updated the patch description accordingly.

We've been running this in the fleet for the past couple months to
avoid the process hang issues and it hasn't caused any issues.  Greg,
can you please pick it up?

Thanks.

 drivers/tty/n_tty.c  |6 ++
 drivers/tty/tty_io.c |9 +
 include/linux/tty.h  |1 +
 3 files changed, 16 insertions(+)

diff --git a/drivers/tty/n_tty.c b/drivers/tty/n_tty.c
index 5c0e59e..cbe98bc 100644
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -2180,6 +2180,12 @@ static ssize_t n_tty_read(struct tty_struct *tty, struct file *file,
}
if (tty_hung_up_p(file))
break;
+   /*
+* Abort readers for ttys which never actually
+* get hung up.  See __tty_hangup().
+*/
+   if (test_bit(TTY_HUPPING, &tty->flags))
+   break;
if (!timeout)
break;
if (file->f_flags & O_NONBLOCK) {
diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c
index eb9133b..63114ea 100644
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -586,6 +586,14 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session)
return;
}
 
+   /*
+* Some console devices aren't actually hung up for technical and
+* historical reasons, which can lead to indefinite interruptible
+* sleep in n_tty_read().  The following explicitly tells
+* n_tty_read() to abort readers.
+*/
+   set_bit(TTY_HUPPING, &tty->flags);
+
/* inuse_filps is protected by the single tty lock,
   this really needs to change if we want to flush the
   workqueue with the lock held */
@@ -640,6 +648,7 @@ static void __tty_hangup(struct tty_struct *tty, int exit_session)
 * from the ldisc side, which is now guaranteed.
 */
set_bit(TTY_HUPPED, &tty->flags);
+   clear_bit(TTY_HUPPING, &tty->flags);
tty_unlock(tty);
 
if (f)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 0a6c71e..47f8af2 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -364,6 +364,7 @@ struct tty_file_private {
 #define TTY_PTY_LOCK   16  /* pty private */
#define TTY_NO_WRITE_SPLIT 17  /* Preserve write boundaries to driver */
#define TTY_HUPPED 18  /* Post driver->hangup() */
+#define TTY_HUPPING 19  /* Hangup in progress */
 #define TTY_LDISC_HALTED   22  /* Line discipline is halted */
 
 /* Values for tty->flow_change */


Re: linux-next: build warning after merge of the libata tree

2018-02-13 Thread Tejun Heo
On Tue, Feb 13, 2018 at 02:04:49PM +1100, Stephen Rothwell wrote:
> Hi Tejun,
> 
> After merging the libata tree, today's linux-next build (arm
> multi_v7_defconfig) produced this warning:
> 
> drivers/ata/sata_rcar.c: In function 'sata_rcar_init_controller':
> drivers/ata/sata_rcar.c:821:16: warning: unused variable 'base' 
> [-Wunused-variable]
>   void __iomem *base = priv->base;
> ^~~~
> 
> Introduced by commit
> 
>   da77d76b95a0 ("sata_rcar: Reset SATA PHY when Salvator-X board resumes")

Just applied Geert's patch to remove the warning.

Thanks.

-- 
tejun


Re: [PATCH] libata: update documentation for sysfs interfaces

2018-02-13 Thread Tejun Heo
On Tue, Feb 13, 2018 at 01:48:16PM +0530, Aishwarya Pant wrote:
> Documentation has been added by parsing through git commit history and
> reading code. This might be useful for scripting and tracking changes in
> the ABI.
> 
> I do not have complete descriptions for the following 3 attributes; they
> have been annotated with the comment [to be documented] -
> 
>   /sys/class/scsi_host/hostX/ahci_port_cmd
>   /sys/class/scsi_host/hostX/ahci_host_caps
>   /sys/class/scsi_host/hostX/ahci_host_cap2
> 
> Signed-off-by: Aishwarya Pant 

Applied to libata/for-4.16-fixes.

Thanks.

-- 
tejun


Re: [PATCH] libata: transport: cleanup documentation of sysfs interface

2018-02-13 Thread Tejun Heo
On Tue, Feb 13, 2018 at 04:51:59PM +0530, Aishwarya Pant wrote:
> Clean-up the documentation of sysfs interfaces to be in the same format
> as described in Documentation/ABI/README. This will be useful for
> tracking changes in the ABI. Attributes are grouped by function (device,
> link or port) and then by date added.
> 
> This patch also adds documentation for one attribute -
> /sys/class/ata_port/ataX/port_no
> 
> Signed-off-by: Aishwarya Pant 

Applied to libata/for-4.16-fixes.

Thanks.

-- 
tejun


[PATCH for-4.17] percpu: add Dennis Zhou as a percpu co-maintainer

2018-02-12 Thread Tejun Heo
Dennis rewrote the percpu area allocator some months ago, understands
most of the code base and has been responsive with the bug reports and
questions.  Let's add him as a co-maintainer.

Signed-off-by: Tejun Heo <t...@kernel.org>
---
Imma apply this to percpu/for-4.17.  If there's any objection, please
let me know.

Thanks.

 MAINTAINERS |1 +
 1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 3bdc260..f384e22 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10832,6 +10832,7 @@ F:  drivers/platform/x86/peaq-wmi.c
 PER-CPU MEMORY ALLOCATOR
 M: Tejun Heo <t...@kernel.org>
 M: Christoph Lameter <c...@linux.com>
+M: Dennis Zhou <dennissz...@gmail.com>
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu.git
 S: Maintained
 F: include/linux/percpu*.h


Re: lost connection to test machine (4)

2018-02-12 Thread Tejun Heo
On Mon, Feb 12, 2018 at 09:03:25AM -0800, Tejun Heo wrote:
> Hello, Daniel.
> 
> On Mon, Feb 12, 2018 at 06:00:13PM +0100, Daniel Borkmann wrote:
> > [ +Dennis, +Tejun ]
> > 
> > Looks like we're stuck in percpu allocator with key/value size of 4 bytes
> > each and large number of entries (max_entries) in the reproducer in above
> > link.
> > 
> > Could we have some __GFP_NORETRY semantics and let allocations fail instead
> > of triggering OOM killer?
> 
> For some part, maybe, but not generally.  The virt area allocation
> goes down to page table allocation which is hard coded to use
> GFP_KERNEL in arch mm code.

So, the following should convert the majority of allocations to use
__GFP_NORETRY.  It doesn't catch everything but should significantly
lower the probability of hitting this and put this on the same footing
as vmalloc.  Can you see whether this is enough?
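
For context, a minimal hypothetical sketch (not part of the patch) of what
__GFP_NORETRY buys a caller: the page allocator fails early instead of
retrying hard and invoking the OOM killer, so the caller has to tolerate
NULL and degrade gracefully.

#include <linux/gfp.h>

/* Illustration only: opportunistic allocation that returns NULL under
 * memory pressure instead of triggering the OOM killer. */
static struct page *try_alloc_page_noretry(void)
{
	return alloc_page(GFP_KERNEL | __GFP_NORETRY);
}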

Note that this patch isn't upstreamable.  We definitely want to
restrict this to the rebalance path, but it should be good enough for
testing.

Thanks.

diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9158e5a..0b4739f 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
struct page **pages, int page_start, int page_end)
 {
-   const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM;
+   const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_NORETRY;
unsigned int cpu, tcpu;
int i;
 


Re: [PATCH v4.15-rc9] sched, cgroup: Don't reject lower cpu.max on ancestors

2018-02-12 Thread Tejun Heo
On Mon, Jan 22, 2018 at 11:26:18AM -0800, Tejun Heo wrote:
> While adding cgroup2 interface for the cpu controller, 0d5936344f30
> ("sched: Implement interface for cgroup unified hierarchy") forgot to
> update input validation and left it to reject cpu.max config if any
> descendant has set a higher value.
> 
> cgroup2 officially supports delegation and a descendant must not be
> able to restrict what its ancestors can configure.  For absolute
> limits such as cpu.max and memory.max, this means that the config at
> each level should only act as the upper limit at that level and
> shouldn't interfere with what other cgroups can configure.
> 
> This patch updates config validation on cgroup2 so that the cpu
> controller follows the same convention.
> 
> Signed-off-by: Tejun Heo <t...@kernel.org>
> Fixes: 0d5936344f30 ("sched: Implement interface for cgroup unified 
> hierarchy")

Applied to cgroup/for-4.16-fixes.

Thanks.

-- 
tejun


Re: [PATCH] libata: fix length validation of ATAPI-relayed SCSI commands

2018-02-12 Thread Tejun Heo
On Sat, Feb 03, 2018 at 08:30:56PM -0800, Eric Biggers wrote:
> From: Eric Biggers 
> 
> syzkaller reported a crash in ata_bmdma_fill_sg() when writing to
> /dev/sg1.  The immediate cause was that the ATA command's scatterlist
> was not DMA-mapped, which causes 'pi - 1' to underflow, resulting in a
> write to 'qc->ap->bmdma_prd[0x]'.
> 
> Strangely though, the flag ATA_QCFLAG_DMAMAP was set in qc->flags.  The
> root cause is that when __ata_scsi_queuecmd() is preparing to relay a
> SCSI command to an ATAPI device, it doesn't correctly validate the CDB
> length before copying it into the 16-byte buffer 'cdb' in 'struct
> ata_queued_cmd'.  Namely, it validates the fixed CDB length expected
> based on the SCSI opcode but not the actual CDB length, which can be
> larger due to the use of the SG_NEXT_CMD_LEN ioctl.  Since 'flags' is
> the next member in ata_queued_cmd, a buffer overflow corrupts it.
> 
> Fix it by requiring that the actual CDB length be <= 16 (ATAPI_CDB_LEN).
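
A hedged, illustrative sketch of the bound the fix describes (the actual
patch may structure the check differently): the relayed CDB has to fit the
16-byte qc->cdb buffer, so the actual command length, not just the fixed
length implied by the opcode, must be validated.

#include <linux/ata.h>		/* ATAPI_CDB_LEN == 16 */
#include <scsi/scsi_cmnd.h>	/* struct scsi_cmnd */

/* Hypothetical helper: reject CDBs that would overflow qc->cdb. */
static bool example_cdb_len_ok(const struct scsi_cmnd *scmd)
{
	return scmd->cmd_len <= ATAPI_CDB_LEN;
}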

Applied to libata/for-4.16-fixes.

Thanks.

-- 
tejun


Re: [PATCH 1/5] workqueue: Allow retrieval of current task's work struct

2018-02-12 Thread Tejun Heo
Hello,

On Sun, Feb 11, 2018 at 10:38:28AM +0100, Lukas Wunner wrote:
> Introduce a helper to retrieve the current task's work struct if it is
> a workqueue worker.
> 
> This allows us to fix a long-standing deadlock in several DRM drivers
> wherein the ->runtime_suspend callback waits for a specific worker to
> finish and that worker in turn calls a function which waits for runtime
> suspend to finish.  That function is invoked from multiple call sites
> and waiting for runtime suspend to finish is the correct thing to do
> except if it's executing in the context of the worker.
> 
> Cc: Tejun Heo <t...@kernel.org>
> Cc: Lai Jiangshan <jiangshan...@gmail.com>
> Cc: Dave Airlie <airl...@redhat.com>
> Cc: Ben Skeggs <bske...@redhat.com>
> Cc: Alex Deucher <alexander.deuc...@amd.com>
> Signed-off-by: Lukas Wunner <lu...@wunner.de>

I wonder whether it's too generic a name but there are other functions
named in a similar fashion and AFAICS current_work isn't used by
anyone in the tree, so it seems okay.
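
A minimal usage sketch, with a hypothetical driver struct and work item,
of how the helper is meant to break the cycle described above:

#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/workqueue.h>

/* Hypothetical driver state, for illustration only. */
struct my_drv {
	struct device *dev;
	struct work_struct hotplug_work;
};

static void my_drv_touch_hw(struct my_drv *drv)
{
	/* If we're running from the very worker that ->runtime_suspend
	 * waits on, waiting for runtime resume here would deadlock. */
	if (current_work() != &drv->hotplug_work)
		pm_runtime_get_sync(drv->dev);

	/* ... access hardware ... */
}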

Acked-by: Tejun Heo <t...@kernel.org>

Please feel free to route as you see fit.

Thanks.

-- 
tejun


Re: lost connection to test machine (4)

2018-02-12 Thread Tejun Heo
Hello, Daniel.

On Mon, Feb 12, 2018 at 06:00:13PM +0100, Daniel Borkmann wrote:
> [ +Dennis, +Tejun ]
> 
> Looks like we're stuck in percpu allocator with key/value size of 4 bytes
> each and large number of entries (max_entries) in the reproducer in above
> link.
> 
> Could we have some __GFP_NORETRY semantics and let allocations fail instead
> of triggering OOM killer?

For some part, maybe, but not generally.  The virt area allocation
goes down to page table allocation which is hard coded to use
GFP_KERNEL in arch mm code.

Thanks.

-- 
tejun


Re: [PATCH] Fix compile warning with ATA_DEBUG enabled

2018-02-12 Thread Tejun Heo
On Fri, Jan 26, 2018 at 11:21:49AM +0800, dongbo (E) wrote:
> From: Dong Bo 
> 
> This fixes the following compile warnings with ATA_DEBUG enabled,
> which are detected by Linaro GCC 5.2-2015.11:
> 
> In file included from ./include/linux/printk.h:7:0,
>  from ./include/linux/kernel.h:14,
>  from ./include/asm-generic/bug.h:16,
>  from ./arch/arm64/include/asm/bug.h:37,
>  from ./include/linux/bug.h:5,
>  from ./include/linux/mmdebug.h:5,
>  from ./include/linux/gfp.h:5,
>  from ./include/linux/slab.h:15,
>  from drivers/ata/libata-scsi.c:36:
> drivers/ata/libata-scsi.c: In function 'ata_scsi_dump_cdb':
> ./include/linux/kern_levels.h:5:18: warning: format '%d' expects
> argument of type 'int', but argument 6 has type 'u64 {aka long
>  long unsigned int}' [-Wformat=]
>  #define KERN_SOH "\001"  /* ASCII Start Of Header */
>   ^
> ./include/linux/kern_levels.h:11:18: note: in expansion of macro
> 'KERN_SOH'
>  #define KERN_ERR KERN_SOH "3" /* error conditions */
>   ^
> ./include/linux/libata.h:65:38: note: in expansion of macro 'KERN_ERR'
>  #define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt,
>   __func__, ## args)
>   ^
> drivers/ata/libata-scsi.c:4284:2: note: in expansion of macro 'DPRINTK'
>   DPRINTK("CDB (%u:%d,%d,%d) %9ph\n",
>   ^
> 
> Signed-off-by: Dong Bo 
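
For reference, a hedged sketch of the usual fix for this class of -Wformat
warning (not necessarily the exact change applied here): a u64 wants %llu
with an explicit cast, not %d.

#include <linux/printk.h>
#include <linux/types.h>

/* Illustration only: printing a 64-bit value without a format warning. */
static void example_print_lun(u64 lun)
{
	pr_err("lun=%llu\n", (unsigned long long)lun);
}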

Patch didn't apply, probably because of the trailing part.  Hand-applied
to libata/for-4.16-fixes and trimmed the description a bit.

Thanks.

-- 
tejun

