Re: [PATCH] block: fix io hung by block throttle
On 4/18/21 11:09 PM, Junxiao Bi wrote: - finish_wait(&rqw->wait, &data.wq); + mutex_lock(&rqw->throttle_mutex); + wait_event(rqw->wait, acquire_inflight_cb(rqw, private_data)); + mutex_unlock(&rqw->throttle_mutex); This will break the throttle? There is an inflight io limitation. With this change, there can be only one io inflight whatever the limit is. Sorry, ignore this. I should have been asleep at that time. Thanks, Junxiao.
Re: [PATCH] block: fix io hung by block throttle
On 4/18/21 5:33 AM, Hillf Danton wrote: On Sat, 17 Apr 2021 14:37:57 Junxiao Bi wrote: On 4/17/21 3:10 AM, Hillf Danton wrote: + if (acquire_inflight_cb(rqw, private_data)) This function is to increase atomic variable rq_wait->inflight. You are right. What's the mutex for? It cuts the race between we peek at the sleepers on rqw->wait while they are coming and going, and we cant update rqw->inflight without making sure there are no sleepers. Why? I think checking the sleeper in original code is for a fast path. For wbt, acquire_inflight_cb is wbt_inflight_cb where atomic_inc_below is used to update rqw->inflight. I don't see why a mutex is needed for this atomic operation. With the mutex in place, in addition to the certainty of !sleepers, we can avoid the race between us and waker in terms of updating inflight by removing the invokation of acquire_inflight_cb in the wakeup callback, and the bonus is we no longer need the wakeup cb and the rq_qos_wait_data because the more traditional wait_event() can do the job. Finally we can dump the cleanup_cb_t. +++ b/block/blk-rq-qos.c @@ -200,96 +200,24 @@ bool rq_depth_scale_down(struct rq_depth return true; } -struct rq_qos_wait_data { - struct wait_queue_entry wq; - struct task_struct *task; - struct rq_wait *rqw; - acquire_inflight_cb_t *cb; - void *private_data; - bool got_token; -}; - -static int rq_qos_wake_function(struct wait_queue_entry *curr, - unsigned int mode, int wake_flags, void *key) -{ - struct rq_qos_wait_data *data = container_of(curr, -struct rq_qos_wait_data, -wq); - - /* -* If we fail to get a budget, return -1 to interrupt the wake up loop -* in __wake_up_common. 
-*/ - if (!data->cb(data->rqw, data->private_data)) - return -1; - - data->got_token = true; - smp_wmb(); - list_del_init(>entry); - wake_up_process(data->task); - return 1; -} - /** * rq_qos_wait - throttle on a rqw if we need to * @rqw: rqw to throttle on * @private_data: caller provided specific data * @acquire_inflight_cb: inc the rqw->inflight counter if we can - * @cleanup_cb: the callback to cleanup in case we race with a waker * * This provides a uniform place for the rq_qos users to do their throttling. * Since you can end up with a lot of things sleeping at once, this manages the * waking up based on the resources available. The acquire_inflight_cb should * inc the rqw->inflight if we have the ability to do so, or return false if not * and then we will sleep until the room becomes available. - * - * cleanup_cb is in case that we race with a waker and need to cleanup the - * inflight count accordingly. */ void rq_qos_wait(struct rq_wait *rqw, void *private_data, -acquire_inflight_cb_t *acquire_inflight_cb, -cleanup_cb_t *cleanup_cb) +acquire_inflight_cb_t *acquire_inflight_cb) { - struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, - .rqw = rqw, - .cb = acquire_inflight_cb, - .private_data = private_data, - }; - bool has_sleeper; - - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) - return; - - prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); - do { - /* The memory barrier in set_task_state saves us here. */ - if (data.got_token) - break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(>wait, ); - - /* -* We raced with wbt_wake_function() getting a token, -* which means we now have two. Put our local token -* and wake anyone else potentially waiting for one. 
-*/ - smp_rmb(); - if (data.got_token) - cleanup_cb(rqw, private_data); - break; - } - io_schedule(); - has_sleeper = true; - set_current_state(TASK_UNINTERRUPTIBLE); - } while (1); - finish_wait(>wait, ); + mutex_lock(>throttle_mutex); + wait_event(rqw->wait, acquire_inflight_cb(rqw, private_data)); + mutex_unlock(>throttle_mutex); This will break the throttle? Th
Re: [PATCH] block: fix io hung by block throttle
On 4/17/21 3:10 AM, Hillf Danton wrote: --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,17 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + if (!wq_has_sleeper(>wait) + && acquire_inflight_cb(rqw, private_data)) return; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); Simply removing !has_sleeper is not enough if it is mandatory before acquire_inflight_cb() without adding something like a mutex to sieve the concurrent sleepers out, see below. --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,18 @@ void rq_qos_wait(struct rq_wait *rqw, vo .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) - return; + mutex_lock(>mutex); + + if (acquire_inflight_cb(rqw, private_data)) This function is to increase atomic variable rq_wait->inflight. What's the mutex for? Thanks, Junxiao. + goto out; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* @@ -286,10 +285,11 @@ void rq_qos_wait(struct rq_wait *rqw, vo break; } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); +out: + mutex_unlock(>mutex); }
Re: [PATCH] block: fix io hung by block throttle
On 4/14/21 9:11 PM, Hillf Danton wrote: On Wed, 14 Apr 2021 14:18:30 Junxiao Bi wrote: There is a race bug which can cause io hung when multiple processes run parallel in rq_qos_wait(). Let assume there were 4 processes P1/P2/P3/P4, P1/P2 were at the entry of rq_qos_wait, and P3/P4 were waiting for io done, 2 io were inflight, the inflight io limit was 2. See race below. void rq_qos_wait() { ... bool has_sleeper; >>>> P3/P4 were in sleeper list, has_sleeper was true for both P1 and P2. has_sleeper = wq_has_sleeper(>wait); if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; >>>> 2 inflight io done, P3/P4 were waken up to issue 2 new io. >>>> 2 new io done, no inflight io. >>>> P1/P2 were added to the sleeper list, 2 entry in the list prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); >>>> P1/P2 were in the sleeper list, has_sleeper was true for P1/P2. has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); break; } >>>> P1/P2 hung here forever. New io requests will also hung here. 
io_schedule(); has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); } Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- block/blk-rq-qos.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..04d888c99bc0 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,17 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + if (!wq_has_sleeper(>wait) + && acquire_inflight_cb(rqw, private_data)) return; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* @@ -286,7 +284,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, break; } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); -- 2.24.3 (Apple Git-128) No wakeup may cause the hang. --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -287,7 +287,8 @@ void rq_qos_wait(struct rq_wait *rqw, vo } io_schedule(); has_sleeper = true; - set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(>wait, , + TASK_UNINTERRUPTIBLE); From rq_qos_wake_function(), the process can be waken up and removed from the sleeper list only when it get the budget. Looks not necessary to re-add it to sleeper list again. Thanks, Junxiao. } while (1); finish_wait(>wait, ); }
[PATCH] block: fix io hung by block throttle
There is a race bug which can cause io hung when multiple processes run parallel in rq_qos_wait(). Let assume there were 4 processes P1/P2/P3/P4, P1/P2 were at the entry of rq_qos_wait, and P3/P4 were waiting for io done, 2 io were inflight, the inflight io limit was 2. See race below. void rq_qos_wait() { ... bool has_sleeper; >>>> P3/P4 were in sleeper list, has_sleeper was true for both P1 and P2. has_sleeper = wq_has_sleeper(>wait); if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; >>>> 2 inflight io done, P3/P4 were waken up to issue 2 new io. >>>> 2 new io done, no inflight io. >>>> P1/P2 were added to the sleeper list, 2 entry in the list prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); >>>> P1/P2 were in the sleeper list, has_sleeper was true for P1/P2. has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); break; } >>>> P1/P2 hung here forever. New io requests will also hung here. 
io_schedule(); has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); } Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- block/blk-rq-qos.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..04d888c99bc0 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,17 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + if (!wq_has_sleeper(>wait) + && acquire_inflight_cb(rqw, private_data)) return; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* @@ -286,7 +284,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, break; } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); -- 2.24.3 (Apple Git-128)
Re: Race condition in Kernel
On 3/24/21 5:37 PM, Ming Lei wrote: On Wed, Mar 24, 2021 at 12:37:03PM +, Gulam Mohamed wrote: Hi All, We are facing a stale link (of the device) issue during the iscsi-logout process if we use parted command just before the iscsi logout. Here are the details: As part of iscsi logout, the partitions and the disk will be removed. The parted command, used to list the partitions, will open the disk in RW mode which results in systemd-udevd re-reading the partitions. This will trigger the rescan partitions which will also delete and re-add the partitions. So, both iscsi logout processing and the parted (through systemd-udevd) will be involved in add/delete of partitions. In our case, the following sequence of operations happened (the iscsi device is /dev/sdb with partition sdb1): 1. sdb1 was removed by PARTED 2. kworker, as part of iscsi logout, couldn't remove sdb1 as it was already removed by PARTED 3. sdb1 was added by parted 4. sdb was NOW removed as part of iscsi logout (the last part of the device removal after remoing the partitions) Since the symlink /sys/class/block/sdb1 points to /sys/class/devices/platform/hostx/sessionx/targetx:x:x:x/x:x:x:x/block/sdb/sdb1 and since sdb is already removed, the symlink /sys/class/block/sdb1 will be orphan and stale. So, this stale link is a result of the race condition in kernel between the systemd-udevd and iscsi-logout processing as described above. We are able to reproduce this even with latest upstream kernel. We have come across a patch from Ming Lei which was created for "avoid to drop & re-add partitions if partitions aren't changed": https://lore.kernel.org/linux-block/20210216084430.ga23...@lst.de/T/ BTW, there is a newer version of this patchset: https://lore.kernel.org/linux-block/20210224081825.ga1...@lst.de/#r This patch could resolve our problem of stale link but it just seems to be a work-around and not the actual fix for the race. We were looking for help to fix this race in kernel. 
Do you have any idea how to fix this race condition? IMO, that isn't a work-around, kernel shouldn't drop partitions if the partition table isn't changed. But Christoph thought the current approach has been taken since the beginning of the kernel, and he suggested fixing systemd-udev. This is a real kernel bug. Whatever BLKRRPART does, it should not cause this sysfs stale link issue. After this issue happens, there is no way to remove that stale link except reboot. The situation is even worse when logging a new disk back in: since it will reuse the disk number of the old one, it will fail when it creates the symbolic link because the stale link is still there. Thanks, Junxiao. Thanks, Ming
Re: [PATCH RFC 0/8] dcache: increase poison resistance
Hi Konstantin, How would you like to proceed with this patch set? This patchset as it is already fixed the customer issue we faced, it will stop memory fragmentation causing by negative dentry and no performance regression through our test. In production workload, it is common that some app kept creating and removing tmp files, this will leave a lot of negative dentry over time, some time later, it will cause memory fragmentation and system run into memory compaction and not responsible. It will be good to push it to upstream merge. If you are busy, we can try push it again. Thanks, Junxiao. On 12/14/20 3:10 PM, Junxiao Bi wrote: On 12/13/20 11:43 PM, Konstantin Khlebnikov wrote: On Sun, Dec 13, 2020 at 9:52 PM Junxiao Bi <mailto:junxiao...@oracle.com>> wrote: On 12/11/20 11:32 PM, Konstantin Khlebnikov wrote: > On Thu, Dec 10, 2020 at 2:01 AM Junxiao Bi mailto:junxiao...@oracle.com> > <mailto:junxiao...@oracle.com <mailto:junxiao...@oracle.com>>> wrote: > > Hi Konstantin, > > We tested this patch set recently and found it limiting negative > dentry > to a small part of total memory. And also we don't see any > performance > regression on it. Do you have any plan to integrate it into > mainline? It > will help a lot on memory fragmentation issue causing by dentry slab, > there were a lot of customer cases where sys% was very high since > most > cpu were doing memory compaction, dentry slab was taking too much > memory > and nearly all dentry there were negative. > > > Right now I don't have any plans for this. I suspect such problems will > appear much more often since machines are getting bigger. > So, somebody will take care of it. We already had a lot of customer cases. It made no sense to leave so many negative dentry in the system, it caused memory fragmentation and not much benefit. Dcache could grow so big only if the system lacks of memory pressure. 
Simplest solution is a cronjob which provinces such pressure by creating sparse file on disk-based fs and then reading it. This should wash away all inactive caches with no IO and zero chance of oom. Sound good, will try. > > First part which collects negative dentries at the end list of > siblings could be > done in a more obvious way by splitting the list in two. > But this touches much more code. That would add new field to dentry? Yep. Decision is up to maintainers. > > Last patch isn't very rigid but does non-trivial changes. > Probably it's better to call some garbage collector thingy periodically. > Lru list needs pressure to age and reorder entries properly. Swap the negative dentry to the head of hash list when it get accessed? Extra ones can be easily trimmed when swapping, using GC is to reduce perf impact? Reclaimer/shrinker scans denties in LRU lists, it's an another list. Ah, you mean GC to reclaim from LRU list. I am not sure it could catch up the speed of negative dentry generating. Thanks, Junxiao. My patch used order in hash lists is a very unusual way. Don't be confused. There are four lists parent - siblings hashtable - hashchain LRU inode - alias Thanks, Junxioao. > > Gc could be off by default or thresholds set very high (50% of ram for > example). > Final setup could be left up to owners of large systems, which needs > fine tuning.
Re: [PATCH RFC 0/8] dcache: increase poison resistance
On 12/13/20 11:43 PM, Konstantin Khlebnikov wrote: On Sun, Dec 13, 2020 at 9:52 PM Junxiao Bi <mailto:junxiao...@oracle.com>> wrote: On 12/11/20 11:32 PM, Konstantin Khlebnikov wrote: > On Thu, Dec 10, 2020 at 2:01 AM Junxiao Bi mailto:junxiao...@oracle.com> > <mailto:junxiao...@oracle.com <mailto:junxiao...@oracle.com>>> wrote: > > Hi Konstantin, > > We tested this patch set recently and found it limiting negative > dentry > to a small part of total memory. And also we don't see any > performance > regression on it. Do you have any plan to integrate it into > mainline? It > will help a lot on memory fragmentation issue causing by dentry slab, > there were a lot of customer cases where sys% was very high since > most > cpu were doing memory compaction, dentry slab was taking too much > memory > and nearly all dentry there were negative. > > > Right now I don't have any plans for this. I suspect such problems will > appear much more often since machines are getting bigger. > So, somebody will take care of it. We already had a lot of customer cases. It made no sense to leave so many negative dentry in the system, it caused memory fragmentation and not much benefit. Dcache could grow so big only if the system lacks of memory pressure. Simplest solution is a cronjob which provinces such pressure by creating sparse file on disk-based fs and then reading it. This should wash away all inactive caches with no IO and zero chance of oom. Sound good, will try. > > First part which collects negative dentries at the end list of > siblings could be > done in a more obvious way by splitting the list in two. > But this touches much more code. That would add new field to dentry? Yep. Decision is up to maintainers. > > Last patch isn't very rigid but does non-trivial changes. > Probably it's better to call some garbage collector thingy periodically. > Lru list needs pressure to age and reorder entries properly. Swap the negative dentry to the head of hash list when it get accessed? 
Extra ones can be easily trimmed when swapping, using GC is to reduce perf impact? Reclaimer/shrinker scans denties in LRU lists, it's an another list. Ah, you mean GC to reclaim from LRU list. I am not sure it could catch up the speed of negative dentry generating. Thanks, Junxiao. My patch used order in hash lists is a very unusual way. Don't be confused. There are four lists parent - siblings hashtable - hashchain LRU inode - alias Thanks, Junxioao. > > Gc could be off by default or thresholds set very high (50% of ram for > example). > Final setup could be left up to owners of large systems, which needs > fine tuning.
Re: [PATCH RFC 0/8] dcache: increase poison resistance
On 12/11/20 11:32 PM, Konstantin Khlebnikov wrote: On Thu, Dec 10, 2020 at 2:01 AM Junxiao Bi <mailto:junxiao...@oracle.com>> wrote: Hi Konstantin, We tested this patch set recently and found it limiting negative dentry to a small part of total memory. And also we don't see any performance regression on it. Do you have any plan to integrate it into mainline? It will help a lot on memory fragmentation issue causing by dentry slab, there were a lot of customer cases where sys% was very high since most cpu were doing memory compaction, dentry slab was taking too much memory and nearly all dentry there were negative. Right now I don't have any plans for this. I suspect such problems will appear much more often since machines are getting bigger. So, somebody will take care of it. We already had a lot of customer cases. It made no sense to leave so many negative dentry in the system, it caused memory fragmentation and not much benefit. First part which collects negative dentries at the end list of siblings could be done in a more obvious way by splitting the list in two. But this touches much more code. That would add new field to dentry? Last patch isn't very rigid but does non-trivial changes. Probably it's better to call some garbage collector thingy periodically. Lru list needs pressure to age and reorder entries properly. Swap the negative dentry to the head of hash list when it get accessed? Extra ones can be easily trimmed when swapping, using GC is to reduce perf impact? Thanks, Junxioao. Gc could be off by default or thresholds set very high (50% of ram for example). Final setup could be left up to owners of large systems, which needs fine tuning.
Re: [PATCH RFC 0/8] dcache: increase poison resistance
Hi Konstantin, We tested this patch set recently and found it limiting negative dentry to a small part of total memory. And also we don't see any performance regression on it. Do you have any plan to integrate it into mainline? It will help a lot on memory fragmentation issue causing by dentry slab, there were a lot of customer cases where sys% was very high since most cpu were doing memory compaction, dentry slab was taking too much memory and nearly all dentry there were negative. The following is test result we run on two types of servers, one is 256G memory with 24 CPUS and another is 3T memory with 384 CPUS. The test case is using a lot of processes to generate negative dentry in parallel, the following is the test result after 72 hours, the negative dentry number is stable around that number even running longer time. If without the patch set, in less than half an hour 197G was took by negative dentry on 256G system, in 1 day 2.4T was took on 3T system. neg-dentry-number neg-dentry-mem-usage 256G 55259084 10.6G 3T 202306756 38.8G For perf test, we run the following, and no regression found. - create 1M negative dentry and then touch them to convert them to positive dentry - create 10K/100K/1M files - remove 10K/100K/1M files - kernel compile To verify the fsnotify fix, we used inotifywait to watch file create/open in some directory where there is a lot of negative dentry, without the patch set, the system will run into soft lockup, with it, no soft lockup. We also try to defeat the limitation by making different processes generating negative dentry with the same naming way, that will make one negative dentry being accessed couple times around same time, DCACHE_REFERENCED will be set on it and then it can't be trimmed easily. We do see negative dentry will take all the memory slowly from one of our system with 120G memory, for above two system, we see the memory usage were increased, but still a small part of total memory. 
This looks ok, since the common negative dentry user case will be create some temp files and then remove it, it will be rare to access same negative dentry around same time. Thanks, Junxiao. On 5/8/20 5:23 AM, Konstantin Khlebnikov wrote: For most filesystems result of every negative lookup is cached, content of directories is usually cached too. Production of negative dentries isn't limited with disk speed. It's really easy to generate millions of them if system has enough memory. Getting this memory back ins't that easy because slab frees pages only when all related objects are gone. While dcache shrinker works in LRU order. Typical scenario is an idle system where some process periodically creates temporary files and removes them. After some time, memory will be filled with negative dentries for these random file names. Simple lookup of random names also generates negative dentries very fast. Constant flow of such negative denries drains all other inactive caches. Negative dentries are linked into siblings list along with normal positive dentries. Some operations walks dcache tree but looks only for positive dentries: most important is fsnotify/inotify. Hordes of negative dentries slow down these operations significantly. Time of dentry lookup is usually unaffected because hash table grows along with size of memory. Unless somebody especially crafts hash collisions. This patch set solves all of these problems: Move negative denries to the end of sliblings list, thus walkers could skip them at first sight (patches 3-6). Keep in dcache at most three unreferenced negative denties in row in each hash bucket (patches 7-8). 
--- Konstantin Khlebnikov (8): dcache: show count of hash buckets in sysctl fs.dentry-state selftests: add stress testing tool for dcache dcache: sweep cached negative dentries to the end of list of siblings fsnotify: stop walking child dentries if remaining tail is negative dcache: add action D_WALK_SKIP_SIBLINGS to d_walk() dcache: stop walking siblings if remaining dentries all negative dcache: push releasing dentry lock into sweep_negative dcache: prevent flooding with negative dentries fs/dcache.c | 144 +++- fs/libfs.c| 10 +- fs/notify/fsnotify.c | 6 +- include/linux/dcache.h| 6 + tools/testing/selftests/filesystems/Makefile | 1 + .../selftests/filesystems/dcache_stress.c | 210 ++ 6 files changed, 370 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/filesystems/dcache_stress.c -- Signature
Re: [md] e1a86dbbbd: mdadm-selftests.benchmarks/mdadm-selftests/tests/07layouts.fail
This issue had been fixed. I send the following patch in another thread. Please take a look. Thank you. [PATCH] md: get sysfs entry after redundancy attr group create Thanks, Junxiao. On 8/3/20 9:00 AM, Junxiao Bi wrote: Hi Song, I am working on setup an env to reproduce, will update soon. Thanks, Junxiao. On 8/2/20 10:52 PM, Song Liu wrote: On Jul 29, 2020, at 2:04 AM, kernel test robot wrote: Greeting, FYI, we noticed the following commit (built with gcc-9): commit: e1a86dbbbd6a77f73c3d099030495fa31f181e2f ("md: fix deadlock causing by sysfs_notify") https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master in testcase: mdadm-selftests with following parameters: disk: 1HDD test_prefix: 07layout ucode: 0x21 on test machine: 4 threads Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz with 4G memory caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace): If you fix the issue, kindly add following tag Reported-by: kernel test robot 2020-07-29 01:06:34 mkdir -p /var/tmp 2020-07-29 01:06:34 mke2fs -t ext3 -b 4096 -J size=4 -q /dev/sda3 2020-07-29 01:07:36 mount -t ext3 /dev/sda3 /var/tmp sed -e 's/{DEFAULT_METADATA}/1.2/g' \ -e 's,{MAP_PATH},/run/mdadm/map,g' mdadm.8.in > mdadm.8 /usr/bin/install -D -m 644 mdadm.8 /usr/share/man/man8/mdadm.8 /usr/bin/install -D -m 644 mdmon.8 /usr/share/man/man8/mdmon.8 /usr/bin/install -D -m 644 md.4 /usr/share/man/man4/md.4 /usr/bin/install -D -m 644 mdadm.conf.5 /usr/share/man/man5/mdadm.conf.5 /usr/bin/install -D -m 644 udev-md-raid-creating.rules /lib/udev/rules.d/01-md-raid-creating.rules /usr/bin/install -D -m 644 udev-md-raid-arrays.rules /lib/udev/rules.d/63-md-raid-arrays.rules /usr/bin/install -D -m 644 udev-md-raid-assembly.rules /lib/udev/rules.d/64-md-raid-assembly.rules /usr/bin/install -D -m 644 udev-md-clustered-confirm-device.rules /lib/udev/rules.d/69-md-clustered-confirm-device.rules /usr/bin/install -D -m 755 mdadm /sbin/mdadm /usr/bin/install -D -m 755 mdmon /sbin/mdmon Testing on 
linux-5.8.0-rc4-00129-ge1a86dbbbd6a7 kernel /lkp/benchmarks/mdadm-selftests/tests/07layouts... FAILED - see /var/tmp/07layouts.log and /var/tmp/fail07layouts.log for details 07layouts TIMEOUT To reproduce: git clone https://github.com/intel/lkp-tests.git cd lkp-tests bin/lkp install job.yaml # job file is attached in this email bin/lkp run job.yaml Thanks, Rong Chen <07layouts.log> Hi Junxiao, Could you please look into this issue? Thanks, Song
[PATCH] md: get sysfs entry after redundancy attr group create
"sync_completed" and "degraded" belongs to redundancy attr group, it was not exist yet when md device was created. Reported-by: kernel test robot Fixes: e1a86dbbbd6a ("md: fix deadlock causing by sysfs_notify") Signed-off-by: Junxiao Bi --- drivers/md/md.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fee8943ead7b..60d2142c4693 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -846,7 +846,13 @@ void mddev_unlock(struct mddev *mddev) sysfs_remove_group(>kobj, _redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; + mddev->sysfs_completed = NULL; + mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; @@ -4036,6 +4042,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } if (oldpers->sync_request != NULL && pers->sync_request == NULL) { @@ -5542,14 +5550,9 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); - if (mddev->sysfs_completed) - sysfs_put(mddev->sysfs_completed); - if (mddev->sysfs_degraded) - sysfs_put(mddev->sysfs_degraded); if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) del_gendisk(mddev->gendisk); if (mddev->queue) @@ -5710,8 +5713,6 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(>kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); - mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); - 
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); @@ -5991,6 +5992,8 @@ int md_run(struct mddev *mddev) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; -- 2.20.1 (Apple Git-117)
Re: [md] e1a86dbbbd: mdadm-selftests.benchmarks/mdadm-selftests/tests/07layouts.fail
Hi Song, I am working on setup an env to reproduce, will update soon. Thanks, Junxiao. On 8/2/20 10:52 PM, Song Liu wrote: On Jul 29, 2020, at 2:04 AM, kernel test robot wrote: Greeting, FYI, we noticed the following commit (built with gcc-9): commit: e1a86dbbbd6a77f73c3d099030495fa31f181e2f ("md: fix deadlock causing by sysfs_notify") https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master in testcase: mdadm-selftests with following parameters: disk: 1HDD test_prefix: 07layout ucode: 0x21 on test machine: 4 threads Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz with 4G memory caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace): If you fix the issue, kindly add following tag Reported-by: kernel test robot 2020-07-29 01:06:34 mkdir -p /var/tmp 2020-07-29 01:06:34 mke2fs -t ext3 -b 4096 -J size=4 -q /dev/sda3 2020-07-29 01:07:36 mount -t ext3 /dev/sda3 /var/tmp sed -e 's/{DEFAULT_METADATA}/1.2/g' \ -e 's,{MAP_PATH},/run/mdadm/map,g' mdadm.8.in > mdadm.8 /usr/bin/install -D -m 644 mdadm.8 /usr/share/man/man8/mdadm.8 /usr/bin/install -D -m 644 mdmon.8 /usr/share/man/man8/mdmon.8 /usr/bin/install -D -m 644 md.4 /usr/share/man/man4/md.4 /usr/bin/install -D -m 644 mdadm.conf.5 /usr/share/man/man5/mdadm.conf.5 /usr/bin/install -D -m 644 udev-md-raid-creating.rules /lib/udev/rules.d/01-md-raid-creating.rules /usr/bin/install -D -m 644 udev-md-raid-arrays.rules /lib/udev/rules.d/63-md-raid-arrays.rules /usr/bin/install -D -m 644 udev-md-raid-assembly.rules /lib/udev/rules.d/64-md-raid-assembly.rules /usr/bin/install -D -m 644 udev-md-clustered-confirm-device.rules /lib/udev/rules.d/69-md-clustered-confirm-device.rules /usr/bin/install -D -m 755 mdadm /sbin/mdadm /usr/bin/install -D -m 755 mdmon /sbin/mdmon Testing on linux-5.8.0-rc4-00129-ge1a86dbbbd6a7 kernel /lkp/benchmarks/mdadm-selftests/tests/07layouts... 
FAILED - see /var/tmp/07layouts.log and /var/tmp/fail07layouts.log for details 07layouts TIMEOUT To reproduce: git clone https://github.com/intel/lkp-tests.git cd lkp-tests bin/lkp install job.yaml # job file is attached in this email bin/lkp run job.yaml Thanks, Rong Chen <07layouts.log> Hi Junxiao, Could you please look into this issue? Thanks, Song
[PATCH V2] md: fix deadlock causing by sysfs_notify
The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: 8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [b87c4df37260] __schedule at 9a8678ec #1 [b87c4df372f8] schedule at 9a867f06 #2 [b87c4df37310] io_schedule at 9a0c73e6 #3 [b87c4df37328] __dta___xfs_iunpin_wait_3443 at c03a4057 [xfs] #4 [b87c4df373a0] xfs_iunpin_wait at c03a6c79 [xfs] #5 [b87c4df373b0] __dta_xfs_reclaim_inode_3357 at c039a46c [xfs] #6 [b87c4df37400] xfs_reclaim_inodes_ag at c039a8b6 [xfs] #7 [b87c4df37590] xfs_reclaim_inodes_nr at c039bb33 [xfs] #8 [b87c4df375b0] xfs_fs_free_cached_objects at c03af0e9 [xfs] #9 [b87c4df375c0] super_cache_scan at 9a287ec7 #10 [b87c4df37618] shrink_slab at 9a1efd93 #11 [b87c4df37700] shrink_node at 9a1f5968 #12 [b87c4df37788] do_try_to_free_pages at 9a1f5ea2 #13 [b87c4df377f0] try_to_free_mem_cgroup_pages at 9a1f6445 #14 [b87c4df37880] try_charge at 9a26cc5f #15 [b87c4df37920] memcg_kmem_charge_memcg at 9a270f6a #16 [b87c4df37958] new_slab at 9a251430 #17 [b87c4df379c0] ___slab_alloc at 9a251c85 #18 [b87c4df37a80] __slab_alloc at 9a25635d #19 [b87c4df37ac0] kmem_cache_alloc at 9a251f89 #20 [b87c4df37b00] alloc_inode at 9a2a2b10 #21 [b87c4df37b20] iget_locked at 9a2a4854 #22 [b87c4df37b60] kernfs_get_inode at 9a311377 #23 [b87c4df37b80] kernfs_iop_lookup at 9a311e2b #24 [b87c4df37ba8] lookup_slow at 9a290118 #25 [b87c4df37c10] walk_component at 9a291e83 #26 [b87c4df37c78] path_lookupat at 9a293619 #27 [b87c4df37cd8] filename_lookup at 9a2953af #28 [b87c4df37de8] user_path_at_empty at 9a295566 #29 [b87c4df37e10] vfs_statx at 9a289787 #30 [b87c4df37e70] SYSC_newlstat at 9a289d5d #31 [b87c4df37f18] 
sys_newlstat at 9a28a60e #32 [b87c4df37f28] do_syscall_64 at 9a003949 #33 [b87c4df37f50] entry_SYSCALL_64_after_hwframe at 9aa001ad RIP: 7f617a5f2905 RSP: 7f607334f838 RFLAGS: 0246 RAX: ffda RBX: 7f6064044b20 RCX: 7f617a5f2905 RDX: 7f6064044b20 RSI: 7f6064044b20 RDI: 7f6064005890 RBP: 7f6064044aa0 R8: 0030 R9: 011c R10: 0013 R11: 0246 R12: 7f606417e6d0 R13: 7f6064044aa0 R14: 7f6064044b10 R15: ORIG_RAX: 0006 CS: 0033 SS: 002b PID: 927TASK: 8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [b87c4df07b28] __schedule at 9a8678ec #1 [b87c4df07bc0] schedule at 9a867f06 #2 [b87c4df07bd8] schedule_preempt_disabled at 9a86825e #3 [b87c4df07be8] __mutex_lock at 9a869bcc #4 [b87c4df07ca0] __mutex_lock_slowpath at 9a86a013 #5 [b87c4df07cb0] mutex_lock at 9a86a04f #6 [b87c4df07cc8] kernfs_find_and_get_ns at 9a311d83 #7 [b87c4df07cf0] sysfs_notify at 9a314b3a #8 [b87c4df07d18] md_update_sb at 9a688696 #9 [b87c4df07d98] md_update_sb at 9a6886d5 #10 [b87c4df07da8] md_check_recovery at 9a68ad9c #11 [b87c4df07dd0] raid1d at c01f0375 [raid1] #12 [b87c4df07ea0] md_thread at 9a680348 #13 [b87c4df07f08] kthread at 9a0b8005 #14 [b87c4df07f50] ret_from_fork at 9aa00344 Signed-off-by: Junxiao Bi --- v2 <- v1 - fix sysfs_notify for sysfs file 'level' to align styles with others. --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c| 44 -- drivers/md/md.h| 8 +++- drivers/md/raid10.c| 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(>mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md.c b/
Re: [PATCH] md: fix deadlock causing by sysfs_notify
On 7/14/20 9:18 AM, Song Liu wrote: On Mon, Jul 13, 2020 at 11:41 PM Junxiao Bi wrote: On 7/13/20 11:17 PM, Song Liu wrote: On Thu, Jul 9, 2020 at 4:36 PM Junxiao Bi wrote: The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. [...] Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi Thanks for the patch. It looks good in general. One question though, do we need the same change the following line in md.c:level_store()? sysfs_notify(>kobj, NULL, "level"); Thanks for the review. This one is not in io path, looks it's safe. I can change it if you want to align it with others. This one is the only leftover. Let's also change it. Sure, i will send a v2. Thanks, Junxiao. Thanks, Song
Re: [PATCH] md: fix deadlock causing by sysfs_notify
On 7/13/20 11:17 PM, Song Liu wrote: On Thu, Jul 9, 2020 at 4:36 PM Junxiao Bi wrote: The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. [...] Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi Thanks for the patch. It looks good in general. One question though, do we need the same change the following line in md.c:level_store()? sysfs_notify(>kobj, NULL, "level"); Thanks for the review. This one is not in io path, looks it's safe. I can change it if you want to align it with others. Thanks, Junxiao. Thanks, Song [...]
Re: [PATCH] md: fix deadlock causing by sysfs_notify
Could anybody help take a look at this deadlock? Issue happened when raid_check was running, at that time, system memory was not enough, one process which was doing path lookup from sysfs triggered the direct memory reclaim, it was holding filesystem mutex 'kernfs_mutex' and hung by io. The io would be flushed from raid1d()->flush_pending_writes() by process 'md127_raid1', but it was hung by 'kernfs_mutex' in md_check_recovery()->md_update_sb() before flush_pending_writes(). Thanks, Junxiao. On 7/9/20 4:35 PM, Junxiao Bi wrote: The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: 8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [b87c4df37260] __schedule at 9a8678ec #1 [b87c4df372f8] schedule at 9a867f06 #2 [b87c4df37310] io_schedule at 9a0c73e6 #3 [b87c4df37328] __dta___xfs_iunpin_wait_3443 at c03a4057 [xfs] #4 [b87c4df373a0] xfs_iunpin_wait at c03a6c79 [xfs] #5 [b87c4df373b0] __dta_xfs_reclaim_inode_3357 at c039a46c [xfs] #6 [b87c4df37400] xfs_reclaim_inodes_ag at c039a8b6 [xfs] #7 [b87c4df37590] xfs_reclaim_inodes_nr at c039bb33 [xfs] #8 [b87c4df375b0] xfs_fs_free_cached_objects at c03af0e9 [xfs] #9 [b87c4df375c0] super_cache_scan at 9a287ec7 #10 [b87c4df37618] shrink_slab at 9a1efd93 #11 [b87c4df37700] shrink_node at 9a1f5968 #12 [b87c4df37788] do_try_to_free_pages at 9a1f5ea2 #13 [b87c4df377f0] try_to_free_mem_cgroup_pages at 9a1f6445 #14 [b87c4df37880] try_charge at 9a26cc5f #15 [b87c4df37920] memcg_kmem_charge_memcg at 9a270f6a #16 [b87c4df37958] new_slab at 9a251430 #17 [b87c4df379c0] ___slab_alloc at 9a251c85 #18 [b87c4df37a80] __slab_alloc at 9a25635d #19 [b87c4df37ac0] kmem_cache_alloc at 
9a251f89 #20 [b87c4df37b00] alloc_inode at 9a2a2b10 #21 [b87c4df37b20] iget_locked at 9a2a4854 #22 [b87c4df37b60] kernfs_get_inode at 9a311377 #23 [b87c4df37b80] kernfs_iop_lookup at 9a311e2b #24 [b87c4df37ba8] lookup_slow at 9a290118 #25 [b87c4df37c10] walk_component at 9a291e83 #26 [b87c4df37c78] path_lookupat at 9a293619 #27 [b87c4df37cd8] filename_lookup at 9a2953af #28 [b87c4df37de8] user_path_at_empty at 9a295566 #29 [b87c4df37e10] vfs_statx at 9a289787 #30 [b87c4df37e70] SYSC_newlstat at 9a289d5d #31 [b87c4df37f18] sys_newlstat at 9a28a60e #32 [b87c4df37f28] do_syscall_64 at 9a003949 #33 [b87c4df37f50] entry_SYSCALL_64_after_hwframe at 9aa001ad RIP: 7f617a5f2905 RSP: 7f607334f838 RFLAGS: 0246 RAX: ffda RBX: 7f6064044b20 RCX: 7f617a5f2905 RDX: 7f6064044b20 RSI: 7f6064044b20 RDI: 7f6064005890 RBP: 7f6064044aa0 R8: 0030 R9: 011c R10: 0013 R11: 0246 R12: 7f606417e6d0 R13: 7f6064044aa0 R14: 7f6064044b10 R15: ORIG_RAX: 0006 CS: 0033 SS: 002b PID: 927TASK: 8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [b87c4df07b28] __schedule at 9a8678ec #1 [b87c4df07bc0] schedule at 9a867f06 #2 [b87c4df07bd8] schedule_preempt_disabled at 9a86825e #3 [b87c4df07be8] __mutex_lock at 9a869bcc #4 [b87c4df07ca0] __mutex_lock_slowpath at 9a86a013 #5 [b87c4df07cb0] mutex_lock at 9a86a04f #6 [b87c4df07cc8] kernfs_find_and_get_ns at 9a311d83 #7 [b87c4df07cf0] sysfs_notify at 9a314b3a #8 [b87c4df07d18] md_update_sb at 9a688696 #9 [b87c4df07d98] md_update_sb at 9a6886d5 #10 [b87c4df07da8] md_check_recovery at 9a68ad9c #11 [b87c4df07dd0] raid1d at c01f0375 [raid1] #12 [b87c4df07ea0] md_thread at 9a680348 #13 [b87c4df07f08] kthread at 9a0b8005 #14 [b87c4df07f50] ret_from_fork at 9aa00344 Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c| 39 ++- drivers/md/md.h| 7 ++- drivers/md/raid10.c| 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/drivers/m
[PATCH] md: fix deadlock causing by sysfs_notify
The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: 8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [b87c4df37260] __schedule at 9a8678ec #1 [b87c4df372f8] schedule at 9a867f06 #2 [b87c4df37310] io_schedule at 9a0c73e6 #3 [b87c4df37328] __dta___xfs_iunpin_wait_3443 at c03a4057 [xfs] #4 [b87c4df373a0] xfs_iunpin_wait at c03a6c79 [xfs] #5 [b87c4df373b0] __dta_xfs_reclaim_inode_3357 at c039a46c [xfs] #6 [b87c4df37400] xfs_reclaim_inodes_ag at c039a8b6 [xfs] #7 [b87c4df37590] xfs_reclaim_inodes_nr at c039bb33 [xfs] #8 [b87c4df375b0] xfs_fs_free_cached_objects at c03af0e9 [xfs] #9 [b87c4df375c0] super_cache_scan at 9a287ec7 #10 [b87c4df37618] shrink_slab at 9a1efd93 #11 [b87c4df37700] shrink_node at 9a1f5968 #12 [b87c4df37788] do_try_to_free_pages at 9a1f5ea2 #13 [b87c4df377f0] try_to_free_mem_cgroup_pages at 9a1f6445 #14 [b87c4df37880] try_charge at 9a26cc5f #15 [b87c4df37920] memcg_kmem_charge_memcg at 9a270f6a #16 [b87c4df37958] new_slab at 9a251430 #17 [b87c4df379c0] ___slab_alloc at 9a251c85 #18 [b87c4df37a80] __slab_alloc at 9a25635d #19 [b87c4df37ac0] kmem_cache_alloc at 9a251f89 #20 [b87c4df37b00] alloc_inode at 9a2a2b10 #21 [b87c4df37b20] iget_locked at 9a2a4854 #22 [b87c4df37b60] kernfs_get_inode at 9a311377 #23 [b87c4df37b80] kernfs_iop_lookup at 9a311e2b #24 [b87c4df37ba8] lookup_slow at 9a290118 #25 [b87c4df37c10] walk_component at 9a291e83 #26 [b87c4df37c78] path_lookupat at 9a293619 #27 [b87c4df37cd8] filename_lookup at 9a2953af #28 [b87c4df37de8] user_path_at_empty at 9a295566 #29 [b87c4df37e10] vfs_statx at 9a289787 #30 [b87c4df37e70] SYSC_newlstat at 9a289d5d #31 [b87c4df37f18] 
sys_newlstat at 9a28a60e #32 [b87c4df37f28] do_syscall_64 at 9a003949 #33 [b87c4df37f50] entry_SYSCALL_64_after_hwframe at 9aa001ad RIP: 7f617a5f2905 RSP: 7f607334f838 RFLAGS: 0246 RAX: ffda RBX: 7f6064044b20 RCX: 7f617a5f2905 RDX: 7f6064044b20 RSI: 7f6064044b20 RDI: 7f6064005890 RBP: 7f6064044aa0 R8: 0030 R9: 011c R10: 0013 R11: 0246 R12: 7f606417e6d0 R13: 7f6064044aa0 R14: 7f6064044b10 R15: ORIG_RAX: 0006 CS: 0033 SS: 002b PID: 927TASK: 8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [b87c4df07b28] __schedule at 9a8678ec #1 [b87c4df07bc0] schedule at 9a867f06 #2 [b87c4df07bd8] schedule_preempt_disabled at 9a86825e #3 [b87c4df07be8] __mutex_lock at 9a869bcc #4 [b87c4df07ca0] __mutex_lock_slowpath at 9a86a013 #5 [b87c4df07cb0] mutex_lock at 9a86a04f #6 [b87c4df07cc8] kernfs_find_and_get_ns at 9a311d83 #7 [b87c4df07cf0] sysfs_notify at 9a314b3a #8 [b87c4df07d18] md_update_sb at 9a688696 #9 [b87c4df07d98] md_update_sb at 9a6886d5 #10 [b87c4df07da8] md_check_recovery at 9a68ad9c #11 [b87c4df07dd0] raid1d at c01f0375 [raid1] #12 [b87c4df07ea0] md_thread at 9a680348 #13 [b87c4df07f08] kthread at 9a0b8005 #14 [b87c4df07f50] ret_from_fork at 9aa00344 Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c| 39 ++- drivers/md/md.h| 7 ++- drivers/md/raid10.c| 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(>mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md.c b/drivers/md/md.c index f567f536b529..42a0b5ceaaec 100644 --- a/dri
Re: [PATCH 4.19 114/131] ocfs2: avoid inode removal while nfsd is accessing it
On 7/2/20 3:24 PM, Linus Torvalds wrote: On Thu, Jul 2, 2020 at 2:17 PM Pavel Machek wrote: commit 4cd9973f9ff69e37dd0ba2bd6e6423f8179c329a upstream. Patch series "ocfs2: fix nfsd over ocfs2 issues", v2. This causes locking imbalance: This sems to be true upstream too. When ocfs2_nfs_sync_lock() returns error, caller can not know if the lock was taken or not. Right you are. And your patch looks sane: diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c141b06811a6..8149fb6f1f0d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2867,9 +2867,15 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, 0, 0); - if (status < 0) + if (status < 0) { mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + if (ex) + up_write(>nfs_sync_rwlock); + else + up_read(>nfs_sync_rwlock); + } + return status; } although the whole thing looks messy. If the issue is a lifetime thing (like that commit says), the proper model isn't a lock, but a refcount. Oh well. Junxiao? There is a block number embedded in nfs file handle, to verify it's an inode, need acquire this nfs_sync_lock global lock to avoid any inode removed from local node and other nodes in the cluster, before this verify done, seemed no way to use a refcount. Thanks, Junxiao. Linus
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/22/20 5:47 PM, Matthew Wilcox wrote: On Sun, Jun 21, 2020 at 10:15:39PM -0700, Junxiao Bi wrote: On 6/20/20 9:27 AM, Matthew Wilcox wrote: On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote: Junxiao Bi writes: Still high lock contention. Collect the following hot path. A different location this time. I know of at least exit_signal and exit_notify that take thread wide locks, and it looks like exit_mm is another. Those don't use the same locks as flushing proc. So I think you are simply seeing a result of the thundering herd of threads shutting down at once. Given that thread shutdown is fundamentally a slow path there is only so much that can be done. If you are up for a project to working through this thundering herd I expect I can help some. It will be a long process of cleaning up the entire thread exit process with an eye to performance. Wengang had some tests which produced wall-clock values for this problem, which I agree is more informative. I'm not entirely sure what the customer workload is that requires a highly threaded workload to also shut down quickly. To my mind, an overall workload is normally composed of highly-threaded tasks that run for a long time and only shut down rarely (thus performance of shutdown is not important) and single-threaded tasks that run for a short time. The real workload is a Java application working in server-agent mode, issue happened in agent side, all it do is waiting works dispatching from server and execute. To execute one work, agent will start lots of short live threads, there could be a lot of threads exit same time if there were a lots of work to execute, the contention on the exit path caused a high %sys time which impacted other workload. How about this for a micro? Executes in about ten seconds on my laptop. You might need to tweak it a bit to get better timing on a server. 
// gcc -pthread -O2 -g -W -Wall #include #include void *worker(void *arg) { int i = 0; int *p = arg; for (;;) { while (i < 1000 * 1000) { i += *p; } sleep(1); } } int main(int argc, char **argv) { pthread_t threads[20][100]; Tuning 100 to 1000 here and the following 2 loops. Test it on 2-socket server with 104 cpu. Perf is similar on v5.7 and v5.7 with Eric's fix. The spin lock was shifted to spin lock in futex, so the fix didn't help. 46.41% 0.11% perf_test [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe | --46.30%--entry_SYSCALL_64_after_hwframe | --46.12%--do_syscall_64 | |--30.47%--__x64_sys_futex | | | --30.45%--do_futex | | | |--18.04%--futex_wait | | | | | |--16.94%--futex_wait_setup | | | | | | | --16.61%--_raw_spin_lock | | | | | | | --16.30%--native_queued_spin_lock_slowpath | | | | | | | --0.81%--call_function_interrupt | | | | | | | --0.79%--smp_call_function_interrupt | | | | | | | --0.62%--generic_smp_call_function_single_interrupt | | | | | --1.04%--futex_wait_queue_me | | | | | --0.96%--schedule | | | | | --0.94%--__schedule | | | | | --0.51%--pick_next_task_fair | | | --12.38%--futex_wake | | | |--11
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/22/20 8:20 AM, ebied...@xmission.com wrote: If I understand correctly, the Java VM is not exiting. Just some of its threads. That is a very different problem to deal with. There are many optimizations that are possible when _all_ of the threads are exiting that are not possible when _many_ threads are exiting. Do you know if it is simply the cpu time or if it is the lock contention that is the problem? If it is simply the cpu time we should consider if some of the locks that can be highly contended should become mutexes. Or perhaps something like Matthew's cpu pinning idea. The problem is high %sys time. Thanks, Junxiao.
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/20/20 9:27 AM, Matthew Wilcox wrote: On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote: Junxiao Bi writes: Still high lock contention. Collect the following hot path. A different location this time. I know of at least exit_signal and exit_notify that take thread wide locks, and it looks like exit_mm is another. Those don't use the same locks as flushing proc. So I think you are simply seeing a result of the thundering herd of threads shutting down at once. Given that thread shutdown is fundamentally a slow path there is only so much that can be done. If you are up for a project to working through this thundering herd I expect I can help some. It will be a long process of cleaning up the entire thread exit process with an eye to performance. Wengang had some tests which produced wall-clock values for this problem, which I agree is more informative. I'm not entirely sure what the customer workload is that requires a highly threaded workload to also shut down quickly. To my mind, an overall workload is normally composed of highly-threaded tasks that run for a long time and only shut down rarely (thus performance of shutdown is not important) and single-threaded tasks that run for a short time. The real workload is a Java application working in server-agent mode, issue happened in agent side, all it do is waiting works dispatching from server and execute. To execute one work, agent will start lots of short live threads, there could be a lot of threads exit same time if there were a lots of work to execute, the contention on the exit path caused a high %sys time which impacted other workload. Thanks, Junxiao. Understanding this workload is important to my next suggestion, which is that rather than searching for all the places in the exit path which contend on a single spinlock, we simply set the allowed CPUs for an exiting task to include only the CPU that this thread is running on. 
It will probably run faster to take the threads down in series on one CPU rather than take them down in parallel across many CPUs (or am I mistaken? Is there inherently a lot of parallelism in the thread exiting process?)
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/19/20 10:24 AM, ebied...@xmission.com wrote: Junxiao Bi writes: Hi Eric, The patch didn't improve lock contention. Which raises the question where is the lock contention coming from. Especially with my first variant. Only the last thread to be reaped would free up anything in the cache. Can you comment out the call to proc_flush_pid entirely? Still high lock contention. Collect the following hot path. 74.90% 0.01% proc_race [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe | --74.89%--entry_SYSCALL_64_after_hwframe | --74.88%--do_syscall_64 | |--69.70%--exit_to_usermode_loop | | | --69.70%--do_signal | | | --69.69%--get_signal | | | |--56.30%--do_group_exit | | | | | --56.30%--do_exit | | | | | |--27.50%--_raw_write_lock_irq | | | | | | | --27.47%--queued_write_lock_slowpath | | | | | | | --27.18%--native_queued_spin_lock_slowpath | | | | | |--26.10%--release_task.part.20 | | | | | | | --25.60%--_raw_write_lock_irq | | | | | | | --25.56%--queued_write_lock_slowpath | | | | | | | --25.23%--native_queued_spin_lock_slowpath | | | | | --0.56%--mmput | | | | | --0.55%--exit_mmap | | | --13.31%--_raw_spin_lock_irq | | | --13.28%--native_queued_spin_lock_slowpath | Thanks, Junxiao. That will rule out the proc_flush_pid in d_invalidate entirely. The only candidate I can think of d_invalidate aka (proc_flush_pid) vs ps. Eric
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
Hi Eric, The patch didn't improve lock contention. PerfTop: 48925 irqs/sec kernel:95.6% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 104 CPUs) --- 69.66% [kernel] [k] native_queued_spin_lock_slowpath 1.93% [kernel] [k] _raw_spin_lock 1.24% [kernel] [k] page_counter_cancel 0.70% [kernel] [k] do_syscall_64 0.62% [kernel] [k] find_idlest_group.isra.96 0.57% [kernel] [k] queued_write_lock_slowpath 0.56% [kernel] [k] d_walk 0.45% [kernel] [k] clear_page_erms 0.44% [kernel] [k] syscall_return_via_sysret 0.40% [kernel] [k] entry_SYSCALL_64 0.38% [kernel] [k] refcount_dec_not_one 0.37% [kernel] [k] propagate_protected_usage 0.33% [kernel] [k] unmap_page_range 0.33% [kernel] [k] select_collect 0.32% [kernel] [k] memcpy_erms 0.30% [kernel] [k] proc_task_readdir 0.27% [kernel] [k] _raw_spin_lock_irqsave Thanks, Junxiao. On 6/19/20 7:09 AM, ebied...@xmission.com wrote: Junxiao Bi reported: When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Matthew Wilcox reported: We've looked at a few different ways of fixing this problem. The flushing of the proc dentries from the dcache is an optmization, and is not necessary for correctness. Eventually cache pressure will cause the dentries to be freed even if no flushing happens. Some light testing when I refactored the proc flushg[1] indicated that at least the memory footprint is easily measurable. An optimization that causes a performance problem due to a thundering herd of threads is no real optimization. Modify the code to only flush the /proc// directory when all threads in a process are killed at once. This continues to flush practically everything when the process is reaped as the threads live under /proc//task/. 
There is a rare possibility that a debugger will access /proc//, which this change will no longer flush, but I believe such accesses are sufficiently rare to not be observed in practice. [1] 7bc3e6e55acf ("proc: Use a list of inodes to flush from proc") Link: https://lkml.kernel.org/r/54091fc0-ca46-2186-97a8-d1f3c4f38...@oracle.com Reported-by: Masahiro Yamada Reported-by: Matthew Wilcox Signed-off-by: "Eric W. Biederman" --- I am still waiting for word on how this affects performance, but this is a clean version that should avoid the thundering herd problem in general. kernel/exit.c | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index cebae77a9664..567354550d62 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -151,8 +151,8 @@ void put_task_struct_rcu_user(struct task_struct *task) void release_task(struct task_struct *p) { + struct pid *flush_pid = NULL; struct task_struct *leader; - struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and @@ -165,7 +165,16 @@ void release_task(struct task_struct *p) write_lock_irq(_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); + + /* +* When all of the threads are exiting wait until the end +* and flush everything. +*/ + if (thread_group_leader(p)) + flush_pid = get_pid(task_tgid(p)); + else if (!(p->signal->flags & SIGNAL_GROUP_EXIT)) + flush_pid = get_pid(task_pid(p)); + __exit_signal(p); /* @@ -188,8 +197,10 @@ void release_task(struct task_struct *p) } write_unlock_irq(_lock); - proc_flush_pid(thread_pid); - put_pid(thread_pid); + if (flush_pid) { + proc_flush_pid(flush_pid); + put_pid(flush_pid); + } release_thread(p); put_task_struct_rcu_user(p);
Re: severe proc dentry lock contention
On 6/18/20 5:02 PM, ebied...@xmission.com wrote: Matthew Wilcox writes: On Thu, Jun 18, 2020 at 03:17:33PM -0700, Junxiao Bi wrote: When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Check the following standalone test case that simulated the case and perf top result on v5.7 kernel. Any idea on how to fix this? Thanks, Junxiao. We've looked at a few different ways of fixing this problem. Even though the contention is within the dcache, it seems like a usecase that the dcache shouldn't be optimised for -- generally we do not have hundreds of CPUs removing dentries from a single directory in parallel. We could fix this within procfs. We don't have a great patch yet, but the current approach we're looking at allows only one thread at a time to call dput() on any /proc/*/task directory. We could also look at fixing this within the scheduler. Only allowing one CPU to run the threads of an exiting process would fix this particular problem, but might have other consequences. I was hoping that 7bc3e6e55acf would fix this, but that patch is in 5.7, so that hope is ruled out. Does anyone know if problem new in v5.7? I am wondering if I introduced this problem when I refactored the code or if I simply churned the code but the issue remains effectively the same. It's not new issue, we see it in old kernel like v4.14 Can you try only flushing entries when the last thread of the process is reaped? I think in practice we would want to be a little more sophisticated but it is a good test case to see if it solves the issue. Thank you. i will try and let you know. Thanks, Junxiao. 
diff --git a/kernel/exit.c b/kernel/exit.c index cebae77a9664..d56e4eb60bdd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -152,7 +152,7 @@ void put_task_struct_rcu_user(struct task_struct *task) void release_task(struct task_struct *p) { struct task_struct *leader; - struct pid *thread_pid; + struct pid *thread_pid = NULL; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and @@ -165,7 +165,8 @@ void release_task(struct task_struct *p) write_lock_irq(_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); + if (p == p->group_leader) + thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* @@ -188,8 +189,10 @@ void release_task(struct task_struct *p) } write_unlock_irq(_lock); - proc_flush_pid(thread_pid); - put_pid(thread_pid); + if (thread_pid) { + proc_flush_pid(thread_pid); + put_pid(thread_pid); + } release_thread(p); put_task_struct_rcu_user(p);
severe proc dentry lock contention
Hi, When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Check the following standalone test case that simulated the case and perf top result on v5.7 kernel. Any idea on how to fix this? PerfTop: 48891 irqs/sec kernel:95.6% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 72 CPUs) --- 66.10% [kernel] [k] native_queued_spin_lock_slowpath 1.13% [kernel] [k] _raw_spin_lock 0.84% [kernel] [k] clear_page_erms 0.82% [kernel] [k] queued_write_lock_slowpath 0.64% [kernel] [k] proc_task_readdir 0.61% [kernel] [k] find_idlest_group.isra.95 0.61% [kernel] [k] syscall_return_via_sysret 0.55% [kernel] [k] entry_SYSCALL_64 0.49% [kernel] [k] memcpy_erms 0.46% [kernel] [k] update_cfs_group 0.41% [kernel] [k] get_pid_task 0.39% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] __list_del_entry_valid 0.34% [kernel] [k] get_page_from_freelist 0.34% [kernel] [k] __d_lookup 0.32% [kernel] [k] update_load_avg 0.31% libc-2.17.so [.] get_next_seq 0.27% [kernel] [k] avc_has_perm_noaudit 0.26% [kernel] [k] __sched_text_start 0.25% [kernel] [k] selinux_inode_permission 0.25% [kernel] [k] __slab_free 0.24% [kernel] [k] detach_entity_cfs_rq 0.23% [kernel] [k] zap_pte_range 0.22% [kernel] [k] _find_next_bit.constprop.1 0.22% libc-2.17.so [.] vfprintf 0.20% libc-2.17.so [.] _int_malloc 0.19% [kernel] [k] _raw_spin_lock_irq 0.18% [kernel] [k] rb_erase 0.18% [kernel] [k] pid_revalidate 0.18% [kernel] [k] lockref_get_not_dead 0.18% [kernel] [k] __alloc_pages_nodemask 0.17% [kernel] [k] set_task_cpu 0.17% libc-2.17.so [.] __strcoll_l 0.17% [kernel] [k] do_syscall_64 0.17% [kernel] [k] __vmalloc_node_range 0.17% libc-2.17.so [.] 
_IO_vfscanf 0.17% [kernel] [k] refcount_dec_not_one 0.15% [kernel] [k] __task_pid_nr_ns 0.15% [kernel] [k] native_irq_return_iret 0.15% [kernel] [k] free_pcppages_bulk 0.14% [kernel] [k] kmem_cache_alloc 0.14% [kernel] [k] link_path_walk 0.14% libc-2.17.so [.] _int_free 0.14% [kernel] [k] __update_load_avg_cfs_rq 0.14% perf.5.7.0-master.20200601.ol7.x86_64 [.] 0x000eac29 0.13% [kernel] [k] kmem_cache_free 0.13% [kernel] [k] number 0.13% [kernel] [k] memset_erms 0.12% [kernel] [k] proc_pid_status 0.12% [kernel] [k] __d_lookup_rcu === runme.sh == #!/bin/bash threads=${1:-1} prog=proc_race while [ 1 ]; do ./$prog $threads; done & while [ 1 ]; do pid=`ps aux | grep $prog | grep -v grep| awk '{print $2}'` if [ -z $pid ]; then continue; fi threadnum=`ls -l /proc/$pid/task | wc -l` if [ $threadnum -gt $threads ]; then echo kill $pid kill -9 $pid fi done ===proc_race.c= #include #include #include #include #include #include #include #define handle_error_en(en, msg) \ do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0) #define
Re: [PATCH] block: fix RO partition with RW disk
Anybody could help review this bug? thanks, Junxiao. On 8/5/19 1:01 PM, Junxiao Bi wrote: When md raid1 was used with imsm metadata, during the boot stage, the raid device will first be set to readonly, then mdmon will set it read-write later. When there were some partitions in this device, the following race would make some partition left ro and fail to mount. CPU 1: CPU 2: add_partition()set_disk_ro() //set disk RW //disk was RO, so partition set to RO p->policy = get_disk_ro(disk); if (disk->part0.policy != flag) { set_disk_ro_uevent(disk, flag); // disk set to RW disk->part0.policy = flag; } // set all exit partition to RW while ((part = disk_part_iter_next())) part->policy = flag; // this part was not yet added, so it was still RO rcu_assign_pointer(ptbl->part[partno], p); Move RO status setting of partitions after they were added into partition table and introduce a mutex to sync RO status between disk and partitions. Signed-off-by: Junxiao Bi --- block/genhd.c | 3 +++ block/partition-generic.c | 5 - include/linux/genhd.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 54f1f0d381f4..f3cce1d354cf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1479,6 +1479,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], >part0); + mutex_init(>part_lock); /* * set_capacity() and get_capacity() currently don't use @@ -1570,6 +1571,7 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; + mutex_lock(>part_lock); if (disk->part0.policy != flag) { set_disk_ro_uevent(disk, flag); disk->part0.policy = flag; @@ -1579,6 +1581,7 @@ void set_disk_ro(struct gendisk *disk, int flag) while ((part = disk_part_iter_next())) part->policy = flag; disk_part_iter_exit(); + mutex_unlock(>part_lock); } EXPORT_SYMBOL(set_disk_ro); diff --git a/block/partition-generic.c 
b/block/partition-generic.c index aee643ce13d1..63cb6fb996ff 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -345,7 +345,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, queue_limit_discard_alignment(>queue->limits, start); p->nr_sects = len; p->partno = partno; - p->policy = get_disk_ro(disk); if (info) { struct partition_meta_info *pinfo = alloc_part_info(disk); @@ -401,6 +400,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ rcu_assign_pointer(ptbl->part[partno], p); + mutex_lock(>part_lock); + p->policy = get_disk_ro(disk); + mutex_unlock(>part_lock); + /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(>kobj, KOBJ_ADD); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 8b5330dd5ac0..df6ddca8a92c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -201,6 +201,7 @@ struct gendisk { */ struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0; + struct mutex part_lock; const struct block_device_operations *fops; struct request_queue *queue;
[PATCH RESEND] scsi: megaraid_sas: fix panic on loading firmware crashdump
While loading the FW crashdump in fw_crash_buffer_show(), the number of bytes left in one DMA chunk was not checked; if the copy size exceeds it, the resulting out-of-bounds access will cause a kernel panic. Signed-off-by: Junxiao Bi --- drivers/scsi/megaraid/megaraid_sas_base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 80ab9700f1de..3eef0858fa8e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3153,6 +3153,7 @@ fw_crash_buffer_show(struct device *cdev, (struct megasas_instance *) shost->hostdata; u32 size; unsigned long dmachunk = CRASH_DMA_BUF_SIZE; + unsigned long chunk_left_bytes; unsigned long src_addr; unsigned long flags; u32 buff_offset; @@ -3176,6 +3177,8 @@ fw_crash_buffer_show(struct device *cdev, } size = (instance->fw_crash_buffer_size * dmachunk) - buff_offset; + chunk_left_bytes = dmachunk - (buff_offset % dmachunk); + size = (size > chunk_left_bytes) ? chunk_left_bytes : size; size = (size >= PAGE_SIZE) ? (PAGE_SIZE - 1) : size; src_addr = (unsigned long)instance->crash_buf[buff_offset / dmachunk] + -- 2.17.1
Re: [PATCH v3 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/17/2017 02:30 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > precess already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi <jiangqi...@gmail.com> > and Junxiao Bi <junxiao...@oracle.com>. > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Do not inline functions whose bodies are not in scope, changed by: > Stephen Rothwell <s...@canb.auug.org.au>. > > Changes since v2: > - Wrap the tracking logic code of recursive locking into functions, > ocfs2_inode_lock_tracker() and ocfs2_inode_unlock_tracker(), > suggested by: Junxiao Bi. 
> > [s...@canb.auug.org.au remove some inlines] > Signed-off-by: Eric Ren <z...@suse.com> Reviewed-by: Junxiao Bi <junxiao...@oracle.com> > --- > fs/ocfs2/dlmglue.c | 105 > +++-- > fs/ocfs2/dlmglue.h | 18 + > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 121 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 77d1632..c75b9e9 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +/* > + * Keep a list of processes who have interest in a lockres. > + * Note: this is now only uesed for check recursive cluster locking. > + */ > +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_lock_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); > + > + spin_lock(>l_lock); > + list_add_tail(>oh_
Re: [PATCH v3 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/17/2017 02:30 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > precess already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi > and Junxiao Bi . > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Do not inline functions whose bodies are not in scope, changed by: > Stephen Rothwell . > > Changes since v2: > - Wrap the tracking logic code of recursive locking into functions, > ocfs2_inode_lock_tracker() and ocfs2_inode_unlock_tracker(), > suggested by: Junxiao Bi. > > [s...@canb.auug.org.au remove some inlines] > Signed-off-by: Eric Ren Reviewed-by: Junxiao Bi > --- > fs/ocfs2/dlmglue.c | 105 > +++-- > fs/ocfs2/dlmglue.h | 18 + > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 121 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 77d1632..c75b9e9 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +/* > + * Keep a list of processes who have interest in a lockres. 
> + * Note: this is now only uesed for check recursive cluster locking. > + */ > +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_lock_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); > + > + spin_lock(>l_lock); > + list_add_tail(>oh_list, >l_holders); > + spin_unlock(>l_lock); > +} > + > +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *l
Re: [PATCH v3 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/17/2017 02:30 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi <jiangqi...@gmail.com> > and Junxiao Bi <junxiao...@oracle.com>. > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi. > > Changes since v2: > - Use new wrappers of tracking logic code, suggested by: Junxiao Bi. 
> > Signed-off-by: Eric Ren <z...@suse.com> Reviewed-by: Junxiao Bi <junxiao...@oracle.com> > --- > fs/ocfs2/acl.c | 29 + > fs/ocfs2/file.c | 58 > - > 2 files changed, 58 insertions(+), 29 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..dc22ba8 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, > int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > - int status = 0; > + int status, had_lock; > + struct ocfs2_lock_holder oh; > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) { > - if (status != -ENOENT) > - mlog_errno(status); > - return status; > - } > + had_lock = ocfs2_inode_lock_tracker(inode, , 1, ); > + if (had_lock < 0) > + return had_lock; > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + ocfs2_inode_unlock_tracker(inode, 1, , had_lock); > brelse(bh); > return status; > } > @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct ocfs2_super *osb; > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > - int ret; > + int had_lock; > + struct ocfs2_lock_holder oh; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > - if (ret < 0) { > - if (ret != -ENOENT) > - mlog_errno(ret); > - return ERR_PTR(ret); > - } > + > + had_lock = ocfs2_inode_lock_tracker(inode, _bh, 0, ); > + if (had_lock < 0) > + return ERR_PTR(had_lock); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + ocfs2_inode_unlock_tracker(inode, 0, , had_lock); > brelse(di_bh); > return acl; > } > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..7b6a146 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > 
*attr) > handle_t *handle = NULL; > struct dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int had_lock; > + struct ocfs2_lock_holder oh; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > }
Re: [PATCH v3 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/17/2017 02:30 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi > and Junxiao Bi . > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi. > > Changes since v2: > - Use new wrappers of tracking logic code, suggested by: Junxiao Bi. 
> > Signed-off-by: Eric Ren Reviewed-by: Junxiao Bi > --- > fs/ocfs2/acl.c | 29 + > fs/ocfs2/file.c | 58 > - > 2 files changed, 58 insertions(+), 29 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..dc22ba8 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, > int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > - int status = 0; > + int status, had_lock; > + struct ocfs2_lock_holder oh; > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) { > - if (status != -ENOENT) > - mlog_errno(status); > - return status; > - } > + had_lock = ocfs2_inode_lock_tracker(inode, , 1, ); > + if (had_lock < 0) > + return had_lock; > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + ocfs2_inode_unlock_tracker(inode, 1, , had_lock); > brelse(bh); > return status; > } > @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct ocfs2_super *osb; > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > - int ret; > + int had_lock; > + struct ocfs2_lock_holder oh; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > - if (ret < 0) { > - if (ret != -ENOENT) > - mlog_errno(ret); > - return ERR_PTR(ret); > - } > + > + had_lock = ocfs2_inode_lock_tracker(inode, _bh, 0, ); > + if (had_lock < 0) > + return ERR_PTR(had_lock); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + ocfs2_inode_unlock_tracker(inode, 0, , had_lock); > brelse(di_bh); > return acl; > } > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..7b6a146 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > handle_t *handle = NULL; > struct 
dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int had_lock; > + struct ocfs2_lock_holder oh; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > } > } > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) {
Re: [PATCH v2 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/16/2017 02:42 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > 1. Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi <jiangqi...@gmail.com> > and Junxiao Bi <junxiao...@oracle.com>. > > 2. Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi <junxiao...@oracle.com>. > > 3. Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi <junxiao...@oracle.com>. 
> > Signed-off-by: Eric Ren <z...@suse.com> > --- > fs/ocfs2/acl.c | 39 + > fs/ocfs2/file.c | 76 > + > 2 files changed, 100 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..3e47262 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + Same code pattern showed here and *get_acl, can it be abstracted to one function? The same issue for *setattr and *permission. Sorry for not mention that in last review. Thanks, Junxiao. 
> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0);
Re: [PATCH v2 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/16/2017 02:42 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > 1. Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi > and Junxiao Bi . > > 2. Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi . > > 3. Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi . 
> > Signed-off-by: Eric Ren > --- > fs/ocfs2/acl.c | 39 + > fs/ocfs2/file.c | 76 > + > 2 files changed, 100 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..3e47262 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + Same code pattern showed here and *get_acl, can it be abstracted to one function? The same issue for *setattr and *permission. Sorry for not mention that in last review. Thanks, Junxiao. 
> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0); > + } > brelse(di_bh); > + > return acl; > } > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > ind
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/16/2017 11:06 AM, Eric Ren wrote: > Hi Junxiao, > > On 01/16/2017 10:46 AM, Junxiao Bi wrote: >>>> If had_lock==true, it is a bug? I think we should BUG_ON for it, that >>>> can help us catch bug at the first time. >>> Good idea! But I'm not sure if "ocfs2_setattr" is always the first one >>> who takes the cluster lock. >>> It's harder for me to name all the possible paths;-/ >> The BUG_ON() can help catch the path where ocfs2_setattr is not the >> first one. > Yes, I understand. But, the problem is that the vfs entries calling > order is out of our control. > I don't want to place an assertion where I'm not 100% sure it's > absolutely right;-) If it is not the first one, is it another recursive locking bug? In this case, if you don't like BUG_ON(), you can dump the call trace and print some warning message. Thanks, Junxiao. > > Thanks, > Eric > >> >> Thanks, >> Junxiao. >> >>>> >>>>> +if (had_lock) >>>>> +arg_flags = OCFS2_META_LOCK_GETBH; >>>>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>>>>if (status < 0) { >>>>>if (status != -ENOENT) >>>>>mlog_errno(status); >>>>>goto bail_unlock_rw; >>>>>} >>>>> -inode_locked = 1; >>>>> +if (!had_lock) { >>>>> +ocfs2_add_holder(lockres, ); >>>>> +inode_locked = 1; >>>>> +} >>>>> if (size_change) { >>>>>status = inode_newsize_ok(inode, attr->ia_size); >>>>> @@ -1260,7 +1270,8 @@ int ocfs2_setattr(struct dentry *dentry, struct >>>>> iattr *attr) >>>>>bail_commit: >>>>>ocfs2_commit_trans(osb, handle); >>>>>bail_unlock: >>>>> -if (status) { >>>>> +if (status && inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>>inode_locked = 0; >>>>>} >>>>> @@ -1278,8 +1289,10 @@ int ocfs2_setattr(struct dentry *dentry, >>>>> struct iattr *attr) >>>>>if (status < 0) >>>>>mlog_errno(status); >>>>>} >>>>> -if (inode_locked) >>>>> +if (inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>> +} >>>>> brelse(bh); >>>>>return status; >>>>> @@ 
-1321,20 +1334,31 @@ int ocfs2_getattr(struct vfsmount *mnt, >>>>>int ocfs2_permission(struct inode *inode, int mask) >>>>>{ >>>>>int ret; >>>>> +int has_locked; >>>>> +struct ocfs2_holder oh; >>>>> +struct ocfs2_lock_res *lockres; >>>>> if (mask & MAY_NOT_BLOCK) >>>>>return -ECHILD; >>>>>-ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> -if (ret) { >>>>> -if (ret != -ENOENT) >>>>> -mlog_errno(ret); >>>>> -goto out; >>>>> +lockres = _I(inode)->ip_inode_lockres; >>>>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>>> The same thing as ocfs2_setattr. >>> OK. I will think over your suggestions! >>> >>> Thanks, >>> Eric >>> >>>> Thanks, >>>> Junxiao. >>>>> +if (!has_locked) { >>>>> +ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> +if (ret) { >>>>> +if (ret != -ENOENT) >>>>> +mlog_errno(ret); >>>>> +goto out; >>>>> +} >>>>> +ocfs2_add_holder(lockres, ); >>>>>} >>>>> ret = generic_permission(inode, mask); >>>>>-ocfs2_inode_unlock(inode, 0); >>>>> +if (!has_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>> +ocfs2_inode_unlock(inode, 0); >>>>> +} >>>>>out: >>>>>return ret; >>>>>} >>>>> >> >
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/16/2017 11:06 AM, Eric Ren wrote: > Hi Junxiao, > > On 01/16/2017 10:46 AM, Junxiao Bi wrote: >>>> If had_lock==true, it is a bug? I think we should BUG_ON for it, that >>>> can help us catch bug at the first time. >>> Good idea! But I'm not sure if "ocfs2_setattr" is always the first one >>> who takes the cluster lock. >>> It's harder for me to name all the possible paths;-/ >> The BUG_ON() can help catch the path where ocfs2_setattr is not the >> first one. > Yes, I understand. But, the problem is that the vfs entries calling > order is out of our control. > I don't want to place an assertion where I'm not 100% sure it's > absolutely right;-) If it is not the first one, is it another recursive locking bug? In this case, if you don't like BUG_ON(), you can dump the call trace and print some warning message. Thanks, Junxiao. > > Thanks, > Eric > >> >> Thanks, >> Junxiao. >> >>>> >>>>> +if (had_lock) >>>>> +arg_flags = OCFS2_META_LOCK_GETBH; >>>>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>>>>if (status < 0) { >>>>>if (status != -ENOENT) >>>>>mlog_errno(status); >>>>>goto bail_unlock_rw; >>>>>} >>>>> -inode_locked = 1; >>>>> +if (!had_lock) { >>>>> +ocfs2_add_holder(lockres, ); >>>>> +inode_locked = 1; >>>>> +} >>>>> if (size_change) { >>>>>status = inode_newsize_ok(inode, attr->ia_size); >>>>> @@ -1260,7 +1270,8 @@ int ocfs2_setattr(struct dentry *dentry, struct >>>>> iattr *attr) >>>>>bail_commit: >>>>>ocfs2_commit_trans(osb, handle); >>>>>bail_unlock: >>>>> -if (status) { >>>>> +if (status && inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>>inode_locked = 0; >>>>>} >>>>> @@ -1278,8 +1289,10 @@ int ocfs2_setattr(struct dentry *dentry, >>>>> struct iattr *attr) >>>>>if (status < 0) >>>>>mlog_errno(status); >>>>>} >>>>> -if (inode_locked) >>>>> +if (inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>> +} >>>>> brelse(bh); >>>>>return status; >>>>> @@ 
-1321,20 +1334,31 @@ int ocfs2_getattr(struct vfsmount *mnt, >>>>>int ocfs2_permission(struct inode *inode, int mask) >>>>>{ >>>>>int ret; >>>>> +int has_locked; >>>>> +struct ocfs2_holder oh; >>>>> +struct ocfs2_lock_res *lockres; >>>>> if (mask & MAY_NOT_BLOCK) >>>>>return -ECHILD; >>>>>-ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> -if (ret) { >>>>> -if (ret != -ENOENT) >>>>> -mlog_errno(ret); >>>>> -goto out; >>>>> +lockres = _I(inode)->ip_inode_lockres; >>>>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>>> The same thing as ocfs2_setattr. >>> OK. I will think over your suggestions! >>> >>> Thanks, >>> Eric >>> >>>> Thanks, >>>> Junxiao. >>>>> +if (!has_locked) { >>>>> +ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> +if (ret) { >>>>> +if (ret != -ENOENT) >>>>> +mlog_errno(ret); >>>>> +goto out; >>>>> +} >>>>> +ocfs2_add_holder(lockres, ); >>>>>} >>>>> ret = generic_permission(inode, mask); >>>>>-ocfs2_inode_unlock(inode, 0); >>>>> +if (!has_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>> +ocfs2_inode_unlock(inode, 0); >>>>> +} >>>>>out: >>>>>return ret; >>>>>} >>>>> >> >
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/13/2017 02:19 PM, Eric Ren wrote: > Hi! > > On 01/13/2017 12:22 PM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> Commit 743b5f1434f5 ("ocfs2: take inode lock in >>> ocfs2_iop_set/get_acl()") >>> results in a deadlock, as the author "Tariq Saeed" realized shortly >>> after the patch was merged. The discussion happened here >>> (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). >>> >>> >>> The reason why taking cluster inode lock at vfs entry points opens up >>> a self deadlock window, is explained in the previous patch of this >>> series. >>> >>> So far, we have seen two different code paths that have this issue. >>> 1. do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== take PR >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== take PR >>> 2. fchmod|fchmodat >>> chmod_common >>> notify_change >>>ocfs2_setattr <=== take EX >>> posix_acl_chmod >>> get_acl >>> ocfs2_iop_get_acl <=== take PR >>> ocfs2_iop_set_acl <=== take EX >>> >>> Fixes them by adding the tracking logic (in the previous patch) for >>> these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), >>> ocfs2_setattr(). 
>>> >>> Signed-off-by: Eric Ren <z...@suse.com> >>> --- >>> fs/ocfs2/acl.c | 39 ++- >>> fs/ocfs2/file.c | 44 ++-- >>> 2 files changed, 68 insertions(+), 15 deletions(-) >>> >>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c >>> index bed1fcb..c539890 100644 >>> --- a/fs/ocfs2/acl.c >>> +++ b/fs/ocfs2/acl.c >>> @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, >>> struct posix_acl *acl, int type) >>> { >>> struct buffer_head *bh = NULL; >>> int status = 0; >>> - >>> -status = ocfs2_inode_lock(inode, , 1); >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>> if (status < 0) { >>> if (status != -ENOENT) >>> mlog_errno(status); >>> return status; >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> + >>> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); >>> -ocfs2_inode_unlock(inode, 1); >>> + >>> +if (!has_locked) { >>> +ocfs2_remove_holder(lockres, ); >>> +ocfs2_inode_unlock(inode, 1); >>> +} >>> brelse(bh); >>> + >>> return status; >>> } >>> @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct >>> inode *inode, int type) >>> struct buffer_head *di_bh = NULL; >>> struct posix_acl *acl; >>> int ret; >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> osb = OCFS2_SB(inode->i_sb); >>> if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) >>> return NULL; >>> -ret = ocfs2_inode_lock(inode, _bh, 0); >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); >>> if (ret < 0) { >>> if (ret != -ENOENT) >>> mlog_errno(ret); >>> 
return ERR_PTR(ret); >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> acl = ocfs2_get_acl_nolock(inode, type, di_bh); >>> -ocfs2_inode_unlock(inode, 0); >>> +if (!has_locked) {
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/13/2017 02:19 PM, Eric Ren wrote: > Hi! > > On 01/13/2017 12:22 PM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> Commit 743b5f1434f5 ("ocfs2: take inode lock in >>> ocfs2_iop_set/get_acl()") >>> results in a deadlock, as the author "Tariq Saeed" realized shortly >>> after the patch was merged. The discussion happened here >>> (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). >>> >>> >>> The reason why taking cluster inode lock at vfs entry points opens up >>> a self deadlock window, is explained in the previous patch of this >>> series. >>> >>> So far, we have seen two different code paths that have this issue. >>> 1. do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== take PR >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== take PR >>> 2. fchmod|fchmodat >>> chmod_common >>> notify_change >>>ocfs2_setattr <=== take EX >>> posix_acl_chmod >>> get_acl >>> ocfs2_iop_get_acl <=== take PR >>> ocfs2_iop_set_acl <=== take EX >>> >>> Fixes them by adding the tracking logic (in the previous patch) for >>> these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), >>> ocfs2_setattr(). 
>>> >>> Signed-off-by: Eric Ren >>> --- >>> fs/ocfs2/acl.c | 39 ++- >>> fs/ocfs2/file.c | 44 ++-- >>> 2 files changed, 68 insertions(+), 15 deletions(-) >>> >>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c >>> index bed1fcb..c539890 100644 >>> --- a/fs/ocfs2/acl.c >>> +++ b/fs/ocfs2/acl.c >>> @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, >>> struct posix_acl *acl, int type) >>> { >>> struct buffer_head *bh = NULL; >>> int status = 0; >>> - >>> -status = ocfs2_inode_lock(inode, , 1); >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>> if (status < 0) { >>> if (status != -ENOENT) >>> mlog_errno(status); >>> return status; >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> + >>> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); >>> -ocfs2_inode_unlock(inode, 1); >>> + >>> +if (!has_locked) { >>> +ocfs2_remove_holder(lockres, ); >>> +ocfs2_inode_unlock(inode, 1); >>> +} >>> brelse(bh); >>> + >>> return status; >>> } >>> @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct >>> inode *inode, int type) >>> struct buffer_head *di_bh = NULL; >>> struct posix_acl *acl; >>> int ret; >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> osb = OCFS2_SB(inode->i_sb); >>> if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) >>> return NULL; >>> -ret = ocfs2_inode_lock(inode, _bh, 0); >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); >>> if (ret < 0) { >>> if (ret != -ENOENT) >>> mlog_errno(ret); >>> return 
ERR_PTR(ret); >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> acl = ocfs2_get_acl_nolock(inode, type, di_bh); >>> -ocfs2_inode_unlock(inode, 0); >>> +if (!has_locked) { >>> +
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/13/2017 02:12 PM, Eric Ren wrote: > Hi Junxiao! > > On 01/13/2017 11:59 AM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> We are in the situation that we have to avoid recursive cluster locking, >>> but there is no way to check if a cluster lock has been taken by a >>> precess already. >>> >>> Mostly, we can avoid recursive locking by writing code carefully. >>> However, we found that it's very hard to handle the routines that >>> are invoked directly by vfs code. For instance: >>> >>> const struct inode_operations ocfs2_file_iops = { >>> .permission = ocfs2_permission, >>> .get_acl= ocfs2_iop_get_acl, >>> .set_acl= ocfs2_iop_set_acl, >>> }; >>> >>> Both ocfs2_permission() and ocfs2_iop_get_acl() call >>> ocfs2_inode_lock(PR): >>> do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== first time >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== recursive one >>> >>> A deadlock will occur if a remote EX request comes in between two >>> of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: >>> >>> On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in >>> BAST(ocfs2_generic_handle_bast) when downconvert is started >>> on behalf of the remote EX lock request. Another hand, the recursive >>> cluster lock (the second one) will be blocked in in >>> __ocfs2_cluster_lock() >>> because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? >>> because there is no chance for the first cluster lock on this node to be >>> unlocked - we block ourselves in the code path. >>> >>> The idea to fix this issue is mostly taken from gfs2 code. >>> 1. introduce a new field: struct ocfs2_lock_res.l_holders, to >>> keep track of the processes' pid who has taken the cluster lock >>> of this lock resource; >>> 2. 
introduce a new flag for ocfs2_inode_lock_full: >>> OCFS2_META_LOCK_GETBH; >>> it means just getting back disk inode bh for us if we've got cluster >>> lock. >>> 3. export a helper: ocfs2_is_locked_by_me() is used to check if we >>> have got the cluster lock in the upper code path. >>> >>> The tracking logic should be used by some of the ocfs2 vfs's callbacks, >>> to solve the recursive locking issue cuased by the fact that vfs >>> routines >>> can call into each other. >>> >>> The performance penalty of processing the holder list should only be >>> seen >>> at a few cases where the tracking logic is used, such as get/set acl. >>> >>> You may ask what if the first time we got a PR lock, and the second time >>> we want a EX lock? fortunately, this case never happens in the real >>> world, >>> as far as I can see, including permission check, >>> (get|set)_(acl|attr), and >>> the gfs2 code also do so. >>> >>> Signed-off-by: Eric Ren <z...@suse.com> >>> --- >>> fs/ocfs2/dlmglue.c | 47 >>> --- >>> fs/ocfs2/dlmglue.h | 18 ++ >>> fs/ocfs2/ocfs2.h | 1 + >>> 3 files changed, 63 insertions(+), 3 deletions(-) >>> >>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c >>> index 83d576f..500bda4 100644 >>> --- a/fs/ocfs2/dlmglue.c >>> +++ b/fs/ocfs2/dlmglue.c >>> @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct >>> ocfs2_lock_res *res) >>> init_waitqueue_head(>l_event); >>> INIT_LIST_HEAD(>l_blocked_list); >>> INIT_LIST_HEAD(>l_mask_waiters); >>> +INIT_LIST_HEAD(>l_holders); >>> } >>> void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, >>> @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res >>> *res) >>> res->l_flags = 0UL; >>> } >>> +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, >>> + struct ocfs2_holder *oh) >>> +{ >>> +INIT_LIST_HEAD(>oh_list); >>> +oh->oh_owner_pid = get_pid(task_pid(current)); >> struct pid(oh->oh_owner_pid) looks complicated here, why not use >> task_struct(current) or pid_t(current->pid) directly? 
Also I didn't see >> the ref
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/13/2017 02:12 PM, Eric Ren wrote: > Hi Junxiao! > > On 01/13/2017 11:59 AM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> We are in the situation that we have to avoid recursive cluster locking, >>> but there is no way to check if a cluster lock has been taken by a >>> precess already. >>> >>> Mostly, we can avoid recursive locking by writing code carefully. >>> However, we found that it's very hard to handle the routines that >>> are invoked directly by vfs code. For instance: >>> >>> const struct inode_operations ocfs2_file_iops = { >>> .permission = ocfs2_permission, >>> .get_acl= ocfs2_iop_get_acl, >>> .set_acl= ocfs2_iop_set_acl, >>> }; >>> >>> Both ocfs2_permission() and ocfs2_iop_get_acl() call >>> ocfs2_inode_lock(PR): >>> do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== first time >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== recursive one >>> >>> A deadlock will occur if a remote EX request comes in between two >>> of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: >>> >>> On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in >>> BAST(ocfs2_generic_handle_bast) when downconvert is started >>> on behalf of the remote EX lock request. Another hand, the recursive >>> cluster lock (the second one) will be blocked in in >>> __ocfs2_cluster_lock() >>> because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? >>> because there is no chance for the first cluster lock on this node to be >>> unlocked - we block ourselves in the code path. >>> >>> The idea to fix this issue is mostly taken from gfs2 code. >>> 1. introduce a new field: struct ocfs2_lock_res.l_holders, to >>> keep track of the processes' pid who has taken the cluster lock >>> of this lock resource; >>> 2. 
introduce a new flag for ocfs2_inode_lock_full: >>> OCFS2_META_LOCK_GETBH; >>> it means just getting back disk inode bh for us if we've got cluster >>> lock. >>> 3. export a helper: ocfs2_is_locked_by_me() is used to check if we >>> have got the cluster lock in the upper code path. >>> >>> The tracking logic should be used by some of the ocfs2 vfs's callbacks, >>> to solve the recursive locking issue cuased by the fact that vfs >>> routines >>> can call into each other. >>> >>> The performance penalty of processing the holder list should only be >>> seen >>> at a few cases where the tracking logic is used, such as get/set acl. >>> >>> You may ask what if the first time we got a PR lock, and the second time >>> we want a EX lock? fortunately, this case never happens in the real >>> world, >>> as far as I can see, including permission check, >>> (get|set)_(acl|attr), and >>> the gfs2 code also do so. >>> >>> Signed-off-by: Eric Ren >>> --- >>> fs/ocfs2/dlmglue.c | 47 >>> --- >>> fs/ocfs2/dlmglue.h | 18 ++ >>> fs/ocfs2/ocfs2.h | 1 + >>> 3 files changed, 63 insertions(+), 3 deletions(-) >>> >>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c >>> index 83d576f..500bda4 100644 >>> --- a/fs/ocfs2/dlmglue.c >>> +++ b/fs/ocfs2/dlmglue.c >>> @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct >>> ocfs2_lock_res *res) >>> init_waitqueue_head(>l_event); >>> INIT_LIST_HEAD(>l_blocked_list); >>> INIT_LIST_HEAD(>l_mask_waiters); >>> +INIT_LIST_HEAD(>l_holders); >>> } >>> void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, >>> @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res >>> *res) >>> res->l_flags = 0UL; >>> } >>> +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, >>> + struct ocfs2_holder *oh) >>> +{ >>> +INIT_LIST_HEAD(>oh_list); >>> +oh->oh_owner_pid = get_pid(task_pid(current)); >> struct pid(oh->oh_owner_pid) looks complicated here, why not use >> task_struct(current) or pid_t(current->pid) directly? 
Also I didn't see >> the ref count needs
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/05/2017 11:31 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). 
> > Signed-off-by: Eric Ren> --- > fs/ocfs2/acl.c | 39 ++- > fs/ocfs2/file.c | 44 ++-- > 2 files changed, 68 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..c539890 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, 
di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0); > + } > brelse(di_bh); > + > return acl; > } > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..62be75d 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > handle_t *handle = NULL; > struct dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int arg_flags = 0, had_lock; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,13 +1176,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > } > } > > - status = ocfs2_inode_lock(inode, , 1); > + lockres = _I(inode)->ip_inode_lockres; > + had_lock = (ocfs2_is_locked_by_me(lockres) != NULL); If had_lock==true, it is a bug? I think we should BUG_ON for it, that can help us catch bug at the first time. > + if (had_lock) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { >
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/05/2017 11:31 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). 
> > Signed-off-by: Eric Ren > --- > fs/ocfs2/acl.c | 39 ++- > fs/ocfs2/file.c | 44 ++-- > 2 files changed, 68 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..c539890 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, 
di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0); > + } > brelse(di_bh); > + > return acl; > } > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..62be75d 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > handle_t *handle = NULL; > struct dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int arg_flags = 0, had_lock; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,13 +1176,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > } > } > > - status = ocfs2_inode_lock(inode, , 1); > + lockres = _I(inode)->ip_inode_lockres; > + had_lock = (ocfs2_is_locked_by_me(lockres) != NULL); If had_lock==true, it is a bug? I think we should BUG_ON for it, that can help us catch bug at the first time. > + if (had_lock) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/05/2017 11:31 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > precess already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Signed-off-by: Eric Ren> --- > fs/ocfs2/dlmglue.c | 47 --- > fs/ocfs2/dlmglue.h | 18 ++ > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 63 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 83d576f..500bda4 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); struct pid(oh->oh_owner_pid) looks complicated here, why not use task_struct(current) or pid_t(current->pid) directly? Also i didn't see the ref count needs to be considered. 
> + > + spin_lock(>l_lock); > + list_add_tail(>oh_list, >l_holders); > + spin_unlock(>l_lock); > +} > + > +inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + spin_lock(>l_lock); > + list_del(>oh_list); > + spin_unlock(>l_lock); > + > + put_pid(oh->oh_owner_pid); same the above > +} > + > +inline struct ocfs2_holder *ocfs2_is_locked_by_me(struct ocfs2_lock_res > *lockres) Agree with Joseph, return bool looks better. I didn't see how that help debug since the return value is not used. > +{ > + struct ocfs2_holder *oh; > + struct pid *pid; > + > + /* look in the list of holders for one with the current task as owner */ > + spin_lock(>l_lock); > + pid = task_pid(current); > + list_for_each_entry(oh, >l_holders, oh_list) { > + if (oh->oh_owner_pid == pid) > + goto out; > + } > + oh = NULL; > +out: > + spin_unlock(>l_lock); > + return oh; > +} > + > static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, >int level) > { > @@ -2333,8 +2373,9 @@
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/05/2017 11:31 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > process already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Signed-off-by: Eric Ren > --- > fs/ocfs2/dlmglue.c | 47 --- > fs/ocfs2/dlmglue.h | 18 ++ > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 63 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 83d576f..500bda4 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); struct pid(oh->oh_owner_pid) looks complicated here, why not use task_struct(current) or pid_t(current->pid) directly? Also i didn't see the ref count needs to be considered. 
> + > + spin_lock(>l_lock); > + list_add_tail(>oh_list, >l_holders); > + spin_unlock(>l_lock); > +} > + > +inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + spin_lock(>l_lock); > + list_del(>oh_list); > + spin_unlock(>l_lock); > + > + put_pid(oh->oh_owner_pid); same the above > +} > + > +inline struct ocfs2_holder *ocfs2_is_locked_by_me(struct ocfs2_lock_res > *lockres) Agree with Joseph, return bool looks better. I didn't see how that help debug since the return value is not used. > +{ > + struct ocfs2_holder *oh; > + struct pid *pid; > + > + /* look in the list of holders for one with the current task as owner */ > + spin_lock(>l_lock); > + pid = task_pid(current); > + list_for_each_entry(oh, >l_holders, oh_list) { > + if (oh->oh_owner_pid == pid) > + goto out; > + } > + oh = NULL; > +out: > + spin_unlock(>l_lock); > + return oh; > +} > + > static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, >int level) > { > @@ -2333,8 +2373,9 @@ int
kernel panic on next-20160225
Hi, The following panic is triggered when run ocfs2 xattr test on linux-next-20160225. Did anybody ever see this? [ 254.604228] BUG: unable to handle kernel paging request at 0002000800c0 [ 254.605013] IP: [] kmem_cache_alloc+0x78/0x160 [ 254.605013] PGD 7bbe5067 PUD 0 [ 254.605013] Oops: [#1] SMP [ 254.605013] Modules linked in: ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront xen_netfront xen_fbfront xen_blkfront [ 254.605013] CPU: 2 PID: 4044 Comm: mpirun Not tainted 4.5.0-rc5-next-20160225 #1 [ 254.605013] Hardware name: Xen HVM domU, BIOS 4.3.1OVM 05/14/2014 [ 254.605013] task: 88007a521a80 ti: 88007aed task.ti: 88007aed [ 254.605013] RIP: 0010:[] [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP: 0018:88007aed3a48 EFLAGS: 00010282 [ 254.605013] RAX: RBX: RCX: 1991 [ 254.605013] RDX: 1990 RSI: 024000c0 RDI: 0001b330 [ 254.605013] RBP: 88007aed3a98 R08: 88007d29b330 R09: 0002000800c0 [ 254.605013] R10: 000c51376d87 R11: 8800792cac38 R12: 88007cc30f00 [ 254.605013] R13: 024000c0 R14: 811b053f R15: 88007aed3ce7 [ 254.605013] FS: () GS:88007d28() knlGS: [ 254.605013] CS: 0010 DS: ES: CR0: 80050033 [ 254.605013] CR2: 0002000800c0 CR3: 7aeb2000 CR4: 000406e0 [ 254.605013] Stack: [ 254.605013] 13082000 88007aed3d28 0079 0001 [ 254.605013] 2f2f2f2f 8800792cac00 88007aed3d38 0101 [ 254.605013] 88007a5e2000 88007aed3ce7 88007aed3b08 811b053f [ 254.605013] Call Trace: [ 254.605013] [] __d_alloc+0x2f/0x1a0 [ 254.605013] [] ? unlazy_walk+0xe2/0x160 [ 254.605013] [] d_alloc+0x17/0x80 [ 254.605013] [] lookup_dcache+0x8a/0xc0 [ 254.605013] [] ? __alloc_pages_nodemask+0x173/0xeb0 [ 254.605013] [] path_openat+0x3c3/0x1210 [ 254.605013] [] ? radix_tree_lookup_slot+0x13/0x30 [ 254.605013] [] ? find_get_entry+0x32/0xc0 [ 254.605013] [] ? atime_needs_update+0x55/0xe0 [ 254.605013] [] ? filemap_fault+0xd1/0x4b0 [ 254.605013] [] ? 
do_set_pte+0xb6/0x140 [ 254.605013] [] do_filp_open+0x80/0xe0 [ 254.605013] [] ? __alloc_fd+0x48/0x1a0 [ 254.605013] [] ? getname_flags+0x7a/0x1e0 [ 254.605013] [] do_sys_open+0x110/0x200 [ 254.605013] [] SyS_open+0x19/0x20 [ 254.605013] [] do_syscall_64+0x72/0x230 [ 254.605013] [] ? __do_page_fault+0x177/0x430 [ 254.605013] [] entry_SYSCALL64_slow_path+0x25/0x25 [ 254.605013] Code: 05 e6 77 e7 7e 4d 8b 08 49 8b 40 10 4d 85 c9 0f 84 dd 00 00 00 48 85 c0 0f 84 d4 00 00 00 49 63 44 24 20 49 8b 3c 24 48 8d 4a 01 <49> 8b 1c 01 4c 89 c8 65 48 0f c7 0f 0f 94 c0 3c 01 75 b6 49 63 [ 254.605013] RIP [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP [ 254.605013] CR2: 0002000800c0 [ 254.792273] ---[ end trace 823969e602e4aaac ]--- Thanks, Junxiao.
kernel panic on next-20160225
Hi, The following panic is triggered when run ocfs2 xattr test on linux-next-20160225. Did anybody ever see this? [ 254.604228] BUG: unable to handle kernel paging request at 0002000800c0 [ 254.605013] IP: [] kmem_cache_alloc+0x78/0x160 [ 254.605013] PGD 7bbe5067 PUD 0 [ 254.605013] Oops: [#1] SMP [ 254.605013] Modules linked in: ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront xen_netfront xen_fbfront xen_blkfront [ 254.605013] CPU: 2 PID: 4044 Comm: mpirun Not tainted 4.5.0-rc5-next-20160225 #1 [ 254.605013] Hardware name: Xen HVM domU, BIOS 4.3.1OVM 05/14/2014 [ 254.605013] task: 88007a521a80 ti: 88007aed task.ti: 88007aed [ 254.605013] RIP: 0010:[] [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP: 0018:88007aed3a48 EFLAGS: 00010282 [ 254.605013] RAX: RBX: RCX: 1991 [ 254.605013] RDX: 1990 RSI: 024000c0 RDI: 0001b330 [ 254.605013] RBP: 88007aed3a98 R08: 88007d29b330 R09: 0002000800c0 [ 254.605013] R10: 000c51376d87 R11: 8800792cac38 R12: 88007cc30f00 [ 254.605013] R13: 024000c0 R14: 811b053f R15: 88007aed3ce7 [ 254.605013] FS: () GS:88007d28() knlGS: [ 254.605013] CS: 0010 DS: ES: CR0: 80050033 [ 254.605013] CR2: 0002000800c0 CR3: 7aeb2000 CR4: 000406e0 [ 254.605013] Stack: [ 254.605013] 13082000 88007aed3d28 0079 0001 [ 254.605013] 2f2f2f2f 8800792cac00 88007aed3d38 0101 [ 254.605013] 88007a5e2000 88007aed3ce7 88007aed3b08 811b053f [ 254.605013] Call Trace: [ 254.605013] [] __d_alloc+0x2f/0x1a0 [ 254.605013] [] ? unlazy_walk+0xe2/0x160 [ 254.605013] [] d_alloc+0x17/0x80 [ 254.605013] [] lookup_dcache+0x8a/0xc0 [ 254.605013] [] ? __alloc_pages_nodemask+0x173/0xeb0 [ 254.605013] [] path_openat+0x3c3/0x1210 [ 254.605013] [] ? radix_tree_lookup_slot+0x13/0x30 [ 254.605013] [] ? find_get_entry+0x32/0xc0 [ 254.605013] [] ? atime_needs_update+0x55/0xe0 [ 254.605013] [] ? filemap_fault+0xd1/0x4b0 [ 254.605013] [] ? 
do_set_pte+0xb6/0x140 [ 254.605013] [] do_filp_open+0x80/0xe0 [ 254.605013] [] ? __alloc_fd+0x48/0x1a0 [ 254.605013] [] ? getname_flags+0x7a/0x1e0 [ 254.605013] [] do_sys_open+0x110/0x200 [ 254.605013] [] SyS_open+0x19/0x20 [ 254.605013] [] do_syscall_64+0x72/0x230 [ 254.605013] [] ? __do_page_fault+0x177/0x430 [ 254.605013] [] entry_SYSCALL64_slow_path+0x25/0x25 [ 254.605013] Code: 05 e6 77 e7 7e 4d 8b 08 49 8b 40 10 4d 85 c9 0f 84 dd 00 00 00 48 85 c0 0f 84 d4 00 00 00 49 63 44 24 20 49 8b 3c 24 48 8d 4a 01 <49> 8b 1c 01 4c 89 c8 65 48 0f c7 0f 0f 94 c0 3c 01 75 b6 49 63 [ 254.605013] RIP [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP [ 254.605013] CR2: 0002000800c0 [ 254.792273] ---[ end trace 823969e602e4aaac ]--- Thanks, Junxiao.
Re: linux-next: kernel panic in ipv6_defrag
On 12/23/2015 04:59 PM, Florian Westphal wrote: > Junxiao Bi wrote: >> The following panic happened when I run ocfs2-test on linux-next. Kernel >> config is attached. >> >> [64910.905501] BUG: unable to handle kernel NULL pointer dereference at >> (null) >> [64910.906466] IP: [] nf_ct_frag6_gather+0x7ad/0x9c0 > [..] >> ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) >> iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront >> xen_netfront xen_fbfront xen_blkfront [last unloaded: ocfs2_stackglue] >> [64910.906466] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O >> 4.4.0-rc5-next-20151217 #1 > > Seems like this snapshot still lacks > > e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > ("netfilter: ipv6: nf_defrag: fix NULL deref panic"), > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/net/ipv6/netfilter/nf_conntrack_reasm.c?id=e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > > Its included starting with next-20151221. > > Please report back if it occurs with above commit present. Looks issue resolved with this fix. Thank you. > > Thanks. > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: kernel panic in ipv6_defrag
On 12/23/2015 04:59 PM, Florian Westphal wrote: > Junxiao Bi <junxiao...@oracle.com> wrote: >> The following panic happened when I run ocfs2-test on linux-next. Kernel >> config is attached. >> >> [64910.905501] BUG: unable to handle kernel NULL pointer dereference at >> (null) >> [64910.906466] IP: [] nf_ct_frag6_gather+0x7ad/0x9c0 > [..] >> ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) >> iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront >> xen_netfront xen_fbfront xen_blkfront [last unloaded: ocfs2_stackglue] >> [64910.906466] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O >> 4.4.0-rc5-next-20151217 #1 > > Seems like this snapshot still lacks > > e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > ("netfilter: ipv6: nf_defrag: fix NULL deref panic"), > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/net/ipv6/netfilter/nf_conntrack_reasm.c?id=e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > > Its included starting with next-20151221. > > Please report back if it occurs with above commit present. Looks issue resolved with this fix. Thank you. > > Thanks. > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/25/2015 01:04 PM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> Hi Mark, >> >> On 11/25/2015 06:16 AM, Mark Fasheh wrote: >>> Hi Junxiao, >>> >>> On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >>>> Hi Gang, >>>> >>>> This is not like a right patch. >>>> First, online file check only checks inode's block number, valid flag, >>>> fs generation value, and meta ecc. I never see a real corruption >>>> happened only on this field, if these fields are corrupted, that means >>>> something bad may happen on other place. So fix this field may not help >>>> and even cause corruption more hard. >>> >>> I agree that these are rather uncommon, we might even consider removing the >>> VALID_FL fixup. I definitely don't think we're ready for anything more >>> complicated than this though either. We kind of have to start somewhere too. >>> >> Yes, the fix is too simple, and just a start, I think we'd better wait >> more useful parts done before merging it. > I agree, just remark VALID_FL flag to fix this field is too simple, we should > delay this field fix before > I have a flawless solution, I will remove these lines code in the first > version patches. In the future submits, > I also hope your guys to help review the code carefully, shout out your > comments when you doubt somewhere. Sure. > > > >>> >>>> Second, the repair way is wrong. In >>>> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >>>> match the ones in memory, the ones in memory are used to update the disk >>>> fields. The question is how do you know these field in memory are >>>> right(they may be the real corrupted ones)? >>> >>> Your second point (and the last part of your 1st point) makes a good >>> argument for why this shouldn't happen automatically. Some of these >>> corruptions might require a human to look at the log and decide what to do. >>> Especially as you point out, where we might not know where the source of the >>> corruption is. 
And if the human can't figure it out, then it's probably time >>> to unmount and fsck. >> The point is that the fix way is wrong, just flush memory info to disk >> is not right. I agree online fsck is good feature, but need carefully >> design, it should not involve more corruptions. A rough idea from mine >> is that maybe we need some "freeze" mechanism in fs, which can hang all >> fs ops and let fs stop at a safe area. After freeze fs, we can do some >> fsck work on it and these works should not cost lots time. What's your idea? > If we need to touch some global data structures, freezing fs can be > considered when we can't > get any way in case using the locks. > If we only handle some independent problem, we just need to lock the related > data structures. Hmm, I am not sure whether it's hard to decide an independent issue. Thanks, Junxiao. > >> >> Thanks, >> Junxiao. >> >>> >>> Thanks, >>> --Mark >>> >>> -- >>> Mark Fasheh >>> > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/25/2015 11:29 AM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >>> Hi Gang, >>> >>> On 11/03/2015 03:54 PM, Gang He wrote: >>>> Hi Junxiao, >>>> >>>> Thank for your reviewing. >>>> Current design, we use a sysfile as a interface to check/fix a file (via >> pass a ino number). >>>> But, this operation is manually triggered by user, instead of >>>> automatically >> fix in the kernel. >>>> Why? >>>> 1) we should let users make this decision, since some users do not want to >> fix when encountering a file system corruption, maybe they want to keep the >> file system unchanged for a further investigation. >>> If user don't want this, they should not use error=continue option, let >>> fs go after a corruption is very dangerous. >> >> Maybe we need another errors=XXX flag (maybe errors=fix)? >> >> You both make good points, here's what I gather from the conversation: >> >> - Some customers would be sad if they have to manually fix corruptions. >>This takes effort on their part, and if the FS can handle it >>automatically, it should. >> >> - There are valid concerns that automatically fixing things is a change in >>behavior that might not be welcome, or worse might lead to unforseeable >>circumstances. >> >> - I will add that fixing things automatically implies checking them >>automatically which could introduce some performance impact depending on >>how much checking we're doing. >> >> So if the user wants errors to be fixed automatically, they could mount with >> errros=fix, and everyone else would have no change in behavior unless they >> wanted to make use of the new feature. > That is what I want to say, add a mount option to let users to decide. Here, > I want to split "error=fix" > mount option task out from online file check feature, I think this part > should be a independent feature. 
> We can implement this feature after online file check is done, I want to > split the feature into some more > detailed features, implement them one by one. Do you agree this point? With error=fix, when a possible corruption is found, online fsck will start to check and fix things. So this doesn't looks like a independent feature. Thanks, Junxiao. > >> >> >>>> 2) frankly speaking, this feature will probably bring a second corruption >> if there is some error in the code, I do not suggest to use automatically >> fix >> by default in the first version. >>> I think if this feature could bring more corruption, then this should be >>> fixed first. >> >> Btw, I am pretty sure that Gang is referring to the feature being new and >> thus more likely to have problems. There is nothing I see in here that is >> file system corrupting. >> --Mark >> >> >> -- >> Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
On 11/25/2015 05:46 AM, Mark Fasheh wrote: > On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> On 11/03/2015 03:54 PM, Gang He wrote: >>> Hi Junxiao, >>> >>> Thank for your reviewing. >>> Current design, we use a sysfile as a interface to check/fix a file (via >>> pass a ino number). >>> But, this operation is manually triggered by user, instead of automatically >>> fix in the kernel. >>> Why? >>> 1) we should let users make this decision, since some users do not want to >>> fix when encountering a file system corruption, maybe they want to keep the >>> file system unchanged for a further investigation. >> If user don't want this, they should not use error=continue option, let >> fs go after a corruption is very dangerous. > > Maybe we need another errors=XXX flag (maybe errors=fix)? Sound great. This is a good option since user may have not enough knowledge whether to fix the found issue. Thanks, Junxiao. > > You both make good points, here's what I gather from the conversation: > > - Some customers would be sad if they have to manually fix corruptions. >This takes effort on their part, and if the FS can handle it >automatically, it should. > > - There are valid concerns that automatically fixing things is a change in >behavior that might not be welcome, or worse might lead to unforseeable >circumstances. > > - I will add that fixing things automatically implies checking them >automatically which could introduce some performance impact depending on >how much checking we're doing. > > So if the user wants errors to be fixed automatically, they could mount with > errros=fix, and everyone else would have no change in behavior unless they > wanted to make use of the new feature. > > >>> 2) frankly speaking, this feature will probably bring a second corruption >>> if there is some error in the code, I do not suggest to use automatically >>> fix by default in the first version. 
>> I think if this feature could bring more corruption, then this should be >> fixed first. > > Btw, I am pretty sure that Gang is referring to the feature being new and > thus more likely to have problems. There is nothing I see in here that is > file system corrupting. > --Mark > > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Mark, On 11/25/2015 06:16 AM, Mark Fasheh wrote: > Hi Junxiao, > > On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > > I agree that these are rather uncommon, we might even consider removing the > VALID_FL fixup. I definitely don't think we're ready for anything more > complicated than this though either. We kind of have to start somewhere too. > Yes, the fix is too simple, and just a start, I think we'd better wait more useful parts done before merging it. > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > > Your second point (and the last part of your 1st point) makes a good > argument for why this shouldn't happen automatically. Some of these > corruptions might require a human to look at the log and decide what to do. > Especially as you point out, where we might not know where the source of the > corruption is. And if the human can't figure it out, then it's probably time > to unmount and fsck. The point is that the fix way is wrong, just flush memory info to disk is not right. I agree online fsck is good feature, but need carefully design, it should not involve more corruptions. A rough idea from mine is that maybe we need some "freeze" mechanism in fs, which can hang all fs ops and let fs stop at a safe area. After freeze fs, we can do some fsck work on it and these works should not cost lots time. What's your idea? 
Thanks, Junxiao. > > Thanks, > --Mark > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Mark, On 11/25/2015 06:16 AM, Mark Fasheh wrote: > Hi Junxiao, > > On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > > I agree that these are rather uncommon, we might even consider removing the > VALID_FL fixup. I definitely don't think we're ready for anything more > complicated than this though either. We kind of have to start somewhere too. > Yes, the fix is too simple, and just a start, I think we'd better wait more useful parts done before merging it. > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > > Your second point (and the last part of your 1st point) makes a good > argument for why this shouldn't happen automatically. Some of these > corruptions might require a human to look at the log and decide what to do. > Especially as you point out, where we might not know where the source of the > corruption is. And if the human can't figure it out, then it's probably time > to unmount and fsck. The point is that the fix way is wrong, just flush memory info to disk is not right. I agree online fsck is good feature, but need carefully design, it should not involve more corruptions. A rough idea from mine is that maybe we need some "freeze" mechanism in fs, which can hang all fs ops and let fs stop at a safe area. After freeze fs, we can do some fsck work on it and these works should not cost lots time. What's your idea? 
Thanks, Junxiao. > > Thanks, > --Mark > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
On 11/25/2015 05:46 AM, Mark Fasheh wrote: > On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> On 11/03/2015 03:54 PM, Gang He wrote: >>> Hi Junxiao, >>> >>> Thank for your reviewing. >>> Current design, we use a sysfile as a interface to check/fix a file (via >>> pass a ino number). >>> But, this operation is manually triggered by user, instead of automatically >>> fix in the kernel. >>> Why? >>> 1) we should let users make this decision, since some users do not want to >>> fix when encountering a file system corruption, maybe they want to keep the >>> file system unchanged for a further investigation. >> If user don't want this, they should not use error=continue option, let >> fs go after a corruption is very dangerous. > > Maybe we need another errors=XXX flag (maybe errors=fix)? Sound great. This is a good option since user may have not enough knowledge whether to fix the found issue. Thanks, Junxiao. > > You both make good points, here's what I gather from the conversation: > > - Some customers would be sad if they have to manually fix corruptions. >This takes effort on their part, and if the FS can handle it >automatically, it should. > > - There are valid concerns that automatically fixing things is a change in >behavior that might not be welcome, or worse might lead to unforseeable >circumstances. > > - I will add that fixing things automatically implies checking them >automatically which could introduce some performance impact depending on >how much checking we're doing. > > So if the user wants errors to be fixed automatically, they could mount with > errros=fix, and everyone else would have no change in behavior unless they > wanted to make use of the new feature. > > >>> 2) frankly speaking, this feature will probably bring a second corruption >>> if there is some error in the code, I do not suggest to use automatically >>> fix by default in the first version. 
>> I think if this feature could bring more corruption, then this should be >> fixed first. > > Btw, I am pretty sure that Gang is referring to the feature being new and > thus more likely to have problems. There is nothing I see in here that is > file system corrupting. > --Mark > > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/25/2015 11:29 AM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >>> Hi Gang, >>> >>> On 11/03/2015 03:54 PM, Gang He wrote: >>>> Hi Junxiao, >>>> >>>> Thank for your reviewing. >>>> Current design, we use a sysfile as a interface to check/fix a file (via >> pass a ino number). >>>> But, this operation is manually triggered by user, instead of >>>> automatically >> fix in the kernel. >>>> Why? >>>> 1) we should let users make this decision, since some users do not want to >> fix when encountering a file system corruption, maybe they want to keep the >> file system unchanged for a further investigation. >>> If user don't want this, they should not use error=continue option, let >>> fs go after a corruption is very dangerous. >> >> Maybe we need another errors=XXX flag (maybe errors=fix)? >> >> You both make good points, here's what I gather from the conversation: >> >> - Some customers would be sad if they have to manually fix corruptions. >>This takes effort on their part, and if the FS can handle it >>automatically, it should. >> >> - There are valid concerns that automatically fixing things is a change in >>behavior that might not be welcome, or worse might lead to unforseeable >>circumstances. >> >> - I will add that fixing things automatically implies checking them >>automatically which could introduce some performance impact depending on >>how much checking we're doing. >> >> So if the user wants errors to be fixed automatically, they could mount with >> errros=fix, and everyone else would have no change in behavior unless they >> wanted to make use of the new feature. > That is what I want to say, add a mount option to let users to decide. Here, > I want to split "error=fix" > mount option task out from online file check feature, I think this part > should be a independent feature. 
> We can implement this feature after online file check is done, I want to > split the feature into some more > detailed features, implement them one by one. Do you agree this point? With error=fix, when a possible corruption is found, online fsck will start to check and fix things. So this doesn't looks like a independent feature. Thanks, Junxiao. > >> >> >>>> 2) frankly speaking, this feature will probably bring a second corruption >> if there is some error in the code, I do not suggest to use automatically >> fix >> by default in the first version. >>> I think if this feature could bring more corruption, then this should be >>> fixed first. >> >> Btw, I am pretty sure that Gang is referring to the feature being new and >> thus more likely to have problems. There is nothing I see in here that is >> file system corrupting. >> --Mark >> >> >> -- >> Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/25/2015 01:04 PM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> Hi Mark, >> >> On 11/25/2015 06:16 AM, Mark Fasheh wrote: >>> Hi Junxiao, >>> >>> On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >>>> Hi Gang, >>>> >>>> This is not like a right patch. >>>> First, online file check only checks inode's block number, valid flag, >>>> fs generation value, and meta ecc. I never see a real corruption >>>> happened only on this field, if these fields are corrupted, that means >>>> something bad may happen on other place. So fix this field may not help >>>> and even cause corruption more hard. >>> >>> I agree that these are rather uncommon, we might even consider removing the >>> VALID_FL fixup. I definitely don't think we're ready for anything more >>> complicated than this though either. We kind of have to start somewhere too. >>> >> Yes, the fix is too simple, and just a start, I think we'd better wait >> more useful parts done before merging it. > I agree, just remark VALID_FL flag to fix this field is too simple, we should > delay this field fix before > I have a flawless solution, I will remove these lines code in the first > version patches. In the future submits, > I also hope your guys to help review the code carefully, shout out your > comments when you doubt somewhere. Sure. > > > >>> >>>> Second, the repair way is wrong. In >>>> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >>>> match the ones in memory, the ones in memory are used to update the disk >>>> fields. The question is how do you know these field in memory are >>>> right(they may be the real corrupted ones)? >>> >>> Your second point (and the last part of your 1st point) makes a good >>> argument for why this shouldn't happen automatically. Some of these >>> corruptions might require a human to look at the log and decide what to do. >>> Especially as you point out, where we might not know where the source of the >>> corruption is. 
And if the human can't figure it out, then it's probably time >>> to unmount and fsck. >> The point is that the fix way is wrong, just flush memory info to disk >> is not right. I agree online fsck is good feature, but need carefully >> design, it should not involve more corruptions. A rough idea from mine >> is that maybe we need some "frezee" mechanism in fs, which can hung all >> fs op and let fs stop at a safe area. After freeze fs, we can do some >> fsck work on it and these works should not cost lots time. What's your idea? > If we need to touch some global data structures, freezing fs can be > considered when we can't > get any way in case using the locks. > If we only handle some independent problem, we just need to lock the related > data structures. Hmm, I am not sure whether it's hard to decide an independent issue. Thanks, Junxiao. > >> >> Thanks, >> Junxiao. >> >>> >>> Thanks, >>> --Mark >>> >>> -- >>> Mark Fasheh >>> > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:47 PM, Gang He wrote: > > > >> On 11/03/2015 04:15 PM, Gang He wrote: >>> Hello Junxiao, >>> >>> See my comments inline. >>> >>> >> Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. >>> This online file check/fix feature is used to check/fix some light file >>> meta >> block corruption, instead of turning a file system off and using fsck.ocfs2. >> What's light meta block corruption? Do you have a case about it? >>> e.g. meta ecc error, we really need not to use fsck.ocfs2. >>> of course, this feature does not replace fsck.ocfs2 and touch some >> complicated meta block problems, if there is some potential problem in some >> areas, we can discuss them one by one. >>> >>> >>> Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? >>> Here, if the inode block was corrupted, the file system is not able to load >> it into the memory. >> How do you know inode block corrupted? If bh for inode block is >> overwritten, i mean bh corrupted, the repair will corrupted a good inode >> block. > You know, the meta block is only validated when the file system loads the > block from disk to memory. > If the inode object is in the memory, we consider this inode block is OK. This assuming is not true as there are always bugs. Bugs can make inode object in memory bad and corrupted the fs when repair the inode. Thanks, Junxiao. 
> If the inode is not loaded by the file system via the normal way, the file > system will print a kernel error log to tell which ino is corrupted. > we will use ocfs2_filecheck_repair_inode_block() function to fix the inode > block before loading. > > Thanks > Gang > >> >> Thanks, >> Junxiao. >> >>> ocfs2_filecheck_repair_inode_block() will able to load it into the memory, >> since it try to fix these light-level problem before loading. >>> if the fix is OK, the changed meta-block can pass the block-validate >>> function >> and load into the memory as a inode object. >>> Since the file system is under a cluster environment, we have to use some >> existing function and code path to keep these block operation under a >> cluster >> lock. >>> >>> >>> Thanks >>> Gang >>> Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + 
mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if (le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > +
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:15 PM, Gang He wrote: > Hello Junxiao, > > See my comments inline. > > >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > This online file check/fix feature is used to check/fix some light file meta > block corruption, instead of turning a file system off and using fsck.ocfs2. What's light meta block corruption? Do you have a case about it? > e.g. meta ecc error, we really need not to use fsck.ocfs2. > of course, this feature does not replace fsck.ocfs2 and touch some > complicated meta block problems, if there is some potential problem in some > areas, we can discuss them one by one. > > > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > Here, if the inode block was corrupted, the file system is not able to load > it into the memory. How do you know inode block corrupted? If bh for inode block is overwritten, i mean bh corrupted, the repair will corrupted a good inode block. Thanks, Junxiao. > ocfs2_filecheck_repair_inode_block() will able to load it into the memory, > since it try to fix these light-level problem before loading. > if the fix is OK, the changed meta-block can pass the block-validate function > and load into the memory as a inode object. > Since the file system is under a cluster environment, we have to use some > existing function and code path to keep these block operation under a cluster > lock. > > > Thanks > Gang > >> >> Thanks, >> Junxiao. 
>> On 10/28/2015 02:26 PM, Gang He wrote: >>> +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, >>> + struct buffer_head *bh) >>> +{ >>> + int rc; >>> + int changed = 0; >>> + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; >>> + >>> + rc = ocfs2_filecheck_validate_inode_block(sb, bh); >>> + /* Can't fix invalid inode block */ >>> + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) >>> + return rc; >>> + >>> + trace_ocfs2_filecheck_repair_inode_block( >>> + (unsigned long long)bh->b_blocknr); >>> + >>> + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || >>> + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { >>> + mlog(ML_ERROR, >>> + "Filecheck: try to repair dinode #%llu on readonly >>> filesystem\n", >>> + (unsigned long long)bh->b_blocknr); >>> + return -OCFS2_FILECHECK_ERR_READONLY; >>> + } >>> + >>> + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { >>> + di->i_blkno = cpu_to_le64(bh->b_blocknr); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", >>> + (unsigned long long)bh->b_blocknr, >>> + (unsigned long long)le64_to_cpu(di->i_blkno)); >>> + } >>> + >>> + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { >>> + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is >>> set\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + if (le32_to_cpu(di->i_fs_generation) != >>> + OCFS2_SB(sb)->fs_generation) { >>> + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: fs_generation to %u\n", >>> + (unsigned long long)bh->b_blocknr, >>> + le32_to_cpu(di->i_fs_generation)); >>> + } >>> + >>> + if (changed || >>> + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { >>> + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); >>> + mark_buffer_dirty(bh); >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: compute meta 
ecc\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + return 0; >>> +} > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/03/2015 03:54 PM, Gang He wrote: > Hi Junxiao, > > Thank for your reviewing. > Current design, we use a sysfile as a interface to check/fix a file (via pass > a ino number). > But, this operation is manually triggered by user, instead of automatically > fix in the kernel. > Why? > 1) we should let users make this decision, since some users do not want to > fix when encountering a file system corruption, maybe they want to keep the > file system unchanged for a further investigation. If user don't want this, they should not use error=continue option, let fs go after a corruption is very dangerous. > 2) frankly speaking, this feature will probably bring a second corruption if > there is some error in the code, I do not suggest to use automatically fix by > default in the first version. I think if this feature could bring more corruption, then this should be fixed first. Thanks, Junxiao > 3) in the future, if this feature is well proved, we can add a mount option > to make this automatically fix enabled. > > > Thanks > Gang > > > >> Hi Gang, >> >> I didn't see a need to add a sysfs file for the check and repair. This >> leaves a hard problem for customer to decide. How they decide whether >> they should repair the bad inode since this may cause corruption even >> harder? >> I think the error should be fixed by this feature automaticlly if repair >> helps, of course this can be done only when error=continue is enabled or >> add some mount option for it. >> >> Thanks, >> Junxiao. >> >> On 10/28/2015 02:25 PM, Gang He wrote: >>> Implement online file check sysfile interfaces, e.g. >>> how to create the related sysfile according to device name, >>> how to display/handle file check request from the sysfile. 
>>> >>> Signed-off-by: Gang He >>> --- >>> fs/ocfs2/Makefile| 3 +- >>> fs/ocfs2/filecheck.c | 566 >> +++ >>> fs/ocfs2/filecheck.h | 48 + >>> fs/ocfs2/inode.h | 3 + >>> 4 files changed, 619 insertions(+), 1 deletion(-) >>> create mode 100644 fs/ocfs2/filecheck.c >>> create mode 100644 fs/ocfs2/filecheck.h >>> >>> diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile >>> index ce210d4..e27e652 100644 >>> --- a/fs/ocfs2/Makefile >>> +++ b/fs/ocfs2/Makefile >>> @@ -41,7 +41,8 @@ ocfs2-objs := \ >>> quota_local.o \ >>> quota_global.o \ >>> xattr.o \ >>> - acl.o >>> + acl.o \ >>> + filecheck.o >>> >>> ocfs2_stackglue-objs := stackglue.o >>> ocfs2_stack_o2cb-objs := stack_o2cb.o >>> diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c >>> new file mode 100644 >>> index 000..f12ed1f >>> --- /dev/null >>> +++ b/fs/ocfs2/filecheck.c >>> @@ -0,0 +1,566 @@ >>> +/* -*- mode: c; c-basic-offset: 8; -*- >>> + * vim: noexpandtab sw=8 ts=8 sts=0: >>> + * >>> + * filecheck.c >>> + * >>> + * Code which implements online file check. >>> + * >>> + * Copyright (C) 2015 Novell. All rights reserved. >>> + * >>> + * This program is free software; you can redistribute it and/or >>> + * modify it under the terms of the GNU General Public >>> + * License as published by the Free Software Foundation, version 2. >>> + * >>> + * This program is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + * General Public License for more details. >>> + */ >>> + >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> + >>> +#include "ocfs2.h" >>> +#include "ocfs2_fs.h" >>> +#include "stackglue.h" >>> +#include "inode.h" >>> + >>> +#include "filecheck.h" >>> + >>> + >>> +/* File check error strings, >>> + * must correspond with error number in header file. 
>>> + */ >>> +static const char * const ocfs2_filecheck_errs[] = { >>> + "SUCCESS", >>> + "FAILED", >>> + "INPROGRESS", >>> + "READONLY", >>> + "INVALIDINO", >>> + "BLOCKECC", >>> + "BLOCKNO", >>> + "VALIDFLAG", >>> + "GENERATION", >>> + "UNSUPPORTED" >>> +}; >>> + >>> +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); >>> +static LIST_HEAD(ocfs2_filecheck_sysfs_list); >>> + >>> +struct ocfs2_filecheck { >>> + struct list_head fc_head; /* File check entry list head */ >>> + spinlock_t fc_lock; >>> + unsigned int fc_max;/* Maximum number of entry in list */ >>> + unsigned int fc_size; /* Current entry count in list */ >>> + unsigned int fc_done; /* File check entries are done in list */ >>> +}; >>> + >>> +struct ocfs2_filecheck_sysfs_entry { >>> + struct list_head fs_list; >>> + atomic_t fs_count; >>> + struct super_block *fs_sb; >>> + struct kset *fs_kset; >>> + struct ocfs2_filecheck *fs_fcheck; >>> +}; >>> + >>> +#define OCFS2_FILECHECK_MAXSIZE100 >>> +#define OCFS2_FILECHECK_MINSIZE10 >>> + >>> +/* File
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:47 PM, Gang He wrote: > > > >> On 11/03/2015 04:15 PM, Gang He wrote: >>> Hello Junxiao, >>> >>> See my comments inline. >>> >>> >> Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. >>> This online file check/fix feature is used to check/fix some light file >>> meta >> block corruption, instead of turning a file system off and using fsck.ocfs2. >> What's light meta block corruption? Do you have a case about it? >>> e.g. meta ecc error, we really need not to use fsck.ocfs2. >>> of course, this feature does not replace fsck.ocfs2 and touch some >> complicated meta block problems, if there is some potential problem in some >> areas, we can discuss them one by one. >>> >>> >>> Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? >>> Here, if the inode block was corrupted, the file system is not able to load >> it into the memory. >> How do you know inode block corrupted? If bh for inode block is >> overwritten, i mean bh corrupted, the repair will corrupted a good inode >> block. > You know, the meta block is only validated when the file system loads the > block from disk to memory. > If the inode object is in the memory, we consider this inode block is OK. This assuming is not true as there are always bugs. Bugs can make inode object in memory bad and corrupted the fs when repair the inode. Thanks, Junxiao. 
> If the inode is not loaded by the file system via the normal way, the file > system will print a kernel error log to tell which ino is corrupted. > we will use ocfs2_filecheck_repair_inode_block() function to fix the inode > block before loading. > > Thanks > Gang > >> >> Thanks, >> Junxiao. >> >>> ocfs2_filecheck_repair_inode_block() will able to load it into the memory, >> since it try to fix these light-level problem before loading. >>> if the fix is OK, the changed meta-block can pass the block-validate >>> function >> and load into the memory as a inode object. >>> Since the file system is under a cluster environment, we have to use some >> existing function and code path to keep these block operation under a >> cluster >> lock. >>> >>> >>> Thanks >>> Gang >>> Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + 
mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if (le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > +
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/03/2015 03:54 PM, Gang He wrote: > Hi Junxiao, > > Thank for your reviewing. > Current design, we use a sysfile as a interface to check/fix a file (via pass > a ino number). > But, this operation is manually triggered by user, instead of automatically > fix in the kernel. > Why? > 1) we should let users make this decision, since some users do not want to > fix when encountering a file system corruption, maybe they want to keep the > file system unchanged for a further investigation. If user don't want this, they should not use error=continue option, let fs go after a corruption is very dangerous. > 2) frankly speaking, this feature will probably bring a second corruption if > there is some error in the code, I do not suggest to use automatically fix by > default in the first version. I think if this feature could bring more corruption, then this should be fixed first. Thanks, Junxiao > 3) in the future, if this feature is well proved, we can add a mount option > to make this automatically fix enabled. > > > Thanks > Gang > > > >> Hi Gang, >> >> I didn't see a need to add a sysfs file for the check and repair. This >> leaves a hard problem for customer to decide. How they decide whether >> they should repair the bad inode since this may cause corruption even >> harder? >> I think the error should be fixed by this feature automaticlly if repair >> helps, of course this can be done only when error=continue is enabled or >> add some mount option for it. >> >> Thanks, >> Junxiao. >> >> On 10/28/2015 02:25 PM, Gang He wrote: >>> Implement online file check sysfile interfaces, e.g. >>> how to create the related sysfile according to device name, >>> how to display/handle file check request from the sysfile. 
>>> >>> Signed-off-by: Gang He>>> --- >>> fs/ocfs2/Makefile| 3 +- >>> fs/ocfs2/filecheck.c | 566 >> +++ >>> fs/ocfs2/filecheck.h | 48 + >>> fs/ocfs2/inode.h | 3 + >>> 4 files changed, 619 insertions(+), 1 deletion(-) >>> create mode 100644 fs/ocfs2/filecheck.c >>> create mode 100644 fs/ocfs2/filecheck.h >>> >>> diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile >>> index ce210d4..e27e652 100644 >>> --- a/fs/ocfs2/Makefile >>> +++ b/fs/ocfs2/Makefile >>> @@ -41,7 +41,8 @@ ocfs2-objs := \ >>> quota_local.o \ >>> quota_global.o \ >>> xattr.o \ >>> - acl.o >>> + acl.o \ >>> + filecheck.o >>> >>> ocfs2_stackglue-objs := stackglue.o >>> ocfs2_stack_o2cb-objs := stack_o2cb.o >>> diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c >>> new file mode 100644 >>> index 000..f12ed1f >>> --- /dev/null >>> +++ b/fs/ocfs2/filecheck.c >>> @@ -0,0 +1,566 @@ >>> +/* -*- mode: c; c-basic-offset: 8; -*- >>> + * vim: noexpandtab sw=8 ts=8 sts=0: >>> + * >>> + * filecheck.c >>> + * >>> + * Code which implements online file check. >>> + * >>> + * Copyright (C) 2015 Novell. All rights reserved. >>> + * >>> + * This program is free software; you can redistribute it and/or >>> + * modify it under the terms of the GNU General Public >>> + * License as published by the Free Software Foundation, version 2. >>> + * >>> + * This program is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + * General Public License for more details. >>> + */ >>> + >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> + >>> +#include "ocfs2.h" >>> +#include "ocfs2_fs.h" >>> +#include "stackglue.h" >>> +#include "inode.h" >>> + >>> +#include "filecheck.h" >>> + >>> + >>> +/* File check error strings, >>> + * must correspond with error number in header file. 
>>> + */ >>> +static const char * const ocfs2_filecheck_errs[] = { >>> + "SUCCESS", >>> + "FAILED", >>> + "INPROGRESS", >>> + "READONLY", >>> + "INVALIDINO", >>> + "BLOCKECC", >>> + "BLOCKNO", >>> + "VALIDFLAG", >>> + "GENERATION", >>> + "UNSUPPORTED" >>> +}; >>> + >>> +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); >>> +static LIST_HEAD(ocfs2_filecheck_sysfs_list); >>> + >>> +struct ocfs2_filecheck { >>> + struct list_head fc_head; /* File check entry list head */ >>> + spinlock_t fc_lock; >>> + unsigned int fc_max;/* Maximum number of entry in list */ >>> + unsigned int fc_size; /* Current entry count in list */ >>> + unsigned int fc_done; /* File check entries are done in list */ >>> +}; >>> + >>> +struct ocfs2_filecheck_sysfs_entry { >>> + struct list_head fs_list; >>> + atomic_t fs_count; >>> + struct super_block *fs_sb; >>> + struct kset *fs_kset; >>> + struct ocfs2_filecheck *fs_fcheck; >>> +}; >>> + >>> +#define OCFS2_FILECHECK_MAXSIZE100 >>> +#define OCFS2_FILECHECK_MINSIZE10 >>>
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:15 PM, Gang He wrote: > Hello Junxiao, > > See my comments inline. > > >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > This online file check/fix feature is used to check/fix some light file meta > block corruption, instead of turning a file system off and using fsck.ocfs2. What's light meta block corruption? Do you have a case about it? > e.g. meta ecc error, we really need not to use fsck.ocfs2. > of course, this feature does not replace fsck.ocfs2 and touch some > complicated meta block problems, if there is some potential problem in some > areas, we can discuss them one by one. > > > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > Here, if the inode block was corrupted, the file system is not able to load > it into the memory. How do you know inode block corrupted? If bh for inode block is overwritten, i mean bh corrupted, the repair will corrupted a good inode block. Thanks, Junxiao. > ocfs2_filecheck_repair_inode_block() will able to load it into the memory, > since it try to fix these light-level problem before loading. > if the fix is OK, the changed meta-block can pass the block-validate function > and load into the memory as a inode object. > Since the file system is under a cluster environment, we have to use some > existing function and code path to keep these block operation under a cluster > lock. > > > Thanks > Gang > >> >> Thanks, >> Junxiao. 
>> On 10/28/2015 02:26 PM, Gang He wrote: >>> +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, >>> + struct buffer_head *bh) >>> +{ >>> + int rc; >>> + int changed = 0; >>> + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; >>> + >>> + rc = ocfs2_filecheck_validate_inode_block(sb, bh); >>> + /* Can't fix invalid inode block */ >>> + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) >>> + return rc; >>> + >>> + trace_ocfs2_filecheck_repair_inode_block( >>> + (unsigned long long)bh->b_blocknr); >>> + >>> + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || >>> + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { >>> + mlog(ML_ERROR, >>> + "Filecheck: try to repair dinode #%llu on readonly >>> filesystem\n", >>> + (unsigned long long)bh->b_blocknr); >>> + return -OCFS2_FILECHECK_ERR_READONLY; >>> + } >>> + >>> + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { >>> + di->i_blkno = cpu_to_le64(bh->b_blocknr); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", >>> + (unsigned long long)bh->b_blocknr, >>> + (unsigned long long)le64_to_cpu(di->i_blkno)); >>> + } >>> + >>> + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { >>> + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is >>> set\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + if (le32_to_cpu(di->i_fs_generation) != >>> + OCFS2_SB(sb)->fs_generation) { >>> + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: fs_generation to %u\n", >>> + (unsigned long long)bh->b_blocknr, >>> + le32_to_cpu(di->i_fs_generation)); >>> + } >>> + >>> + if (changed || >>> + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { >>> + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); >>> + mark_buffer_dirty(bh); >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: compute meta 
ecc\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + return 0; >>> +} > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, I didn't see a need to add a sysfs file for the check and repair. This leaves a hard problem for customer to decide. How they decide whether they should repair the bad inode since this may cause corruption even harder? I think the error should be fixed by this feature automaticlly if repair helps, of course this can be done only when error=continue is enabled or add some mount option for it. Thanks, Junxiao. On 10/28/2015 02:25 PM, Gang He wrote: > Implement online file check sysfile interfaces, e.g. > how to create the related sysfile according to device name, > how to display/handle file check request from the sysfile. > > Signed-off-by: Gang He > --- > fs/ocfs2/Makefile| 3 +- > fs/ocfs2/filecheck.c | 566 > +++ > fs/ocfs2/filecheck.h | 48 + > fs/ocfs2/inode.h | 3 + > 4 files changed, 619 insertions(+), 1 deletion(-) > create mode 100644 fs/ocfs2/filecheck.c > create mode 100644 fs/ocfs2/filecheck.h > > diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile > index ce210d4..e27e652 100644 > --- a/fs/ocfs2/Makefile > +++ b/fs/ocfs2/Makefile > @@ -41,7 +41,8 @@ ocfs2-objs := \ > quota_local.o \ > quota_global.o \ > xattr.o \ > - acl.o > + acl.o \ > + filecheck.o > > ocfs2_stackglue-objs := stackglue.o > ocfs2_stack_o2cb-objs := stack_o2cb.o > diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c > new file mode 100644 > index 000..f12ed1f > --- /dev/null > +++ b/fs/ocfs2/filecheck.c > @@ -0,0 +1,566 @@ > +/* -*- mode: c; c-basic-offset: 8; -*- > + * vim: noexpandtab sw=8 ts=8 sts=0: > + * > + * filecheck.c > + * > + * Code which implements online file check. > + * > + * Copyright (C) 2015 Novell. All rights reserved. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License as published by the Free Software Foundation, version 2. 
> + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "ocfs2.h" > +#include "ocfs2_fs.h" > +#include "stackglue.h" > +#include "inode.h" > + > +#include "filecheck.h" > + > + > +/* File check error strings, > + * must correspond with error number in header file. > + */ > +static const char * const ocfs2_filecheck_errs[] = { > + "SUCCESS", > + "FAILED", > + "INPROGRESS", > + "READONLY", > + "INVALIDINO", > + "BLOCKECC", > + "BLOCKNO", > + "VALIDFLAG", > + "GENERATION", > + "UNSUPPORTED" > +}; > + > +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); > +static LIST_HEAD(ocfs2_filecheck_sysfs_list); > + > +struct ocfs2_filecheck { > + struct list_head fc_head; /* File check entry list head */ > + spinlock_t fc_lock; > + unsigned int fc_max;/* Maximum number of entry in list */ > + unsigned int fc_size; /* Current entry count in list */ > + unsigned int fc_done; /* File check entries are done in list */ > +}; > + > +struct ocfs2_filecheck_sysfs_entry { > + struct list_head fs_list; > + atomic_t fs_count; > + struct super_block *fs_sb; > + struct kset *fs_kset; > + struct ocfs2_filecheck *fs_fcheck; > +}; > + > +#define OCFS2_FILECHECK_MAXSIZE 100 > +#define OCFS2_FILECHECK_MINSIZE 10 > + > +/* File check operation type */ > +enum { > + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file */ > + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file */ > + OCFS2_FILECHECK_TYPE_SET = 100 /* Set file check options */ > +}; > + > +struct ocfs2_filecheck_entry { > + struct list_head fe_list; > + unsigned long fe_ino; > + unsigned int fe_type; > + unsigned short fe_done:1; > + unsigned short fe_status:15; > +}; > + > +struct ocfs2_filecheck_args 
{ > + unsigned int fa_type; > + union { > + unsigned long fa_ino; > + unsigned int fa_len; > + }; > +}; > + > +static const char * > +ocfs2_filecheck_error(int errno) > +{ > + if (!errno) > + return ocfs2_filecheck_errs[errno]; > + > + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || > + errno > OCFS2_FILECHECK_ERR_END); > + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; > +} > + > +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf); > +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, > + struct kobj_attribute *attr, > +
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if 
(le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { > + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); > + mark_buffer_dirty(bh); > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: compute meta ecc\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + return 0; > +} -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if 
(le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { > + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); > + mark_buffer_dirty(bh); > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: compute meta ecc\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + return 0; > +} -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, I didn't see a need to add a sysfs file for the check and repair. This leaves a hard problem for customer to decide. How they decide whether they should repair the bad inode since this may cause corruption even harder? I think the error should be fixed by this feature automaticlly if repair helps, of course this can be done only when error=continue is enabled or add some mount option for it. Thanks, Junxiao. On 10/28/2015 02:25 PM, Gang He wrote: > Implement online file check sysfile interfaces, e.g. > how to create the related sysfile according to device name, > how to display/handle file check request from the sysfile. > > Signed-off-by: Gang He> --- > fs/ocfs2/Makefile| 3 +- > fs/ocfs2/filecheck.c | 566 > +++ > fs/ocfs2/filecheck.h | 48 + > fs/ocfs2/inode.h | 3 + > 4 files changed, 619 insertions(+), 1 deletion(-) > create mode 100644 fs/ocfs2/filecheck.c > create mode 100644 fs/ocfs2/filecheck.h > > diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile > index ce210d4..e27e652 100644 > --- a/fs/ocfs2/Makefile > +++ b/fs/ocfs2/Makefile > @@ -41,7 +41,8 @@ ocfs2-objs := \ > quota_local.o \ > quota_global.o \ > xattr.o \ > - acl.o > + acl.o \ > + filecheck.o > > ocfs2_stackglue-objs := stackglue.o > ocfs2_stack_o2cb-objs := stack_o2cb.o > diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c > new file mode 100644 > index 000..f12ed1f > --- /dev/null > +++ b/fs/ocfs2/filecheck.c > @@ -0,0 +1,566 @@ > +/* -*- mode: c; c-basic-offset: 8; -*- > + * vim: noexpandtab sw=8 ts=8 sts=0: > + * > + * filecheck.c > + * > + * Code which implements online file check. > + * > + * Copyright (C) 2015 Novell. All rights reserved. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License as published by the Free Software Foundation, version 2. 
> + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "ocfs2.h" > +#include "ocfs2_fs.h" > +#include "stackglue.h" > +#include "inode.h" > + > +#include "filecheck.h" > + > + > +/* File check error strings, > + * must correspond with error number in header file. > + */ > +static const char * const ocfs2_filecheck_errs[] = { > + "SUCCESS", > + "FAILED", > + "INPROGRESS", > + "READONLY", > + "INVALIDINO", > + "BLOCKECC", > + "BLOCKNO", > + "VALIDFLAG", > + "GENERATION", > + "UNSUPPORTED" > +}; > + > +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); > +static LIST_HEAD(ocfs2_filecheck_sysfs_list); > + > +struct ocfs2_filecheck { > + struct list_head fc_head; /* File check entry list head */ > + spinlock_t fc_lock; > + unsigned int fc_max;/* Maximum number of entry in list */ > + unsigned int fc_size; /* Current entry count in list */ > + unsigned int fc_done; /* File check entries are done in list */ > +}; > + > +struct ocfs2_filecheck_sysfs_entry { > + struct list_head fs_list; > + atomic_t fs_count; > + struct super_block *fs_sb; > + struct kset *fs_kset; > + struct ocfs2_filecheck *fs_fcheck; > +}; > + > +#define OCFS2_FILECHECK_MAXSIZE 100 > +#define OCFS2_FILECHECK_MINSIZE 10 > + > +/* File check operation type */ > +enum { > + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file */ > + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file */ > + OCFS2_FILECHECK_TYPE_SET = 100 /* Set file check options */ > +}; > + > +struct ocfs2_filecheck_entry { > + struct list_head fe_list; > + unsigned long fe_ino; > + unsigned int fe_type; > + unsigned short fe_done:1; > + unsigned short fe_status:15; > +}; > + > +struct ocfs2_filecheck_args 
{ > + unsigned int fa_type; > + union { > + unsigned long fa_ino; > + unsigned int fa_len; > + }; > +}; > + > +static const char * > +ocfs2_filecheck_error(int errno) > +{ > + if (!errno) > + return ocfs2_filecheck_errs[errno]; > + > + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || > + errno > OCFS2_FILECHECK_ERR_END); > + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; > +} > + > +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf); > +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, > + struct kobj_attribute *attr, > +
[PATCH v2] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O during memory allocation") introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. And this will make the kernel run into the deadlock case described in that commit. See Dave Chinner's comment about io in superblock shrinker: Filesystem shrinkers do indeed perform IO from the superblock shrinker and have for years. Even clean inodes can require IO before they can be freed - e.g. on an orphan list, need truncation of post-eof blocks, need to wait for ordered operations to complete before it can be freed, etc. IOWs, Ext4, btrfs and XFS all can issue and/or block on arbitrary amounts of IO in the superblock shrinker context. XFS, in particular, has been doing transactions and IO from the VFS inode cache shrinker since it was first introduced Fix this by clearing __GFP_FS in memalloc_noio_flags(), this function has masked all the gfp_mask that will be passed into fs for the processes setting PF_MEMALLOC_NOIO in the direct reclaim path. v1 thread at: https://lkml.org/lkml/2014/9/3/32 v2 changes: patch log update to make the issue more clear. 
Signed-off-by: Junxiao Bi Cc: Dave Chinner Cc: joyce.xue Cc: Ming Lei Cc: --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~__GFP_IO; + flags &= ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. And this will make the kernel run into the deadlock case described in that commit. See Dave Chinner's comment about io in superblock shrinker: Filesystem shrinkers do indeed perform IO from the superblock shrinker and have for years. Even clean inodes can require IO before they can be freed - e.g. on an orphan list, need truncation of post-eof blocks, need to wait for ordered operations to complete before it can be freed, etc. IOWs, Ext4, btrfs and XFS all can issue and/or block on arbitrary amounts of IO in the superblock shrinker context. XFS, in particular, has been doing transactions and IO from the VFS inode cache shrinker since it was first introduced Fix this by clearing __GFP_FS in memalloc_noio_flags(), this function has masked all the gfp_mask that will be passed into fs for the processes setting PF_MEMALLOC_NOIO in the direct reclaim path. v1 thread at: https://lkml.org/lkml/2014/9/3/32 v2 changes: patch log update to make the issue more clear. 
Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: Dave Chinner da...@fromorbit.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com Cc: sta...@vger.kernel.org --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/05/2014 10:32 AM, Junxiao Bi wrote: > On 09/04/2014 05:23 PM, Dave Chinner wrote: >> On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: >>> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >>> during memory allocation") >>> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >>> allocation, __GFP_IO is cleared >>> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >>> cleared. Or it may still >>> run into I/O, like in superblock shrinker. >>> >>> Signed-off-by: Junxiao Bi >>> Cc: joyce.xue >>> Cc: Ming Lei >>> --- >>> include/linux/sched.h |6 -- >>> 1 file changed, 4 insertions(+), 2 deletions(-) >>> >>> diff --git a/include/linux/sched.h b/include/linux/sched.h >>> index 5c2c885..2fb2c47 100644 >>> --- a/include/linux/sched.h >>> +++ b/include/linux/sched.h >>> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >>> task_struct *p, cputime_t *ut, >>> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >>> #define used_math() tsk_used_math(current) >>> >>> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >>> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >>> + * __GFP_FS is also cleared as it implies __GFP_IO. >>> + */ >>> static inline gfp_t memalloc_noio_flags(gfp_t flags) >>> { >>> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >>> - flags &= ~__GFP_IO; >>> + flags &= ~(__GFP_IO | __GFP_FS); >>> return flags; >>> } >> >> You also need to mask all the shrink_control->gfp_mask >> initialisations in mm/vmscan.c. The current code only masks the page >> reclaim gfp_mask, not those that are passed to the shrinkers. > Yes, there are some shrink_control->gfp_mask not masked in vmscan.c in > the following functions. Beside this, all seemed be masked from direct > reclaim path by memalloc_noio_flags(). 
> > -reclaim_clean_pages_from_list() > used by alloc_contig_range(), this function is invoked in hugetlb and > cma, for hugetlb, it should be safe as only userspace use it. I am not > sure about the cma. > David & Andrew, may you share your idea about whether cma is affected? > Look at CMA, it's used for device which doesn't support scatter/gather dma and mainly used for embedded device like camera, this should not be the case of the block device. So i think this gfp_mask doesn't need be masked. Thanks, Junxiao. > -mem_cgroup_shrink_node_zone() > -try_to_free_mem_cgroup_pages() > These two are used by mem cgroup, as no kernel thread can be assigned > into such cgroup, so i think, no need mask. > > -balance_pgdat() > used by kswapd, no need mask. > > -shrink_all_memory() > used by hibernate, should be safe with GFP_FS/IO. > > Thanks, > Junxiao. >> >> Cheers, >> >> Dave. >> > > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 05:23 PM, Dave Chinner wrote: > On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: >> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >> during memory allocation") >> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >> allocation, __GFP_IO is cleared >> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >> cleared. Or it may still >> run into I/O, like in superblock shrinker. >> >> Signed-off-by: Junxiao Bi >> Cc: joyce.xue >> Cc: Ming Lei >> --- >> include/linux/sched.h |6 -- >> 1 file changed, 4 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 5c2c885..2fb2c47 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >> task_struct *p, cputime_t *ut, >> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >> #define used_math() tsk_used_math(current) >> >> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >> + * __GFP_FS is also cleared as it implies __GFP_IO. >> + */ >> static inline gfp_t memalloc_noio_flags(gfp_t flags) >> { >> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >> -flags &= ~__GFP_IO; >> +flags &= ~(__GFP_IO | __GFP_FS); >> return flags; >> } > > You also need to mask all the shrink_control->gfp_mask > initialisations in mm/vmscan.c. The current code only masks the page > reclaim gfp_mask, not those that are passed to the shrinkers. Yes, there are some shrink_control->gfp_mask not masked in vmscan.c in the following functions. Beside this, all seemed be masked from direct reclaim path by memalloc_noio_flags(). -reclaim_clean_pages_from_list() used by alloc_contig_range(), this function is invoked in hugetlb and cma, for hugetlb, it should be safe as only userspace use it. I am not sure about the cma. 
David & Andrew, may you share your idea about whether cma is affected? -mem_cgroup_shrink_node_zone() -try_to_free_mem_cgroup_pages() These two are used by mem cgroup, as no kernel thread can be assigned into such cgroup, so i think, no need mask. -balance_pgdat() used by kswapd, no need mask. -shrink_all_memory() used by hibernate, should be safe with GFP_FS/IO. Thanks, Junxiao. > > Cheers, > > Dave. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 05:23 PM, Dave Chinner wrote: On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) -flags = ~__GFP_IO; +flags = ~(__GFP_IO | __GFP_FS); return flags; } You also need to mask all the shrink_control-gfp_mask initialisations in mm/vmscan.c. The current code only masks the page reclaim gfp_mask, not those that are passed to the shrinkers. Yes, there are some shrink_control-gfp_mask not masked in vmscan.c in the following functions. Beside this, all seemed be masked from direct reclaim path by memalloc_noio_flags(). -reclaim_clean_pages_from_list() used by alloc_contig_range(), this function is invoked in hugetlb and cma, for hugetlb, it should be safe as only userspace use it. I am not sure about the cma. David Andrew, may you share your idea about whether cma is affected? 
-mem_cgroup_shrink_node_zone() -try_to_free_mem_cgroup_pages() These two are used by mem cgroup, as no kernel thread can be assigned into such cgroup, so i think, no need mask. -balance_pgdat() used by kswapd, no need mask. -shrink_all_memory() used by hibernate, should be safe with GFP_FS/IO. Thanks, Junxiao. Cheers, Dave. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/05/2014 10:32 AM, Junxiao Bi wrote: On 09/04/2014 05:23 PM, Dave Chinner wrote: On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } You also need to mask all the shrink_control-gfp_mask initialisations in mm/vmscan.c. The current code only masks the page reclaim gfp_mask, not those that are passed to the shrinkers. Yes, there are some shrink_control-gfp_mask not masked in vmscan.c in the following functions. Beside this, all seemed be masked from direct reclaim path by memalloc_noio_flags(). -reclaim_clean_pages_from_list() used by alloc_contig_range(), this function is invoked in hugetlb and cma, for hugetlb, it should be safe as only userspace use it. I am not sure about the cma. 
David Andrew, may you share your idea about whether cma is affected? Look at CMA, it's used for device which doesn't support scatter/gather dma and mainly used for embedded device like camera, this should not be the case of the block device. So i think this gfp_mask doesn't need be masked. Thanks, Junxiao. -mem_cgroup_shrink_node_zone() -try_to_free_mem_cgroup_pages() These two are used by mem cgroup, as no kernel thread can be assigned into such cgroup, so i think, no need mask. -balance_pgdat() used by kswapd, no need mask. -shrink_all_memory() used by hibernate, should be safe with GFP_FS/IO. Thanks, Junxiao. Cheers, Dave. -- To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 10:30 AM, Andrew Morton wrote: > On Thu, 04 Sep 2014 10:08:09 +0800 Junxiao Bi wrote: > >> On 09/04/2014 07:10 AM, Andrew Morton wrote: >>> On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi wrote: >>> >>>> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >>>> during memory allocation") >>>> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >>>> allocation, __GFP_IO is cleared >>>> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >>>> cleared. Or it may still >>>> run into I/O, like in superblock shrinker. >>> >>> Is there an actual bug which inspired this fix? If so, please describe >>> it. >>> >> Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in >> ocfs2 who is for building tcp connections and processing ocfs2 message. >> Like when an new node is up in ocfs2 cluster, the workqueue will try to >> build the connections to it, since there are some common code in >> networking like sock_alloc() using GFP_KERNEL to allocate memory, direct >> reclaim will be triggered and call into superblock shrinker if available >> memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To >> shrink the inode cache, ocfs2 needs release cluster lock and this >> depends on workqueue to do it, so cause the deadlock. Not sure whether >> there are similar issue for other cluster fs, like nfs, it is possible >> rpciod hung like the ocfs2 workqueue? > > All this info should be in the changelog. > >> >>> I don't think it's accurate to say that __GFP_FS implies __GFP_IO. >>> Where did that info come from? >> __GFP_FS allowed callback into fs during memory allocation, and fs may >> do io whatever __GFP_IO is set? > > __GFP_FS and __GFP_IO are (or were) for communicating to vmscan: don't > enter the fs for writepage, don't write back swapcache. > > I guess those concepts have grown over time without a ton of thought > going into it. 
Yes, I suppose that if a filesystem's writepage is > called (for example) it expects that it will be able to perform > writeback and it won't check (or even be passed) the __GFP_IO setting. > > So I guess we could say that !__GFP_FS && GFP_IO is not implemented and > shouldn't occur. > > That being said, it still seems quite bad to disable VFS cache > shrinking for PF_MEMALLOC_NOIO allocation attempts. Even without this ocfs2 deadlock bug, the implementation of PF_MEMALLOC_NOIO is wrong. See the deadlock case described in its log below. Let's see the case "block device runtime resume", since __GFP_FS is not cleared, it could run into fs writepage and cause deadlock. >From 21caf2fc1931b485483ddd254b634fa8f0099963 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Feb 2013 16:34:08 -0800 Subject: [PATCH] mm: teach mm by current context info to not do I/O during memory allocation This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of 'struct task_struct'), so that the flag can be set by one task to avoid doing I/O inside memory allocation in the task's context. The patch tries to solve one deadlock problem caused by block device, and the problem may happen at least in the below situations: - during block device runtime resume, if memory allocation with GFP_KERNEL is called inside runtime resume callback of any one of its ancestors(or the block device itself), the deadlock may be triggered inside the memory allocation since it might not complete until the block device becomes active and the involved page I/O finishes. The situation is pointed out first by Alan Stern. It is not a good approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because several subsystems may be involved(for example, PCI, USB and SCSI may be involved for usb mass storage device, network devices involved too in the iSCSI case) - during block device runtime suspend, because runtime resume needs to wait for completion of concurrent runtime suspend. 
- during error handling of usb mass storage device, USB bus reset will be put on the device, so there shouldn't be any memory allocation with GFP_KERNEL during USB bus reset, otherwise the deadlock similar with above may be triggered. Unfortunately, any usb device may include one mass storage interface in theory, so it requires all usb interface drivers to handle the situation. In fact, most usb drivers don't know how to handle bus reset on the device and don't provide .pre_reset() and .post_reset() callback at all, so USB core has to unbind and bind driver for these devices. So it is still not practical to resort to GFP_NOIO for solving the problem. Thanks, Junxiao.
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/03/2014 08:20 PM, Trond Myklebust wrote: > On Wed, Sep 3, 2014 at 1:54 AM, Junxiao Bi wrote: >> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >> during memory allocation") >> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >> allocation, __GFP_IO is cleared >> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >> cleared. Or it may still >> run into I/O, like in superblock shrinker. >> >> Signed-off-by: Junxiao Bi >> Cc: joyce.xue >> Cc: Ming Lei >> --- >> include/linux/sched.h |6 -- >> 1 file changed, 4 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 5c2c885..2fb2c47 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >> task_struct *p, cputime_t *ut, >> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >> #define used_math() tsk_used_math(current) >> >> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >> + * __GFP_FS is also cleared as it implies __GFP_IO. >> + */ >> static inline gfp_t memalloc_noio_flags(gfp_t flags) >> { >> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >> - flags &= ~__GFP_IO; >> + flags &= ~(__GFP_IO | __GFP_FS); >> return flags; >> } >> > > Shouldn't this be a stable fix? If it is needed, then it will affect > all kernels that define PF_MEMALLOC_NOIO. Yes, should be. An ocfs2 deadlock bug related to this. Thanks, Junxiao. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 07:10 AM, Andrew Morton wrote: > On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi wrote: > >> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >> during memory allocation") >> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >> allocation, __GFP_IO is cleared >> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >> cleared. Or it may still >> run into I/O, like in superblock shrinker. > > Is there an actual bug which inspired this fix? If so, please describe > it. > Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in ocfs2 who is for building tcp connections and processing ocfs2 message. Like when an new node is up in ocfs2 cluster, the workqueue will try to build the connections to it, since there are some common code in networking like sock_alloc() using GFP_KERNEL to allocate memory, direct reclaim will be triggered and call into superblock shrinker if available memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To shrink the inode cache, ocfs2 needs release cluster lock and this depends on workqueue to do it, so cause the deadlock. Not sure whether there are similar issue for other cluster fs, like nfs, it is possible rpciod hung like the ocfs2 workqueue? > I don't think it's accurate to say that __GFP_FS implies __GFP_IO. > Where did that info come from? __GFP_FS allowed callback into fs during memory allocation, and fs may do io whatever __GFP_IO is set? > > And the superblock shrinker is a good example of why this shouldn't be > the case. The main thing that code does is to reclaim clean fs objects > without performing IO. AFAICT the proposed patch will significantly > weaken PF_MEMALLOC_NOIO allocation attempts by needlessly preventing > the kernel from reclaiming such objects? Even fs didn't do io in superblock shrinker, it is possible for a fs process who is not convenient to set GFP_NOFS holding some fs lock and call back fs again? 
PF_MEMALLOC_NOIO is only set for some special processes. I think it won't affect much. Thanks, Junxiao. > >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >> task_struct *p, cputime_t *ut, >> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >> #define used_math() tsk_used_math(current) >> >> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >> + * __GFP_FS is also cleared as it implies __GFP_IO. >> + */ >> static inline gfp_t memalloc_noio_flags(gfp_t flags) >> { >> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >> -flags &= ~__GFP_IO; >> +flags &= ~(__GFP_IO | __GFP_FS); >> return flags; >> } > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 07:10 AM, Andrew Morton wrote: On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi junxiao...@oracle.com wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Is there an actual bug which inspired this fix? If so, please describe it. Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in ocfs2 who is for building tcp connections and processing ocfs2 message. Like when an new node is up in ocfs2 cluster, the workqueue will try to build the connections to it, since there are some common code in networking like sock_alloc() using GFP_KERNEL to allocate memory, direct reclaim will be triggered and call into superblock shrinker if available memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To shrink the inode cache, ocfs2 needs release cluster lock and this depends on workqueue to do it, so cause the deadlock. Not sure whether there are similar issue for other cluster fs, like nfs, it is possible rpciod hung like the ocfs2 workqueue? I don't think it's accurate to say that __GFP_FS implies __GFP_IO. Where did that info come from? __GFP_FS allowed callback into fs during memory allocation, and fs may do io whatever __GFP_IO is set? And the superblock shrinker is a good example of why this shouldn't be the case. The main thing that code does is to reclaim clean fs objects without performing IO. AFAICT the proposed patch will significantly weaken PF_MEMALLOC_NOIO allocation attempts by needlessly preventing the kernel from reclaiming such objects? Even fs didn't do io in superblock shrinker, it is possible for a fs process who is not convenient to set GFP_NOFS holding some fs lock and call back fs again? 
PF_MEMALLOC_NOIO is only set for some special processes. I think it won't affect much. Thanks, Junxiao. --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) -flags = ~__GFP_IO; +flags = ~(__GFP_IO | __GFP_FS); return flags; } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/03/2014 08:20 PM, Trond Myklebust wrote: On Wed, Sep 3, 2014 at 1:54 AM, Junxiao Bi junxiao...@oracle.com wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } Shouldn't this be a stable fix? If it is needed, then it will affect all kernels that define PF_MEMALLOC_NOIO. Yes, should be. An ocfs2 deadlock bug related to this. Thanks, Junxiao. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 10:30 AM, Andrew Morton wrote: On Thu, 04 Sep 2014 10:08:09 +0800 Junxiao Bi junxiao...@oracle.com wrote: On 09/04/2014 07:10 AM, Andrew Morton wrote: On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi junxiao...@oracle.com wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Is there an actual bug which inspired this fix? If so, please describe it. Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in ocfs2 who is for building tcp connections and processing ocfs2 message. Like when an new node is up in ocfs2 cluster, the workqueue will try to build the connections to it, since there are some common code in networking like sock_alloc() using GFP_KERNEL to allocate memory, direct reclaim will be triggered and call into superblock shrinker if available memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To shrink the inode cache, ocfs2 needs release cluster lock and this depends on workqueue to do it, so cause the deadlock. Not sure whether there are similar issue for other cluster fs, like nfs, it is possible rpciod hung like the ocfs2 workqueue? All this info should be in the changelog. I don't think it's accurate to say that __GFP_FS implies __GFP_IO. Where did that info come from? __GFP_FS allowed callback into fs during memory allocation, and fs may do io whatever __GFP_IO is set? __GFP_FS and __GFP_IO are (or were) for communicating to vmscan: don't enter the fs for writepage, don't write back swapcache. I guess those concepts have grown over time without a ton of thought going into it. 
Yes, I suppose that if a filesystem's writepage is called (for example) it expects that it will be able to perform writeback and it won't check (or even be passed) the __GFP_IO setting. So I guess we could say that !__GFP_FS GFP_IO is not implemented and shouldn't occur. That being said, it still seems quite bad to disable VFS cache shrinking for PF_MEMALLOC_NOIO allocation attempts. Even without this ocfs2 deadlock bug, the implement of PF_MEMALLOC_NOIO is wrong. See the deadlock case described in its log below. Let see the case block device runtime resume, since __GFP_FS is not cleared, it could run into fs writepage and cause deadlock. From 21caf2fc1931b485483ddd254b634fa8f0099963 Mon Sep 17 00:00:00 2001 From: Ming Lei ming@canonical.com Date: Fri, 22 Feb 2013 16:34:08 -0800 Subject: [PATCH] mm: teach mm by current context info to not do I/O during memory allocation This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of 'struct task_struct'), so that the flag can be set by one task to avoid doing I/O inside memory allocation in the task's context. The patch trys to solve one deadlock problem caused by block device, and the problem may happen at least in the below situations: - during block device runtime resume, if memory allocation with GFP_KERNEL is called inside runtime resume callback of any one of its ancestors(or the block device itself), the deadlock may be triggered inside the memory allocation since it might not complete until the block device becomes active and the involed page I/O finishes. The situation is pointed out first by Alan Stern. It is not a good approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because several subsystems may be involved(for example, PCI, USB and SCSI may be involved for usb mass stoarage device, network devices involved too in the iSCSI case) - during block device runtime suspend, because runtime resume need to wait for completion of concurrent runtime suspend. 
- during error handling of usb mass storage deivce, USB bus reset will be put on the device, so there shouldn't have any memory allocation with GFP_KERNEL during USB bus reset, otherwise the deadlock similar with above may be triggered. Unfortunately, any usb device may include one mass storage interface in theory, so it requires all usb interface drivers to handle the situation. In fact, most usb drivers don't know how to handle bus reset on the device and don't provide .pre_set() and .post_reset() callback at all, so USB core has to unbind and bind driver for these devices. So it is still not practical to resort to GFP_NOIO for solving the problem. Thanks, Junxiao. And the superblock shrinker is a good example of why this shouldn't be the case. The main thing that code does is to reclaim clean fs objects without performing IO. AFAICT the proposed patch will significantly weaken PF_MEMALLOC_NOIO allocation attempts by needlessly preventing the kernel from reclaiming such objects? Even fs didn't do io in superblock shrinker
[PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O during memory allocation") introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi Cc: joyce.xue Cc: Ming Lei --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~__GFP_IO; + flags &= ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
On 09/03/2014 11:10 AM, Dave Chinner wrote: > On Wed, Sep 03, 2014 at 09:38:31AM +0800, Junxiao Bi wrote: >> Hi Jiufei, >> >> On 09/02/2014 05:03 PM, Xue jiufei wrote: >>> Hi, Dave >>> On 2014/9/2 7:51, Dave Chinner wrote: >>>> On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: >>>>> The patch trys to solve one deadlock problem caused by cluster >>>>> fs, like ocfs2. And the problem may happen at least in the below >>>>> situations: >>>>> 1)Receiving a connect message from other nodes, node queues a >>>>> work_struct o2net_listen_work. >>>>> 2)o2net_wq processes this work and calls sock_alloc() to allocate >>>>> memory for a new socket. >>>>> 3)It would do direct memory reclaim when available memory is not >>>>> enough and trigger the inode cleanup. That inode being cleaned up >>>>> is happened to be ocfs2 inode, so call evict()->ocfs2_evict_inode() >>>>> ->ocfs2_drop_lock()->dlmunlock()->o2net_send_message_vec(), >>>>> and wait for the unlock response from master. >>>>> 4)tcp layer received the response, call o2net_data_ready() and >>>>> queue sc_rx_work, waiting o2net_wq to process this work. >>>>> 5)o2net_wq is a single thread workqueue, it process the work one by >>>>> one. Right now it is still doing o2net_listen_work and cannot handle >>>>> sc_rx_work. so we deadlock. >>>>> >>>>> It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). >>>>> So we use PF_FSTRANS to avoid the task reentering filesystem when >>>>> available memory is not enough. >>>>> >>>>> Signed-off-by: joyce.xue >>>> >>>> For the second time: use memalloc_noio_save/memalloc_noio_restore. >>>> And please put a great big comment in the code explaining why you >>>> need to do this special thing with memory reclaim flags. >>>> >>>> Cheers, >>>> >>>> Dave. >>>> >>> Thanks for your reply. But I am afraid that memalloc_noio_save/ >>> memalloc_noio_restore can not solve my problem. 
__GFP_IO is cleared >>> if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory >>> reclaim. However, __GFP_FS is still set that can not avoid pruning >>> dcache and icache in memory allocation, resulting in the deadlock I >>> described. >> >> You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in >> ocfs2 and check it in sb shrinker. > > No changes to the superblock shrinker, please. The flag should > modify the gfp_mask in the struct shrink_control passed to the > shrinker, just like the noio flag is used in the rest of the mm > code. __GFP_FS seemed imply __GFP_IO, can superblock shrinker check !(sc->gfp_mask & __GFP_IO) and stop? Thanks, Junxiao. > > Cheers, > > Dave. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
Hi Jiufei, On 09/02/2014 05:03 PM, Xue jiufei wrote: > Hi, Dave > On 2014/9/2 7:51, Dave Chinner wrote: >> On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: >>> The patch trys to solve one deadlock problem caused by cluster >>> fs, like ocfs2. And the problem may happen at least in the below >>> situations: >>> 1)Receiving a connect message from other nodes, node queues a >>> work_struct o2net_listen_work. >>> 2)o2net_wq processes this work and calls sock_alloc() to allocate >>> memory for a new socket. >>> 3)It would do direct memory reclaim when available memory is not >>> enough and trigger the inode cleanup. That inode being cleaned up >>> is happened to be ocfs2 inode, so call evict()->ocfs2_evict_inode() >>> ->ocfs2_drop_lock()->dlmunlock()->o2net_send_message_vec(), >>> and wait for the unlock response from master. >>> 4)tcp layer received the response, call o2net_data_ready() and >>> queue sc_rx_work, waiting o2net_wq to process this work. >>> 5)o2net_wq is a single thread workqueue, it process the work one by >>> one. Right now it is still doing o2net_listen_work and cannot handle >>> sc_rx_work. so we deadlock. >>> >>> It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). >>> So we use PF_FSTRANS to avoid the task reentering filesystem when >>> available memory is not enough. >>> >>> Signed-off-by: joyce.xue >> >> For the second time: use memalloc_noio_save/memalloc_noio_restore. >> And please put a great big comment in the code explaining why you >> need to do this special thing with memory reclaim flags. >> >> Cheers, >> >> Dave. >> > Thanks for your reply. But I am afraid that memalloc_noio_save/ > memalloc_noio_restore can not solve my problem. __GFP_IO is cleared > if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory > reclaim. However, __GFP_FS is still set that can not avoid pruning > dcache and icache in memory allocation, resulting in the deadlock I > described. 
You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in ocfs2 and check it in sb shrinker. Thanks, Junxiao. > > Thanks. > XueJiufei > > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
Hi Jiufei, On 09/02/2014 05:03 PM, Xue jiufei wrote: Hi, Dave On 2014/9/2 7:51, Dave Chinner wrote: On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: The patch trys to solve one deadlock problem caused by cluster fs, like ocfs2. And the problem may happen at least in the below situations: 1)Receiving a connect message from other nodes, node queues a work_struct o2net_listen_work. 2)o2net_wq processes this work and calls sock_alloc() to allocate memory for a new socket. 3)It would do direct memory reclaim when available memory is not enough and trigger the inode cleanup. That inode being cleaned up is happened to be ocfs2 inode, so call evict()-ocfs2_evict_inode() -ocfs2_drop_lock()-dlmunlock()-o2net_send_message_vec(), and wait for the unlock response from master. 4)tcp layer received the response, call o2net_data_ready() and queue sc_rx_work, waiting o2net_wq to process this work. 5)o2net_wq is a single thread workqueue, it process the work one by one. Right now it is still doing o2net_listen_work and cannot handle sc_rx_work. so we deadlock. It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). So we use PF_FSTRANS to avoid the task reentering filesystem when available memory is not enough. Signed-off-by: joyce.xue xuejiu...@huawei.com For the second time: use memalloc_noio_save/memalloc_noio_restore. And please put a great big comment in the code explaining why you need to do this special thing with memory reclaim flags. Cheers, Dave. Thanks for your reply. But I am afraid that memalloc_noio_save/ memalloc_noio_restore can not solve my problem. __GFP_IO is cleared if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory reclaim. However, __GFP_FS is still set that can not avoid pruning dcache and icache in memory allocation, resulting in the deadlock I described. You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in ocfs2 and check it in sb shrinker. Thanks, Junxiao. Thanks. 
XueJiufei -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
On 09/03/2014 11:10 AM, Dave Chinner wrote: On Wed, Sep 03, 2014 at 09:38:31AM +0800, Junxiao Bi wrote: Hi Jiufei, On 09/02/2014 05:03 PM, Xue jiufei wrote: Hi, Dave On 2014/9/2 7:51, Dave Chinner wrote: On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: The patch trys to solve one deadlock problem caused by cluster fs, like ocfs2. And the problem may happen at least in the below situations: 1)Receiving a connect message from other nodes, node queues a work_struct o2net_listen_work. 2)o2net_wq processes this work and calls sock_alloc() to allocate memory for a new socket. 3)It would do direct memory reclaim when available memory is not enough and trigger the inode cleanup. That inode being cleaned up is happened to be ocfs2 inode, so call evict()-ocfs2_evict_inode() -ocfs2_drop_lock()-dlmunlock()-o2net_send_message_vec(), and wait for the unlock response from master. 4)tcp layer received the response, call o2net_data_ready() and queue sc_rx_work, waiting o2net_wq to process this work. 5)o2net_wq is a single thread workqueue, it process the work one by one. Right now it is still doing o2net_listen_work and cannot handle sc_rx_work. so we deadlock. It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). So we use PF_FSTRANS to avoid the task reentering filesystem when available memory is not enough. Signed-off-by: joyce.xue xuejiu...@huawei.com For the second time: use memalloc_noio_save/memalloc_noio_restore. And please put a great big comment in the code explaining why you need to do this special thing with memory reclaim flags. Cheers, Dave. Thanks for your reply. But I am afraid that memalloc_noio_save/ memalloc_noio_restore can not solve my problem. __GFP_IO is cleared if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory reclaim. However, __GFP_FS is still set that can not avoid pruning dcache and icache in memory allocation, resulting in the deadlock I described. 
You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in ocfs2 and check it in sb shrinker. No changes to the superblock shrinker, please. The flag should modify the gfp_mask in the struct shrink_control passed to the shrinker, just like the noio flag is used in the rest of the mm code. __GFP_FS seemed imply __GFP_IO, can superblock shrinker check !(sc-gfp_mask __GFP_IO) and stop? Thanks, Junxiao. Cheers, Dave. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: fix request->__data_len overflow
blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req) is used to check whether a bio can be merged into an exist request. If can, req->__data_len += bio->bio_size. Since req->__data_len is a 32bit uint, if blk_rq_get_max_sectors(req) > (UINT_MAX >> 9), req->__date_len may overflow when merging a new bio. This probably happen for discard request. In xen blkfront driver, its max_discard_sectors is set to the whole disk sector size, see xlvbd_init_blk_queue(). So issuing discrad requests to a xen virtual disk with a size over 4G is very possible to trigger the overflow. This overflow will cause kernel panic in blk_end_request_all() due to BUG() triggered. The following is a call trace we saw in 3.0.69. Upstream kernel also suffer this issue. @ __end_that: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ blk_update_request: bio idx 0 >= vcnt 0 @ request botched: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ [ cut here ] @ kernel BUG at block/blk-core.c:2394! @ invalid opcode: [#1] SMP @ CPU 0 @ Modules linked in: nfs fscache auth_rpcgss nfs_acl autofs4 i2c_dev i2c_core @ lockd sunrpc(U) ksplice_x773z34q_vmlinux_new(U) ksplice_x773z34q(U) @ ksplice_bj7y22gc_vmlinux_new(U) ksplice_bj7y22gc_ipv6_new(U) @ ksplice_bj7y22gc(U) @ . @ ksplice_i1o46065(U) ksplice_5gqtkuvt_vmlinux_new(U) ksplice_5gqtkuvt(U) @ ksplice_2bcv8td6(U) ksplice_v5bs54bz_vmlinux_new(U) ksplice_v5bs54bz(U) @ ksplice_l7s0dhx6(U) ksplice_aur7sgvi(U) ksplice_ckie4cpv(U) @ nf_conntrack_netbios_ns @ . @ nf_conntrack_broadcast ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_state @ nf_conntrack xt_comment iptable_filter ip_tables be2iscsi iscsi_boot_sysfs @ ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp bnx2i cnic @ uio ipv6 @ . 
@ cxgb3i libcxgbi cxgb3 mdio libiscsi_tcp libiscsi scsi_transport_iscsi @ parport_pc lp parport snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq @ snd_seq_device snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd soundcore @ snd_page_alloc @ . @ pcspkr xen_netfront dm_snapshot dm_zero dm_mirror dm_region_hash dm_log @ dm_mod xen_blkfront ext3 jbd mbcache sd_mod crc_t10dif [last unloaded: @ ksplice_x773z34q_vmlinux_old] @ . @ Pid: 0, comm: swapper Not tainted 2.6.39-400.212.1.el5uek #1 @ RIP: e030:[] [] @ __blk_end_request_all+0x2a/0x40 @ RSP: e02b:8803ffc03df8 EFLAGS: 00010002 @ RAX: 0001 RBX: 8803db3c8000 RCX: 8803d8cf32c0 @ RDX: 0001 RSI: 8803d8cf3080 RDI: 8803daed08d8 @ RBP: 8803ffc03df8 R08: R09: 8803daed08d8 @ R10: R11: 000a R12: @ R13: 8803dad5e3c0 R14: 0001 R15: 0029 @ FS: 7f1f34a32940() GS:8803ffc0() knlGS: @ CS: e033 DS: ES: CR0: 8005003b @ CR2: 020c6148 CR3: 0003c6492000 CR4: 2660 @ DR0: DR1: DR2: @ DR3: DR6: 0ff0 DR7: 0400 @ Process swapper (pid: 0, threadinfo 81794000, task 8179f020) @ Stack: @ 8803ffc03e48 a005c56a 8803da57a8d0 0028810d99ee @ 8803db1ea7c0 8803db1beec0 005e @ 0001 8803ffc03e98 810d735d @ Call Trace: @ @ [] blkif_interrupt+0x20a/0x3a0 [xen_blkfront] @ [] handle_irq_event_percpu+0x5d/0x1a0 @ [] handle_irq_event+0x4f/0x80 @ [] handle_edge_irq+0xa5/0x100 @ [] __xen_evtchn_do_upcall+0x218/0x310 @ [] xen_evtchn_do_upcall+0x2f/0x50 @ [] xen_do_hypervisor_callback+0x1e/0x30 @ @ [] ? xen_hypercall_sched_op+0xa/0x20 @ [] ? xen_hypercall_sched_op+0xa/0x20 @ [] ? xen_safe_halt+0x10/0x20 @ [] ? default_idle+0x5b/0x170 @ [] ? cpu_idle+0xc6/0xf0 @ [] ? rest_init+0x72/0x80 @ [] ? start_kernel+0x2aa/0x390 @ [] ? x86_64_start_reservations+0x6a/0xa0 @ [] ? 
xen_start_kernel+0x315/0x440 @ Code: 00 55 48 89 e5 0f 1f 44 00 00 48 8b 87 60 01 00 00 31 c9 48 85 c0 75 0e @ 8b 57 54 e8 91 ff ff ff 84 c0 75 07 c9 c3 8b 48 54 eb ed <0f> 0b 0f 1f 40 00 @ eb fa 0f 1f 80 00 00 00 00 0f 1f 80 00 00 00 @ RIP [] __blk_end_request_all+0x2a/0x40 @ RSP @ ---[ end trace b09ff97496363201 ]--- Signed-off-by: Junxiao Bi --- block/blk-merge.c | 29 +++-- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..ae4f4c8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,30 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req,
[PATCH] block: fix request-__data_len overflow
blk_rq_sectors(req) + bio_sectors(bio) blk_rq_get_max_sectors(req) is used to check whether a bio can be merged into an exist request. If can, req-__data_len += bio-bio_size. Since req-__data_len is a 32bit uint, if blk_rq_get_max_sectors(req) (UINT_MAX 9), req-__date_len may overflow when merging a new bio. This probably happen for discard request. In xen blkfront driver, its max_discard_sectors is set to the whole disk sector size, see xlvbd_init_blk_queue(). So issuing discrad requests to a xen virtual disk with a size over 4G is very possible to trigger the overflow. This overflow will cause kernel panic in blk_end_request_all() due to BUG() triggered. The following is a call trace we saw in 3.0.69. Upstream kernel also suffer this issue. @ __end_that: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ blk_update_request: bio idx 0 = vcnt 0 @ request botched: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ [ cut here ] @ kernel BUG at block/blk-core.c:2394! @ invalid opcode: [#1] SMP @ CPU 0 @ Modules linked in: nfs fscache auth_rpcgss nfs_acl autofs4 i2c_dev i2c_core @ lockd sunrpc(U) ksplice_x773z34q_vmlinux_new(U) ksplice_x773z34q(U) @ ksplice_bj7y22gc_vmlinux_new(U) ksplice_bj7y22gc_ipv6_new(U) @ ksplice_bj7y22gc(U) @ . @ ksplice_i1o46065(U) ksplice_5gqtkuvt_vmlinux_new(U) ksplice_5gqtkuvt(U) @ ksplice_2bcv8td6(U) ksplice_v5bs54bz_vmlinux_new(U) ksplice_v5bs54bz(U) @ ksplice_l7s0dhx6(U) ksplice_aur7sgvi(U) ksplice_ckie4cpv(U) @ nf_conntrack_netbios_ns @ . @ nf_conntrack_broadcast ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_state @ nf_conntrack xt_comment iptable_filter ip_tables be2iscsi iscsi_boot_sysfs @ ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp bnx2i cnic @ uio ipv6 @ . 
@ cxgb3i libcxgbi cxgb3 mdio libiscsi_tcp libiscsi scsi_transport_iscsi @ parport_pc lp parport snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq @ snd_seq_device snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd soundcore @ snd_page_alloc @ . @ pcspkr xen_netfront dm_snapshot dm_zero dm_mirror dm_region_hash dm_log @ dm_mod xen_blkfront ext3 jbd mbcache sd_mod crc_t10dif [last unloaded: @ ksplice_x773z34q_vmlinux_old] @ . @ Pid: 0, comm: swapper Not tainted 2.6.39-400.212.1.el5uek #1 @ RIP: e030:[8123757a] [8123757a] @ __blk_end_request_all+0x2a/0x40 @ RSP: e02b:8803ffc03df8 EFLAGS: 00010002 @ RAX: 0001 RBX: 8803db3c8000 RCX: 8803d8cf32c0 @ RDX: 0001 RSI: 8803d8cf3080 RDI: 8803daed08d8 @ RBP: 8803ffc03df8 R08: R09: 8803daed08d8 @ R10: R11: 000a R12: @ R13: 8803dad5e3c0 R14: 0001 R15: 0029 @ FS: 7f1f34a32940() GS:8803ffc0() knlGS: @ CS: e033 DS: ES: CR0: 8005003b @ CR2: 020c6148 CR3: 0003c6492000 CR4: 2660 @ DR0: DR1: DR2: @ DR3: DR6: 0ff0 DR7: 0400 @ Process swapper (pid: 0, threadinfo 81794000, task 8179f020) @ Stack: @ 8803ffc03e48 a005c56a 8803da57a8d0 0028810d99ee @ 8803db1ea7c0 8803db1beec0 005e @ 0001 8803ffc03e98 810d735d @ Call Trace: @ IRQ @ [a005c56a] blkif_interrupt+0x20a/0x3a0 [xen_blkfront] @ [810d735d] handle_irq_event_percpu+0x5d/0x1a0 @ [810d74ef] handle_irq_event+0x4f/0x80 @ [810d9e25] handle_edge_irq+0xa5/0x100 @ [812f7cc8] __xen_evtchn_do_upcall+0x218/0x310 @ [812f7e7f] xen_evtchn_do_upcall+0x2f/0x50 @ [8151168e] xen_do_hypervisor_callback+0x1e/0x30 @ EOI @ [810013aa] ? xen_hypercall_sched_op+0xa/0x20 @ [810013aa] ? xen_hypercall_sched_op+0xa/0x20 @ [8100a2b0] ? xen_safe_halt+0x10/0x20 @ [8101dffb] ? default_idle+0x5b/0x170 @ [81014ac6] ? cpu_idle+0xc6/0xf0 @ [814eab62] ? rest_init+0x72/0x80 @ [819c902a] ? start_kernel+0x2aa/0x390 @ [819c832a] ? x86_64_start_reservations+0x6a/0xa0 @ [819cc9b5] ? 
xen_start_kernel+0x315/0x440 @ Code: 00 55 48 89 e5 0f 1f 44 00 00 48 8b 87 60 01 00 00 31 c9 48 85 c0 75 0e @ 8b 57 54 e8 91 ff ff ff 84 c0 75 07 c9 c3 8b 48 54 eb ed 0f 0b 0f 1f 40 00 @ eb fa 0f 1f 80 00 00 00 00 0f 1f 80 00 00 00 @ RIP [8123757a] __blk_end_request_all+0x2a/0x40 @ RSP 8803ffc03df8 @ ---[ end trace b09ff97496363201 ]--- Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-merge.c | 29 +++-- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c
Re: [PATCH] block: fix uint overflow when merging io requests
On 06/27/2014 03:24 PM, Junxiao Bi wrote: > This uint overflow will cause req->__data_len < req->bio->bi_size, > this will confuse block layer and device driver. > > I watched a panic caused by this when mkfs.ext4 a volume of a large > virtual disk on vm guest, blkdev_issue_discard() issue two bio with > a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't > take affect due to the overflow and they were merged into one request. > After the request is done, in blk_end_request_all(), BUG_ON(pending) > was triggered and kernel panic. "pending" is true is because > blk_update_request() return ture when req->__data_len is less > than req->bio->bi_size. Any body help review this patch? blk_rq_sectors(), bio_sectors(), blk_rq_get_max_sectors() are all uint. blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req) This checking is bypassed when overflow happen. It will cause an io request's length less than its child bio's size. Thanks, Junxiao. > > Signed-off-by: Junxiao Bi > --- > block/blk-merge.c | 40 ++-- > 1 file changed, 34 insertions(+), 6 deletions(-) > > diff --git a/block/blk-merge.c b/block/blk-merge.c > index b3bf0df..340c0a7 100644 > --- a/block/blk-merge.c > +++ b/block/blk-merge.c > @@ -325,11 +325,41 @@ no_merge: > return 0; > } > > -int ll_back_merge_fn(struct request_queue *q, struct request *req, > +static inline bool ll_allow_merge_bio(struct request *req, >struct bio *bio) > { > if (blk_rq_sectors(req) + bio_sectors(bio) > > - blk_rq_get_max_sectors(req)) { > + blk_rq_get_max_sectors(req)) > + return false; > + > + /* check uint overflow */ > + if (blk_rq_sectors(req) + bio_sectors(bio) < blk_rq_sectors(req) > + || blk_rq_sectors(req) + bio_sectors(bio) < bio_sectors(bio)) > + return false; > + > + return true; > +} > + > +static inline bool ll_allow_merge_req(struct request *req, > + struct request *next) > +{ > + if (blk_rq_sectors(req) + blk_rq_sectors(next) > > + blk_rq_get_max_sectors(req)) > + return false; > + > + 
/* check uint overflow */ > + if (blk_rq_sectors(req) + blk_rq_sectors(next) < blk_rq_sectors(req) > + || blk_rq_sectors(req) + blk_rq_sectors(next) < > + blk_rq_sectors(next)) > + return false; > + > + return true; > +} > + > +int ll_back_merge_fn(struct request_queue *q, struct request *req, > + struct bio *bio) > +{ > + if (!ll_allow_merge_bio(req, bio)) { > req->cmd_flags |= REQ_NOMERGE; > if (req == q->last_merge) > q->last_merge = NULL; > @@ -346,8 +376,7 @@ int ll_back_merge_fn(struct request_queue *q, struct > request *req, > int ll_front_merge_fn(struct request_queue *q, struct request *req, > struct bio *bio) > { > - if (blk_rq_sectors(req) + bio_sectors(bio) > > - blk_rq_get_max_sectors(req)) { > + if (!ll_allow_merge_bio(req, bio)) { > req->cmd_flags |= REQ_NOMERGE; > if (req == q->last_merge) > q->last_merge = NULL; > @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, > struct request *req, > /* >* Will it become too large? >*/ > - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > > - blk_rq_get_max_sectors(req)) > + if (!ll_allow_merge_req(req, next)) > return 0; > > total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: fix uint overflow when merging io requests
On 06/27/2014 03:24 PM, Junxiao Bi wrote: This uint overflow will cause req-__data_len req-bio-bi_size, this will confuse block layer and device driver. I watched a panic caused by this when mkfs.ext4 a volume of a large virtual disk on vm guest, blkdev_issue_discard() issue two bio with a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't take affect due to the overflow and they were merged into one request. After the request is done, in blk_end_request_all(), BUG_ON(pending) was triggered and kernel panic. pending is true is because blk_update_request() return ture when req-__data_len is less than req-bio-bi_size. Any body help review this patch? blk_rq_sectors(), bio_sectors(), blk_rq_get_max_sectors() are all uint. blk_rq_sectors(req) + bio_sectors(bio) blk_rq_get_max_sectors(req) This checking is bypassed when overflow happen. It will cause an io request's length less than its child bio's size. Thanks, Junxiao. Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-merge.c | 40 ++-- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..340c0a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,41 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req, struct bio *bio) { if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + bio_sectors(bio) blk_rq_sectors(req) + || blk_rq_sectors(req) + bio_sectors(bio) bio_sectors(bio)) + return false; + + return true; +} + +static inline bool ll_allow_merge_req(struct request *req, + struct request *next) +{ + if (blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + blk_rq_sectors(next) blk_rq_sectors(req) + 
|| blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_sectors(next)) + return false; + + return true; +} + +int ll_back_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio) +{ + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -346,8 +376,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) - blk_rq_get_max_sectors(req)) + if (!ll_allow_merge_req(req, next)) return 0; total_phys_segments = req-nr_phys_segments + next-nr_phys_segments; -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: fix uint overflow when merging io requests
This uint overflow will cause req->__data_len < req->bio->bi_size, this will confuse block layer and device driver. I watched a panic caused by this when mkfs.ext4 a volume of a large virtual disk on vm guest, blkdev_issue_discard() issue two bio with a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't take affect due to the overflow and they were merged into one request. After the request is done, in blk_end_request_all(), BUG_ON(pending) was triggered and kernel panic. "pending" is true is because blk_update_request() return ture when req->__data_len is less than req->bio->bi_size. Signed-off-by: Junxiao Bi --- block/blk-merge.c | 40 ++-- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..340c0a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,41 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req, struct bio *bio) { if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req)) { + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + bio_sectors(bio) < blk_rq_sectors(req) + || blk_rq_sectors(req) + bio_sectors(bio) < bio_sectors(bio)) + return false; + + return true; +} + +static inline bool ll_allow_merge_req(struct request *req, +struct request *next) +{ + if (blk_rq_sectors(req) + blk_rq_sectors(next) > + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + blk_rq_sectors(next) < blk_rq_sectors(req) + || blk_rq_sectors(req) + blk_rq_sectors(next) < + blk_rq_sectors(next)) + return false; + + return true; +} + +int ll_back_merge_fn(struct request_queue *q, struct request *req, +struct bio *bio) +{ + if (!ll_allow_merge_bio(req, bio)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -346,8 +376,7 @@ int 
ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req)) { + if (!ll_allow_merge_bio(req, bio)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > - blk_rq_get_max_sectors(req)) + if (!ll_allow_merge_req(req, next)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: fix uint overflow when merging io requests
This uint overflow will cause req-__data_len req-bio-bi_size, this will confuse block layer and device driver. I watched a panic caused by this when mkfs.ext4 a volume of a large virtual disk on vm guest, blkdev_issue_discard() issue two bio with a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't take affect due to the overflow and they were merged into one request. After the request is done, in blk_end_request_all(), BUG_ON(pending) was triggered and kernel panic. pending is true is because blk_update_request() return ture when req-__data_len is less than req-bio-bi_size. Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-merge.c | 40 ++-- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..340c0a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,41 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req, struct bio *bio) { if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + bio_sectors(bio) blk_rq_sectors(req) + || blk_rq_sectors(req) + bio_sectors(bio) bio_sectors(bio)) + return false; + + return true; +} + +static inline bool ll_allow_merge_req(struct request *req, +struct request *next) +{ + if (blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + blk_rq_sectors(next) blk_rq_sectors(req) + || blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_sectors(next)) + return false; + + return true; +} + +int ll_back_merge_fn(struct request_queue *q, struct request *req, +struct bio *bio) +{ + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -346,8 +376,7 @@ int 
ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) - blk_rq_get_max_sectors(req)) + if (!ll_allow_merge_req(req, next)) return 0; total_phys_segments = req-nr_phys_segments + next-nr_phys_segments; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 11:12 AM, Jens Axboe wrote: > On 2014-06-09 20:50, Junxiao Bi wrote: >> On 06/10/2014 10:41 AM, Jens Axboe wrote: >>> On 2014-06-09 20:31, Junxiao Bi wrote: >>>> commit 7b5a3522 (loop: Limit the number of requests in the bio list) >>>> limit >>>> the request number in loop queue to not over 128. Since the >>>> "request_fn" of >>>> loop device is null, the requests number is not allowed tuned. Make >>>> it tunable >>>> from sysfs can improve performance. >>>> >>>> The following test is done on a machine with 512M memory. The >>>> backend of >>>> /dev/loop1 is a nfs file. >>>> >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 128 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s >>>> [root@bijx mnt]# >>>> [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 1024 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s >>>> >>>> Signed-off-by: Junxiao Bi >>>> --- >>>>block/blk-core.c |6 ++ >>>>block/blk-sysfs.c |9 +++-- >>>>2 files changed, 9 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/block/blk-core.c b/block/blk-core.c >>>> index 40d6548..58c4bd4 100644 >>>> --- a/block/blk-core.c >>>> +++ b/block/blk-core.c >>>> @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue >>>> *q, unsigned int nr) >>>>q->nr_requests = nr; >>>>blk_queue_congestion_threshold(q); >>>> >>>> +/* for loop device, return after set its nr_requests */ >>>> +if (!q->request_fn) { >>>> +spin_unlock_irq(q->queue_lock); >>>> +return 0; >>>> +} >>> >>> It'd be prettier to split this differently - something ala: >>> >>> if (request_fn) >>> blk_update_congestion_thresholds(q); >> The congestion threshholds is needed in commit 7b5a3522 (loop: 
Limit the >> number of requests in the bio list). So I think it needs be set even >> request_fn is null. > > I mean the request list thresholds, the part below where you currently > just exit. > >>> But I think you have a larger issue here... For the request lists, we >>> update the congestion thresholds and wakeup anyone waiting, if we need >>> to. There's no way to do that for loop, since the waitqueue is >>> internal to loop. >> Loop do the congestion control by itself, in loop_make_request() / >> loop_thread(). > > Yes, that is my point! You update nr_congestion_off, but you don't > wake anyone currently sitting in wait_event_lock_irq() on that value. > See what the code below where you just exit does for request list > based devices. Jens, do you have an idea to resolve it? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 11:12 AM, Jens Axboe wrote: On 2014-06-09 20:50, Junxiao Bi wrote: On 06/10/2014 10:41 AM, Jens Axboe wrote: On 2014-06-09 20:31, Junxiao Bi wrote: commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Since the request_fn of loop device is null, the requests number is not allowed tuned. Make it tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-core.c |6 ++ block/blk-sysfs.c |9 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d6548..58c4bd4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) q-nr_requests = nr; blk_queue_congestion_threshold(q); +/* for loop device, return after set its nr_requests */ +if (!q-request_fn) { +spin_unlock_irq(q-queue_lock); +return 0; +} It'd be prettier to split this differently - something ala: if (request_fn) blk_update_congestion_thresholds(q); The congestion threshholds is needed in commit 7b5a3522 (loop: Limit the number of requests in the bio list). So I think it needs be set even request_fn is null. I mean the request list thresholds, the part below where you currently just exit. But I think you have a larger issue here... 
For the request lists, we update the congestion thresholds and wakeup anyone waiting, if we need to. There's no way to do that for loop, since the waitqueue is internal to loop. Loop do the congestion control by itself, in loop_make_request() / loop_thread(). Yes, that is my point! You update nr_congestion_off, but you don't wake anyone currently sitting in wait_event_lock_irq() on that value. See what the code below where you just exit does for request list based devices. Jens, do you have an idea to resolve it? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 11:12 AM, Jens Axboe wrote: > On 2014-06-09 20:50, Junxiao Bi wrote: >> On 06/10/2014 10:41 AM, Jens Axboe wrote: >>> On 2014-06-09 20:31, Junxiao Bi wrote: >>>> commit 7b5a3522 (loop: Limit the number of requests in the bio list) >>>> limit >>>> the request number in loop queue to not over 128. Since the >>>> "request_fn" of >>>> loop device is null, the requests number is not allowed tuned. Make >>>> it tunable >>>> from sysfs can improve performance. >>>> >>>> The following test is done on a machine with 512M memory. The >>>> backend of >>>> /dev/loop1 is a nfs file. >>>> >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 128 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s >>>> [root@bijx mnt]# >>>> [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 1024 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s >>>> >>>> Signed-off-by: Junxiao Bi >>>> --- >>>>block/blk-core.c |6 ++ >>>>block/blk-sysfs.c |9 +++-- >>>>2 files changed, 9 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/block/blk-core.c b/block/blk-core.c >>>> index 40d6548..58c4bd4 100644 >>>> --- a/block/blk-core.c >>>> +++ b/block/blk-core.c >>>> @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue >>>> *q, unsigned int nr) >>>>q->nr_requests = nr; >>>>blk_queue_congestion_threshold(q); >>>> >>>> +/* for loop device, return after set its nr_requests */ >>>> +if (!q->request_fn) { >>>> +spin_unlock_irq(q->queue_lock); >>>> +return 0; >>>> +} >>> >>> It'd be prettier to split this differently - something ala: >>> >>> if (request_fn) >>> blk_update_congestion_thresholds(q); >> The congestion threshholds is needed in commit 7b5a3522 (loop: 
Limit the >> number of requests in the bio list). So I think it needs be set even >> request_fn is null. > > I mean the request list thresholds, the part below where you currently > just exit. > >>> But I think you have a larger issue here... For the request lists, we >>> update the congestion thresholds and wakeup anyone waiting, if we need >>> to. There's no way to do that for loop, since the waitqueue is >>> internal to loop. >> Loop do the congestion control by itself, in loop_make_request() / >> loop_thread(). > > Yes, that is my point! You update nr_congestion_off, but you don't > wake anyone currently sitting in wait_event_lock_irq() on that value. > See what the code below where you just exit does for request list > based devices. Ah, i see. It can't be wake up once nr_congestion_off is updated. But after a little delay, loop_thread will consume the requests in list and wake up it. Is this OK? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 10:41 AM, Jens Axboe wrote: > On 2014-06-09 20:31, Junxiao Bi wrote: >> commit 7b5a3522 (loop: Limit the number of requests in the bio list) >> limit >> the request number in loop queue to not over 128. Since the >> "request_fn" of >> loop device is null, the requests number is not allowed tuned. Make >> it tunable >> from sysfs can improve performance. >> >> The following test is done on a machine with 512M memory. The backend of >> /dev/loop1 is a nfs file. >> >> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >> 128 >> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >> 5000+0 records in >> 5000+0 records out >> 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s >> [root@bijx mnt]# >> [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests >> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >> 1024 >> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >> 5000+0 records in >> 5000+0 records out >> 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s >> >> Signed-off-by: Junxiao Bi >> --- >> block/blk-core.c |6 ++ >> block/blk-sysfs.c |9 +++-- >> 2 files changed, 9 insertions(+), 6 deletions(-) >> >> diff --git a/block/blk-core.c b/block/blk-core.c >> index 40d6548..58c4bd4 100644 >> --- a/block/blk-core.c >> +++ b/block/blk-core.c >> @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue >> *q, unsigned int nr) >> q->nr_requests = nr; >> blk_queue_congestion_threshold(q); >> >> +/* for loop device, return after set its nr_requests */ >> +if (!q->request_fn) { >> +spin_unlock_irq(q->queue_lock); >> +return 0; >> +} > > It'd be prettier to split this differently - something ala: > > if (request_fn) > blk_update_congestion_thresholds(q); The congestion threshholds is needed in commit 7b5a3522 (loop: Limit the number of requests in the bio list). So I think it needs be set even request_fn is null. > > But I think you have a larger issue here... 
For the request lists, we > update the congestion thresholds and wakeup anyone waiting, if we need > to. There's no way to do that for loop, since the waitqueue is > internal to loop. Loop do the congestion control by itself, in loop_make_request() / loop_thread(). -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH V2] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Since the "request_fn" of loop device is null, the requests number is not allowed tuned. Make it tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi --- block/blk-core.c |6 ++ block/blk-sysfs.c |9 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d6548..58c4bd4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) q->nr_requests = nr; blk_queue_congestion_threshold(q); + /* for loop device, return after set its nr_requests */ + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return 0; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = >root_rl; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 23321fb..c5456a5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -51,9 +51,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->request_fn && !q->mq_ops) - return -EINVAL; - ret = queue_var_store(, page, count); if (ret < 0) return ret; @@ -61,10 +58,10 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = 
BLKDEV_MIN_RQ; - if (q->request_fn) - err = blk_update_nr_requests(q, nr); - else + if (q->mq_ops) err = blk_mq_update_nr_requests(q, nr); + else + err = blk_update_nr_requests(q, nr); if (err) return err; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: make nr_requests tunable for loop
On 06/09/2014 11:53 PM, Jens Axboe wrote: > On 2014-06-09 01:29, Andreas Mohr wrote: >> Hi, >> >> having had a look at current mainline sources, >> frankly I've (well, initially...) got trouble understanding >> what this patch is doing. >> >> It's replacing an aggressive error-type bail-out (-EINVAL) for NULL >> request_fn >> with an inoccuous-looking "return ret;", yet that ret content currently >> *implicitly* is a >= 0 value (resulting from processing by earlier code >> which may or may not get incomprehensibly rewritten in future). >> I don't understand the reasons for this huge change in return value >> handling >> (since it's now not assigning a specific return value >> for this modified bail-out case). >> >> OK, well... you could say that since all this function ever was >> interested in is the result value of queue_var_store() >> (except for error bail-out cases), doing an interim "return ret;" >> (which is exactly what the function tail is also doing) >> is exactly right. >> >> But still simple textual appearance of the resulting patch hunks >> seems strangely asymmetric >> which may easily be a canary for structurally wrong layering of this >> function. >> Not to mention the now required extra spin_unlock_irq() >> in interim return handler... >> >> >> Well, after further analysis I would come to the conclusion >> that in general queue_requests_store() does a LOT more than it should - >> since blk-sysfs.c's only (expected!) purpose is >> to do parameterization of request_queue behaviour as gathered >> from sysfs attribute space, >> all that function should ever be concerned with is parsing that sysfs >> value >> and then calling a blk helper for configuration of that very >> attribute value >> which would *internally* do all the strange internal queue magic >> that is currently being updated *open-coded* >> at this supposedly *sysfs*-specific place. Ugh. 
>> Main question here: what would one do if one decided to rip out sysfs >> and use something entirely different for parameterization? >> Yeah indeed - thought so... >> >> >> So yeah, I'd definitely say that that function is lacking some cleanup >> which would possibly then even lead (or: would have led ;) >> to a much more nicely symmetric textual appearance >> of the patch hunk of the small but quite likely useful change >> that you currently intend to have here. > > If you are done ranting, look at the current tree where it has been > split out. There was no reason to have it split before, since the > sysfs entry point was the only place where we updated nr_requests. If > that code has been duplicated, there would have been a justified > reason for writing two pages about it. Yes, agree, this is the only place updating nr_requests, we can split it as a separated function if it needs updating at some other places in future. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Making the number tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi --- block/blk-sysfs.c |8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f87..193ad8a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -52,9 +52,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret; - if (!q->request_fn) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; @@ -66,6 +63,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return ret; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = &q->root_rl; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Making the number tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi <junxiao...@oracle.com> --- block/blk-sysfs.c |8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f87..193ad8a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -52,9 +52,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret; - if (!q->request_fn) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; @@ -66,6 +63,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return ret; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = &q->root_rl; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: make nr_requests tunable for loop
On 06/09/2014 11:53 PM, Jens Axboe wrote: On 2014-06-09 01:29, Andreas Mohr wrote: Hi, having had a look at current mainline sources, frankly I've (well, initially...) got trouble understanding what this patch is doing. It's replacing an aggressive error-type bail-out (-EINVAL) for NULL request_fn with an innocuous-looking "return ret;", yet that ret content currently *implicitly* is a >= 0 value (resulting from processing by earlier code which may or may not get incomprehensibly rewritten in future). I don't understand the reasons for this huge change in return value handling (since it's now not assigning a specific return value for this modified bail-out case). OK, well... you could say that since all this function ever was interested in is the result value of queue_var_store() (except for error bail-out cases), doing an interim "return ret;" (which is exactly what the function tail is also doing) is exactly right. But still simple textual appearance of the resulting patch hunks seems strangely asymmetric which may easily be a canary for structurally wrong layering of this function. Not to mention the now required extra spin_unlock_irq() in interim return handler... Well, after further analysis I would come to the conclusion that in general queue_requests_store() does a LOT more than it should - since blk-sysfs.c's only (expected!) purpose is to do parameterization of request_queue behaviour as gathered from sysfs attribute space, all that function should ever be concerned with is parsing that sysfs value and then calling a blk helper for configuration of that very attribute value which would *internally* do all the strange internal queue magic that is currently being updated *open-coded* at this supposedly *sysfs*-specific place. Ugh. Main question here: what would one do if one decided to rip out sysfs and use something entirely different for parameterization? Yeah indeed - thought so... 
So yeah, I'd definitely say that that function is lacking some cleanup which would possibly then even lead (or: would have led ;) to a much more nicely symmetric textual appearance of the patch hunk of the small but quite likely useful change that you currently intend to have here. If you are done ranting, look at the current tree where it has been split out. There was no reason to have it split before, since the sysfs entry point was the only place where we updated nr_requests. If that code has been duplicated, there would have been a justified reason for writing two pages about it. Yes, agree, this is the only place updating nr_requests, we can split it as a separated function if it needs updating at some other places in future. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH V2] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Since the "request_fn" of loop device is null, the requests number is not allowed to be tuned. Making it tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi <junxiao...@oracle.com> --- block/blk-core.c |6 ++ block/blk-sysfs.c |9 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d6548..58c4bd4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) q->nr_requests = nr; blk_queue_congestion_threshold(q); + /* for loop device, return after set its nr_requests */ + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return 0; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = &q->root_rl; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 23321fb..c5456a5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -51,9 +51,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->request_fn && !q->mq_ops) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; @@ -61,10 +58,10 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = 
BLKDEV_MIN_RQ; - if (q->request_fn) - err = blk_update_nr_requests(q, nr); - else + if (q->mq_ops) err = blk_mq_update_nr_requests(q, nr); + else + err = blk_update_nr_requests(q, nr); if (err) return err; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/