Re: [PATCH] block: fix io hung by block throttle
On 4/18/21 11:09 PM, Junxiao Bi wrote: - finish_wait(&rqw->wait, &data.wq); + mutex_lock(&rqw->throttle_mutex); + wait_event(rqw->wait, acquire_inflight_cb(rqw, private_data)); + mutex_unlock(&rqw->throttle_mutex); This will break the throttle? There is an inflight io limitation. With this change, there can be only one io inflight whatever the limit is. Sorry, ignore this. I should have been asleep at that time. Thanks, Junxiao.
Re: [PATCH] block: fix io hung by block throttle
On 4/18/21 5:33 AM, Hillf Danton wrote: On Sat, 17 Apr 2021 14:37:57 Junxiao Bi wrote: On 4/17/21 3:10 AM, Hillf Danton wrote: + if (acquire_inflight_cb(rqw, private_data)) This function is to increase atomic variable rq_wait->inflight. You are right. What's the mutex for? It cuts the race between we peek at the sleepers on rqw->wait while they are coming and going, and we cant update rqw->inflight without making sure there are no sleepers. Why? I think checking the sleeper in original code is for a fast path. For wbt, acquire_inflight_cb is wbt_inflight_cb where atomic_inc_below is used to update rqw->inflight. I don't see why a mutex is needed for this atomic operation. With the mutex in place, in addition to the certainty of !sleepers, we can avoid the race between us and waker in terms of updating inflight by removing the invokation of acquire_inflight_cb in the wakeup callback, and the bonus is we no longer need the wakeup cb and the rq_qos_wait_data because the more traditional wait_event() can do the job. Finally we can dump the cleanup_cb_t. +++ b/block/blk-rq-qos.c @@ -200,96 +200,24 @@ bool rq_depth_scale_down(struct rq_depth return true; } -struct rq_qos_wait_data { - struct wait_queue_entry wq; - struct task_struct *task; - struct rq_wait *rqw; - acquire_inflight_cb_t *cb; - void *private_data; - bool got_token; -}; - -static int rq_qos_wake_function(struct wait_queue_entry *curr, - unsigned int mode, int wake_flags, void *key) -{ - struct rq_qos_wait_data *data = container_of(curr, -struct rq_qos_wait_data, -wq); - - /* -* If we fail to get a budget, return -1 to interrupt the wake up loop -* in __wake_up_common. 
-*/ - if (!data->cb(data->rqw, data->private_data)) - return -1; - - data->got_token = true; - smp_wmb(); - list_del_init(>entry); - wake_up_process(data->task); - return 1; -} - /** * rq_qos_wait - throttle on a rqw if we need to * @rqw: rqw to throttle on * @private_data: caller provided specific data * @acquire_inflight_cb: inc the rqw->inflight counter if we can - * @cleanup_cb: the callback to cleanup in case we race with a waker * * This provides a uniform place for the rq_qos users to do their throttling. * Since you can end up with a lot of things sleeping at once, this manages the * waking up based on the resources available. The acquire_inflight_cb should * inc the rqw->inflight if we have the ability to do so, or return false if not * and then we will sleep until the room becomes available. - * - * cleanup_cb is in case that we race with a waker and need to cleanup the - * inflight count accordingly. */ void rq_qos_wait(struct rq_wait *rqw, void *private_data, -acquire_inflight_cb_t *acquire_inflight_cb, -cleanup_cb_t *cleanup_cb) +acquire_inflight_cb_t *acquire_inflight_cb) { - struct rq_qos_wait_data data = { - .wq = { - .func = rq_qos_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, - .rqw = rqw, - .cb = acquire_inflight_cb, - .private_data = private_data, - }; - bool has_sleeper; - - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) - return; - - prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); - do { - /* The memory barrier in set_task_state saves us here. */ - if (data.got_token) - break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { - finish_wait(>wait, ); - - /* -* We raced with wbt_wake_function() getting a token, -* which means we now have two. Put our local token -* and wake anyone else potentially waiting for one. 
-*/ - smp_rmb(); - if (data.got_token) - cleanup_cb(rqw, private_data); - break; - } - io_schedule(); - has_sleeper = true; - set_current_state(TASK_UNINTERRUPTIBLE); - } while (1); - finish_wait(>wait, ); + mutex_lock(>throttle_mutex); + wait_event(rqw->wait, acquire_inflight_cb(rqw, private_data)); + mutex_unlock(>throttle_mutex); This will break the throttle? Th
Re: [PATCH] block: fix io hung by block throttle
On 4/17/21 3:10 AM, Hillf Danton wrote: --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,17 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + if (!wq_has_sleeper(>wait) + && acquire_inflight_cb(rqw, private_data)) return; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); Simply removing !has_sleeper is not enough if it is mandatory before acquire_inflight_cb() without adding something like a mutex to sieve the concurrent sleepers out, see below. --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,18 @@ void rq_qos_wait(struct rq_wait *rqw, vo .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) - return; + mutex_lock(>mutex); + + if (acquire_inflight_cb(rqw, private_data)) This function is to increase atomic variable rq_wait->inflight. What's the mutex for? Thanks, Junxiao. + goto out; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* @@ -286,10 +285,11 @@ void rq_qos_wait(struct rq_wait *rqw, vo break; } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); +out: + mutex_unlock(>mutex); }
Re: [PATCH] block: fix io hung by block throttle
On 4/14/21 9:11 PM, Hillf Danton wrote: On Wed, 14 Apr 2021 14:18:30 Junxiao Bi wrote: There is a race bug which can cause io hung when multiple processes run parallel in rq_qos_wait(). Let assume there were 4 processes P1/P2/P3/P4, P1/P2 were at the entry of rq_qos_wait, and P3/P4 were waiting for io done, 2 io were inflight, the inflight io limit was 2. See race below. void rq_qos_wait() { ... bool has_sleeper; >>>> P3/P4 were in sleeper list, has_sleeper was true for both P1 and P2. has_sleeper = wq_has_sleeper(>wait); if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; >>>> 2 inflight io done, P3/P4 were waken up to issue 2 new io. >>>> 2 new io done, no inflight io. >>>> P1/P2 were added to the sleeper list, 2 entry in the list prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); >>>> P1/P2 were in the sleeper list, has_sleeper was true for P1/P2. has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); break; } >>>> P1/P2 hung here forever. New io requests will also hung here. 
io_schedule(); has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); } Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- block/blk-rq-qos.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..04d888c99bc0 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,17 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + if (!wq_has_sleeper(>wait) + && acquire_inflight_cb(rqw, private_data)) return; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* @@ -286,7 +284,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, break; } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); -- 2.24.3 (Apple Git-128) No wakeup may cause the hang. --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -287,7 +287,8 @@ void rq_qos_wait(struct rq_wait *rqw, vo } io_schedule(); has_sleeper = true; - set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(>wait, , + TASK_UNINTERRUPTIBLE); From rq_qos_wake_function(), the process can be waken up and removed from the sleeper list only when it get the budget. Looks not necessary to re-add it to sleeper list again. Thanks, Junxiao. } while (1); finish_wait(>wait, ); }
[PATCH] block: fix io hung by block throttle
There is a race bug which can cause io hung when multiple processes run parallel in rq_qos_wait(). Let assume there were 4 processes P1/P2/P3/P4, P1/P2 were at the entry of rq_qos_wait, and P3/P4 were waiting for io done, 2 io were inflight, the inflight io limit was 2. See race below. void rq_qos_wait() { ... bool has_sleeper; >>>> P3/P4 were in sleeper list, has_sleeper was true for both P1 and P2. has_sleeper = wq_has_sleeper(>wait); if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; >>>> 2 inflight io done, P3/P4 were waken up to issue 2 new io. >>>> 2 new io done, no inflight io. >>>> P1/P2 were added to the sleeper list, 2 entry in the list prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); >>>> P1/P2 were in the sleeper list, has_sleeper was true for P1/P2. has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* * We raced with wbt_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. */ smp_rmb(); if (data.got_token) cleanup_cb(rqw, private_data); break; } >>>> P1/P2 hung here forever. New io requests will also hung here. 
io_schedule(); has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); } Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- block/blk-rq-qos.c | 9 +++-- 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..04d888c99bc0 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -260,19 +260,17 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, .cb = acquire_inflight_cb, .private_data = private_data, }; - bool has_sleeper; - has_sleeper = wq_has_sleeper(>wait); - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + if (!wq_has_sleeper(>wait) + && acquire_inflight_cb(rqw, private_data)) return; prepare_to_wait_exclusive(>wait, , TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(>wait); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) break; - if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + if (acquire_inflight_cb(rqw, private_data)) { finish_wait(>wait, ); /* @@ -286,7 +284,6 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, break; } io_schedule(); - has_sleeper = true; set_current_state(TASK_UNINTERRUPTIBLE); } while (1); finish_wait(>wait, ); -- 2.24.3 (Apple Git-128)
Re: Race condition in Kernel
On 3/24/21 5:37 PM, Ming Lei wrote: On Wed, Mar 24, 2021 at 12:37:03PM +, Gulam Mohamed wrote: Hi All, We are facing a stale link (of the device) issue during the iscsi-logout process if we use parted command just before the iscsi logout. Here are the details: As part of iscsi logout, the partitions and the disk will be removed. The parted command, used to list the partitions, will open the disk in RW mode which results in systemd-udevd re-reading the partitions. This will trigger the rescan partitions which will also delete and re-add the partitions. So, both iscsi logout processing and the parted (through systemd-udevd) will be involved in add/delete of partitions. In our case, the following sequence of operations happened (the iscsi device is /dev/sdb with partition sdb1): 1. sdb1 was removed by PARTED 2. kworker, as part of iscsi logout, couldn't remove sdb1 as it was already removed by PARTED 3. sdb1 was added by parted 4. sdb was NOW removed as part of iscsi logout (the last part of the device removal after remoing the partitions) Since the symlink /sys/class/block/sdb1 points to /sys/class/devices/platform/hostx/sessionx/targetx:x:x:x/x:x:x:x/block/sdb/sdb1 and since sdb is already removed, the symlink /sys/class/block/sdb1 will be orphan and stale. So, this stale link is a result of the race condition in kernel between the systemd-udevd and iscsi-logout processing as described above. We are able to reproduce this even with latest upstream kernel. We have come across a patch from Ming Lei which was created for "avoid to drop & re-add partitions if partitions aren't changed": https://lore.kernel.org/linux-block/20210216084430.ga23...@lst.de/T/ BTW, there is a newer version of this patchset: https://lore.kernel.org/linux-block/20210224081825.ga1...@lst.de/#r This patch could resolve our problem of stale link but it just seems to be a work-around and not the actual fix for the race. We were looking for help to fix this race in kernel. 
Do you have any idea how to fix this race condition? IMO, that isn't a work-around, kernel shouldn't drop partitions if the partition table isn't changed. But Christoph thought the current approach has been taken since the beginning of the kernel, and he suggested fixing systemd-udev. This is a real kernel bug. Whatever BLKRRPART does, it should not cause this sysfs stale link issue. After this issue happens, there is no way to remove that stale link except reboot. The situation is even worse when logging a new disk back in: since it will reuse the disk number of the old one, it will fail when it creates the symbolic link because the stale link is still there. Thanks, Junxiao. Thanks, Ming
Re: [PATCH RFC 0/8] dcache: increase poison resistance
Hi Konstantin, How would you like to proceed with this patch set? This patchset as it is already fixed the customer issue we faced, it will stop memory fragmentation causing by negative dentry and no performance regression through our test. In production workload, it is common that some app kept creating and removing tmp files, this will leave a lot of negative dentry over time, some time later, it will cause memory fragmentation and system run into memory compaction and not responsible. It will be good to push it to upstream merge. If you are busy, we can try push it again. Thanks, Junxiao. On 12/14/20 3:10 PM, Junxiao Bi wrote: On 12/13/20 11:43 PM, Konstantin Khlebnikov wrote: On Sun, Dec 13, 2020 at 9:52 PM Junxiao Bi <mailto:junxiao...@oracle.com>> wrote: On 12/11/20 11:32 PM, Konstantin Khlebnikov wrote: > On Thu, Dec 10, 2020 at 2:01 AM Junxiao Bi mailto:junxiao...@oracle.com> > <mailto:junxiao...@oracle.com <mailto:junxiao...@oracle.com>>> wrote: > > Hi Konstantin, > > We tested this patch set recently and found it limiting negative > dentry > to a small part of total memory. And also we don't see any > performance > regression on it. Do you have any plan to integrate it into > mainline? It > will help a lot on memory fragmentation issue causing by dentry slab, > there were a lot of customer cases where sys% was very high since > most > cpu were doing memory compaction, dentry slab was taking too much > memory > and nearly all dentry there were negative. > > > Right now I don't have any plans for this. I suspect such problems will > appear much more often since machines are getting bigger. > So, somebody will take care of it. We already had a lot of customer cases. It made no sense to leave so many negative dentry in the system, it caused memory fragmentation and not much benefit. Dcache could grow so big only if the system lacks of memory pressure. 
Simplest solution is a cronjob which provinces such pressure by creating sparse file on disk-based fs and then reading it. This should wash away all inactive caches with no IO and zero chance of oom. Sound good, will try. > > First part which collects negative dentries at the end list of > siblings could be > done in a more obvious way by splitting the list in two. > But this touches much more code. That would add new field to dentry? Yep. Decision is up to maintainers. > > Last patch isn't very rigid but does non-trivial changes. > Probably it's better to call some garbage collector thingy periodically. > Lru list needs pressure to age and reorder entries properly. Swap the negative dentry to the head of hash list when it get accessed? Extra ones can be easily trimmed when swapping, using GC is to reduce perf impact? Reclaimer/shrinker scans denties in LRU lists, it's an another list. Ah, you mean GC to reclaim from LRU list. I am not sure it could catch up the speed of negative dentry generating. Thanks, Junxiao. My patch used order in hash lists is a very unusual way. Don't be confused. There are four lists parent - siblings hashtable - hashchain LRU inode - alias Thanks, Junxioao. > > Gc could be off by default or thresholds set very high (50% of ram for > example). > Final setup could be left up to owners of large systems, which needs > fine tuning.
Re: [PATCH RFC 0/8] dcache: increase poison resistance
On 12/13/20 11:43 PM, Konstantin Khlebnikov wrote: On Sun, Dec 13, 2020 at 9:52 PM Junxiao Bi <mailto:junxiao...@oracle.com>> wrote: On 12/11/20 11:32 PM, Konstantin Khlebnikov wrote: > On Thu, Dec 10, 2020 at 2:01 AM Junxiao Bi mailto:junxiao...@oracle.com> > <mailto:junxiao...@oracle.com <mailto:junxiao...@oracle.com>>> wrote: > > Hi Konstantin, > > We tested this patch set recently and found it limiting negative > dentry > to a small part of total memory. And also we don't see any > performance > regression on it. Do you have any plan to integrate it into > mainline? It > will help a lot on memory fragmentation issue causing by dentry slab, > there were a lot of customer cases where sys% was very high since > most > cpu were doing memory compaction, dentry slab was taking too much > memory > and nearly all dentry there were negative. > > > Right now I don't have any plans for this. I suspect such problems will > appear much more often since machines are getting bigger. > So, somebody will take care of it. We already had a lot of customer cases. It made no sense to leave so many negative dentry in the system, it caused memory fragmentation and not much benefit. Dcache could grow so big only if the system lacks of memory pressure. Simplest solution is a cronjob which provinces such pressure by creating sparse file on disk-based fs and then reading it. This should wash away all inactive caches with no IO and zero chance of oom. Sound good, will try. > > First part which collects negative dentries at the end list of > siblings could be > done in a more obvious way by splitting the list in two. > But this touches much more code. That would add new field to dentry? Yep. Decision is up to maintainers. > > Last patch isn't very rigid but does non-trivial changes. > Probably it's better to call some garbage collector thingy periodically. > Lru list needs pressure to age and reorder entries properly. Swap the negative dentry to the head of hash list when it get accessed? 
Extra ones can be easily trimmed when swapping, using GC is to reduce perf impact? Reclaimer/shrinker scans denties in LRU lists, it's an another list. Ah, you mean GC to reclaim from LRU list. I am not sure it could catch up the speed of negative dentry generating. Thanks, Junxiao. My patch used order in hash lists is a very unusual way. Don't be confused. There are four lists parent - siblings hashtable - hashchain LRU inode - alias Thanks, Junxioao. > > Gc could be off by default or thresholds set very high (50% of ram for > example). > Final setup could be left up to owners of large systems, which needs > fine tuning.
Re: [PATCH RFC 0/8] dcache: increase poison resistance
On 12/11/20 11:32 PM, Konstantin Khlebnikov wrote: On Thu, Dec 10, 2020 at 2:01 AM Junxiao Bi <mailto:junxiao...@oracle.com>> wrote: Hi Konstantin, We tested this patch set recently and found it limiting negative dentry to a small part of total memory. And also we don't see any performance regression on it. Do you have any plan to integrate it into mainline? It will help a lot on memory fragmentation issue causing by dentry slab, there were a lot of customer cases where sys% was very high since most cpu were doing memory compaction, dentry slab was taking too much memory and nearly all dentry there were negative. Right now I don't have any plans for this. I suspect such problems will appear much more often since machines are getting bigger. So, somebody will take care of it. We already had a lot of customer cases. It made no sense to leave so many negative dentry in the system, it caused memory fragmentation and not much benefit. First part which collects negative dentries at the end list of siblings could be done in a more obvious way by splitting the list in two. But this touches much more code. That would add new field to dentry? Last patch isn't very rigid but does non-trivial changes. Probably it's better to call some garbage collector thingy periodically. Lru list needs pressure to age and reorder entries properly. Swap the negative dentry to the head of hash list when it get accessed? Extra ones can be easily trimmed when swapping, using GC is to reduce perf impact? Thanks, Junxioao. Gc could be off by default or thresholds set very high (50% of ram for example). Final setup could be left up to owners of large systems, which needs fine tuning.
Re: [PATCH RFC 0/8] dcache: increase poison resistance
Hi Konstantin, We tested this patch set recently and found it limiting negative dentry to a small part of total memory. And also we don't see any performance regression on it. Do you have any plan to integrate it into mainline? It will help a lot on memory fragmentation issue causing by dentry slab, there were a lot of customer cases where sys% was very high since most cpu were doing memory compaction, dentry slab was taking too much memory and nearly all dentry there were negative. The following is test result we run on two types of servers, one is 256G memory with 24 CPUS and another is 3T memory with 384 CPUS. The test case is using a lot of processes to generate negative dentry in parallel, the following is the test result after 72 hours, the negative dentry number is stable around that number even running longer time. If without the patch set, in less than half an hour 197G was took by negative dentry on 256G system, in 1 day 2.4T was took on 3T system. neg-dentry-number neg-dentry-mem-usage 256G 55259084 10.6G 3T 202306756 38.8G For perf test, we run the following, and no regression found. - create 1M negative dentry and then touch them to convert them to positive dentry - create 10K/100K/1M files - remove 10K/100K/1M files - kernel compile To verify the fsnotify fix, we used inotifywait to watch file create/open in some directory where there is a lot of negative dentry, without the patch set, the system will run into soft lockup, with it, no soft lockup. We also try to defeat the limitation by making different processes generating negative dentry with the same naming way, that will make one negative dentry being accessed couple times around same time, DCACHE_REFERENCED will be set on it and then it can't be trimmed easily. We do see negative dentry will take all the memory slowly from one of our system with 120G memory, for above two system, we see the memory usage were increased, but still a small part of total memory. 
This looks ok, since the common negative dentry user case will be create some temp files and then remove it, it will be rare to access same negative dentry around same time. Thanks, Junxiao. On 5/8/20 5:23 AM, Konstantin Khlebnikov wrote: For most filesystems result of every negative lookup is cached, content of directories is usually cached too. Production of negative dentries isn't limited with disk speed. It's really easy to generate millions of them if system has enough memory. Getting this memory back ins't that easy because slab frees pages only when all related objects are gone. While dcache shrinker works in LRU order. Typical scenario is an idle system where some process periodically creates temporary files and removes them. After some time, memory will be filled with negative dentries for these random file names. Simple lookup of random names also generates negative dentries very fast. Constant flow of such negative denries drains all other inactive caches. Negative dentries are linked into siblings list along with normal positive dentries. Some operations walks dcache tree but looks only for positive dentries: most important is fsnotify/inotify. Hordes of negative dentries slow down these operations significantly. Time of dentry lookup is usually unaffected because hash table grows along with size of memory. Unless somebody especially crafts hash collisions. This patch set solves all of these problems: Move negative denries to the end of sliblings list, thus walkers could skip them at first sight (patches 3-6). Keep in dcache at most three unreferenced negative denties in row in each hash bucket (patches 7-8). 
--- Konstantin Khlebnikov (8): dcache: show count of hash buckets in sysctl fs.dentry-state selftests: add stress testing tool for dcache dcache: sweep cached negative dentries to the end of list of siblings fsnotify: stop walking child dentries if remaining tail is negative dcache: add action D_WALK_SKIP_SIBLINGS to d_walk() dcache: stop walking siblings if remaining dentries all negative dcache: push releasing dentry lock into sweep_negative dcache: prevent flooding with negative dentries fs/dcache.c | 144 +++- fs/libfs.c| 10 +- fs/notify/fsnotify.c | 6 +- include/linux/dcache.h| 6 + tools/testing/selftests/filesystems/Makefile | 1 + .../selftests/filesystems/dcache_stress.c | 210 ++ 6 files changed, 370 insertions(+), 7 deletions(-) create mode 100644 tools/testing/selftests/filesystems/dcache_stress.c -- Signature
Re: [md] e1a86dbbbd: mdadm-selftests.benchmarks/mdadm-selftests/tests/07layouts.fail
This issue had been fixed. I send the following patch in another thread. Please take a look. Thank you. [PATCH] md: get sysfs entry after redundancy attr group create Thanks, Junxiao. On 8/3/20 9:00 AM, Junxiao Bi wrote: Hi Song, I am working on setup an env to reproduce, will update soon. Thanks, Junxiao. On 8/2/20 10:52 PM, Song Liu wrote: On Jul 29, 2020, at 2:04 AM, kernel test robot wrote: Greeting, FYI, we noticed the following commit (built with gcc-9): commit: e1a86dbbbd6a77f73c3d099030495fa31f181e2f ("md: fix deadlock causing by sysfs_notify") https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master in testcase: mdadm-selftests with following parameters: disk: 1HDD test_prefix: 07layout ucode: 0x21 on test machine: 4 threads Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz with 4G memory caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace): If you fix the issue, kindly add following tag Reported-by: kernel test robot 2020-07-29 01:06:34 mkdir -p /var/tmp 2020-07-29 01:06:34 mke2fs -t ext3 -b 4096 -J size=4 -q /dev/sda3 2020-07-29 01:07:36 mount -t ext3 /dev/sda3 /var/tmp sed -e 's/{DEFAULT_METADATA}/1.2/g' \ -e 's,{MAP_PATH},/run/mdadm/map,g' mdadm.8.in > mdadm.8 /usr/bin/install -D -m 644 mdadm.8 /usr/share/man/man8/mdadm.8 /usr/bin/install -D -m 644 mdmon.8 /usr/share/man/man8/mdmon.8 /usr/bin/install -D -m 644 md.4 /usr/share/man/man4/md.4 /usr/bin/install -D -m 644 mdadm.conf.5 /usr/share/man/man5/mdadm.conf.5 /usr/bin/install -D -m 644 udev-md-raid-creating.rules /lib/udev/rules.d/01-md-raid-creating.rules /usr/bin/install -D -m 644 udev-md-raid-arrays.rules /lib/udev/rules.d/63-md-raid-arrays.rules /usr/bin/install -D -m 644 udev-md-raid-assembly.rules /lib/udev/rules.d/64-md-raid-assembly.rules /usr/bin/install -D -m 644 udev-md-clustered-confirm-device.rules /lib/udev/rules.d/69-md-clustered-confirm-device.rules /usr/bin/install -D -m 755 mdadm /sbin/mdadm /usr/bin/install -D -m 755 mdmon /sbin/mdmon Testing on 
linux-5.8.0-rc4-00129-ge1a86dbbbd6a7 kernel /lkp/benchmarks/mdadm-selftests/tests/07layouts... FAILED - see /var/tmp/07layouts.log and /var/tmp/fail07layouts.log for details 07layouts TIMEOUT To reproduce: git clone https://github.com/intel/lkp-tests.git cd lkp-tests bin/lkp install job.yaml # job file is attached in this email bin/lkp run job.yaml Thanks, Rong Chen <07layouts.log> Hi Junxiao, Could you please look into this issue? Thanks, Song
[PATCH] md: get sysfs entry after redundancy attr group create
"sync_completed" and "degraded" belongs to redundancy attr group, it was not exist yet when md device was created. Reported-by: kernel test robot Fixes: e1a86dbbbd6a ("md: fix deadlock causing by sysfs_notify") Signed-off-by: Junxiao Bi --- drivers/md/md.c | 17 ++--- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fee8943ead7b..60d2142c4693 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -846,7 +846,13 @@ void mddev_unlock(struct mddev *mddev) sysfs_remove_group(>kobj, _redundancy_group); if (mddev->sysfs_action) sysfs_put(mddev->sysfs_action); + if (mddev->sysfs_completed) + sysfs_put(mddev->sysfs_completed); + if (mddev->sysfs_degraded) + sysfs_put(mddev->sysfs_degraded); mddev->sysfs_action = NULL; + mddev->sysfs_completed = NULL; + mddev->sysfs_degraded = NULL; } } mddev->sysfs_active = 0; @@ -4036,6 +4042,8 @@ level_store(struct mddev *mddev, const char *buf, size_t len) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } if (oldpers->sync_request != NULL && pers->sync_request == NULL) { @@ -5542,14 +5550,9 @@ static void md_free(struct kobject *ko) if (mddev->sysfs_state) sysfs_put(mddev->sysfs_state); - if (mddev->sysfs_completed) - sysfs_put(mddev->sysfs_completed); - if (mddev->sysfs_degraded) - sysfs_put(mddev->sysfs_degraded); if (mddev->sysfs_level) sysfs_put(mddev->sysfs_level); - if (mddev->gendisk) del_gendisk(mddev->gendisk); if (mddev->queue) @@ -5710,8 +5713,6 @@ static int md_alloc(dev_t dev, char *name) if (!error && mddev->kobj.sd) { kobject_uevent(>kobj, KOBJ_ADD); mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); - mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); - 
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); mddev->sysfs_level = sysfs_get_dirent_safe(mddev->kobj.sd, "level"); } mddev_put(mddev); @@ -5991,6 +5992,8 @@ int md_run(struct mddev *mddev) pr_warn("md: cannot register extra attributes for %s\n", mdname(mddev)); mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); + mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed"); + mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded"); } else if (mddev->ro == 2) /* auto-readonly not meaningful */ mddev->ro = 0; -- 2.20.1 (Apple Git-117)
Re: [md] e1a86dbbbd: mdadm-selftests.benchmarks/mdadm-selftests/tests/07layouts.fail
Hi Song, I am working on setup an env to reproduce, will update soon. Thanks, Junxiao. On 8/2/20 10:52 PM, Song Liu wrote: On Jul 29, 2020, at 2:04 AM, kernel test robot wrote: Greeting, FYI, we noticed the following commit (built with gcc-9): commit: e1a86dbbbd6a77f73c3d099030495fa31f181e2f ("md: fix deadlock causing by sysfs_notify") https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git master in testcase: mdadm-selftests with following parameters: disk: 1HDD test_prefix: 07layout ucode: 0x21 on test machine: 4 threads Intel(R) Core(TM) i3-3220 CPU @ 3.30GHz with 4G memory caused below changes (please refer to attached dmesg/kmsg for entire log/backtrace): If you fix the issue, kindly add following tag Reported-by: kernel test robot 2020-07-29 01:06:34 mkdir -p /var/tmp 2020-07-29 01:06:34 mke2fs -t ext3 -b 4096 -J size=4 -q /dev/sda3 2020-07-29 01:07:36 mount -t ext3 /dev/sda3 /var/tmp sed -e 's/{DEFAULT_METADATA}/1.2/g' \ -e 's,{MAP_PATH},/run/mdadm/map,g' mdadm.8.in > mdadm.8 /usr/bin/install -D -m 644 mdadm.8 /usr/share/man/man8/mdadm.8 /usr/bin/install -D -m 644 mdmon.8 /usr/share/man/man8/mdmon.8 /usr/bin/install -D -m 644 md.4 /usr/share/man/man4/md.4 /usr/bin/install -D -m 644 mdadm.conf.5 /usr/share/man/man5/mdadm.conf.5 /usr/bin/install -D -m 644 udev-md-raid-creating.rules /lib/udev/rules.d/01-md-raid-creating.rules /usr/bin/install -D -m 644 udev-md-raid-arrays.rules /lib/udev/rules.d/63-md-raid-arrays.rules /usr/bin/install -D -m 644 udev-md-raid-assembly.rules /lib/udev/rules.d/64-md-raid-assembly.rules /usr/bin/install -D -m 644 udev-md-clustered-confirm-device.rules /lib/udev/rules.d/69-md-clustered-confirm-device.rules /usr/bin/install -D -m 755 mdadm /sbin/mdadm /usr/bin/install -D -m 755 mdmon /sbin/mdmon Testing on linux-5.8.0-rc4-00129-ge1a86dbbbd6a7 kernel /lkp/benchmarks/mdadm-selftests/tests/07layouts... 
FAILED - see /var/tmp/07layouts.log and /var/tmp/fail07layouts.log for details 07layouts TIMEOUT To reproduce: git clone https://github.com/intel/lkp-tests.git cd lkp-tests bin/lkp install job.yaml # job file is attached in this email bin/lkp run job.yaml Thanks, Rong Chen <07layouts.log> Hi Junxiao, Could you please look into this issue? Thanks, Song
[PATCH V2] md: fix deadlock causing by sysfs_notify
The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: 8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [b87c4df37260] __schedule at 9a8678ec #1 [b87c4df372f8] schedule at 9a867f06 #2 [b87c4df37310] io_schedule at 9a0c73e6 #3 [b87c4df37328] __dta___xfs_iunpin_wait_3443 at c03a4057 [xfs] #4 [b87c4df373a0] xfs_iunpin_wait at c03a6c79 [xfs] #5 [b87c4df373b0] __dta_xfs_reclaim_inode_3357 at c039a46c [xfs] #6 [b87c4df37400] xfs_reclaim_inodes_ag at c039a8b6 [xfs] #7 [b87c4df37590] xfs_reclaim_inodes_nr at c039bb33 [xfs] #8 [b87c4df375b0] xfs_fs_free_cached_objects at c03af0e9 [xfs] #9 [b87c4df375c0] super_cache_scan at 9a287ec7 #10 [b87c4df37618] shrink_slab at 9a1efd93 #11 [b87c4df37700] shrink_node at 9a1f5968 #12 [b87c4df37788] do_try_to_free_pages at 9a1f5ea2 #13 [b87c4df377f0] try_to_free_mem_cgroup_pages at 9a1f6445 #14 [b87c4df37880] try_charge at 9a26cc5f #15 [b87c4df37920] memcg_kmem_charge_memcg at 9a270f6a #16 [b87c4df37958] new_slab at 9a251430 #17 [b87c4df379c0] ___slab_alloc at 9a251c85 #18 [b87c4df37a80] __slab_alloc at 9a25635d #19 [b87c4df37ac0] kmem_cache_alloc at 9a251f89 #20 [b87c4df37b00] alloc_inode at 9a2a2b10 #21 [b87c4df37b20] iget_locked at 9a2a4854 #22 [b87c4df37b60] kernfs_get_inode at 9a311377 #23 [b87c4df37b80] kernfs_iop_lookup at 9a311e2b #24 [b87c4df37ba8] lookup_slow at 9a290118 #25 [b87c4df37c10] walk_component at 9a291e83 #26 [b87c4df37c78] path_lookupat at 9a293619 #27 [b87c4df37cd8] filename_lookup at 9a2953af #28 [b87c4df37de8] user_path_at_empty at 9a295566 #29 [b87c4df37e10] vfs_statx at 9a289787 #30 [b87c4df37e70] SYSC_newlstat at 9a289d5d #31 [b87c4df37f18] 
sys_newlstat at 9a28a60e #32 [b87c4df37f28] do_syscall_64 at 9a003949 #33 [b87c4df37f50] entry_SYSCALL_64_after_hwframe at 9aa001ad RIP: 7f617a5f2905 RSP: 7f607334f838 RFLAGS: 0246 RAX: ffda RBX: 7f6064044b20 RCX: 7f617a5f2905 RDX: 7f6064044b20 RSI: 7f6064044b20 RDI: 7f6064005890 RBP: 7f6064044aa0 R8: 0030 R9: 011c R10: 0013 R11: 0246 R12: 7f606417e6d0 R13: 7f6064044aa0 R14: 7f6064044b10 R15: ORIG_RAX: 0006 CS: 0033 SS: 002b PID: 927TASK: 8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [b87c4df07b28] __schedule at 9a8678ec #1 [b87c4df07bc0] schedule at 9a867f06 #2 [b87c4df07bd8] schedule_preempt_disabled at 9a86825e #3 [b87c4df07be8] __mutex_lock at 9a869bcc #4 [b87c4df07ca0] __mutex_lock_slowpath at 9a86a013 #5 [b87c4df07cb0] mutex_lock at 9a86a04f #6 [b87c4df07cc8] kernfs_find_and_get_ns at 9a311d83 #7 [b87c4df07cf0] sysfs_notify at 9a314b3a #8 [b87c4df07d18] md_update_sb at 9a688696 #9 [b87c4df07d98] md_update_sb at 9a6886d5 #10 [b87c4df07da8] md_check_recovery at 9a68ad9c #11 [b87c4df07dd0] raid1d at c01f0375 [raid1] #12 [b87c4df07ea0] md_thread at 9a680348 #13 [b87c4df07f08] kthread at 9a0b8005 #14 [b87c4df07f50] ret_from_fork at 9aa00344 Signed-off-by: Junxiao Bi --- v2 <- v1 - fix sysfs_notify for sysfs file 'level' to align styles with others. --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c| 44 -- drivers/md/md.h| 8 +++- drivers/md/raid10.c| 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 42 insertions(+), 20 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(>mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md.c b/
Re: [PATCH] md: fix deadlock causing by sysfs_notify
On 7/14/20 9:18 AM, Song Liu wrote: On Mon, Jul 13, 2020 at 11:41 PM Junxiao Bi wrote: On 7/13/20 11:17 PM, Song Liu wrote: On Thu, Jul 9, 2020 at 4:36 PM Junxiao Bi wrote: The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. [...] Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi Thanks for the patch. It looks good in general. One question though, do we need the same change the following line in md.c:level_store()? sysfs_notify(>kobj, NULL, "level"); Thanks for the review. This one is not in io path, looks it's safe. I can change it if you want to align it with others. This one is the only leftover. Let's also change it. Sure, i will send a v2. Thanks, Junxiao. Thanks, Song
Re: [PATCH] md: fix deadlock causing by sysfs_notify
On 7/13/20 11:17 PM, Song Liu wrote: On Thu, Jul 9, 2020 at 4:36 PM Junxiao Bi wrote: The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. [...] Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi Thanks for the patch. It looks good in general. One question though, do we need the same change the following line in md.c:level_store()? sysfs_notify(>kobj, NULL, "level"); Thanks for the review. This one is not in io path, looks it's safe. I can change it if you want to align it with others. Thanks, Junxiao. Thanks, Song [...]
Re: [PATCH] md: fix deadlock causing by sysfs_notify
Could anybody help take a look at this deadlock? Issue happened when raid_check was running, at that time, system memory was not enough, one process which was doing path lookup from sysfs triggered the direct memory reclaim, it was holding filesystem mutex 'kernfs_mutex' and hung by io. The io would be flushed from raid1d()->flush_pending_writes() by process 'md127_raid1', but it was hung by 'kernfs_mutex' in md_check_recovery()->md_update_sb() before flush_pending_writes(). Thanks, Junxiao. On 7/9/20 4:35 PM, Junxiao Bi wrote: The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: 8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [b87c4df37260] __schedule at 9a8678ec #1 [b87c4df372f8] schedule at 9a867f06 #2 [b87c4df37310] io_schedule at 9a0c73e6 #3 [b87c4df37328] __dta___xfs_iunpin_wait_3443 at c03a4057 [xfs] #4 [b87c4df373a0] xfs_iunpin_wait at c03a6c79 [xfs] #5 [b87c4df373b0] __dta_xfs_reclaim_inode_3357 at c039a46c [xfs] #6 [b87c4df37400] xfs_reclaim_inodes_ag at c039a8b6 [xfs] #7 [b87c4df37590] xfs_reclaim_inodes_nr at c039bb33 [xfs] #8 [b87c4df375b0] xfs_fs_free_cached_objects at c03af0e9 [xfs] #9 [b87c4df375c0] super_cache_scan at 9a287ec7 #10 [b87c4df37618] shrink_slab at 9a1efd93 #11 [b87c4df37700] shrink_node at 9a1f5968 #12 [b87c4df37788] do_try_to_free_pages at 9a1f5ea2 #13 [b87c4df377f0] try_to_free_mem_cgroup_pages at 9a1f6445 #14 [b87c4df37880] try_charge at 9a26cc5f #15 [b87c4df37920] memcg_kmem_charge_memcg at 9a270f6a #16 [b87c4df37958] new_slab at 9a251430 #17 [b87c4df379c0] ___slab_alloc at 9a251c85 #18 [b87c4df37a80] __slab_alloc at 9a25635d #19 [b87c4df37ac0] kmem_cache_alloc at 
9a251f89 #20 [b87c4df37b00] alloc_inode at 9a2a2b10 #21 [b87c4df37b20] iget_locked at 9a2a4854 #22 [b87c4df37b60] kernfs_get_inode at 9a311377 #23 [b87c4df37b80] kernfs_iop_lookup at 9a311e2b #24 [b87c4df37ba8] lookup_slow at 9a290118 #25 [b87c4df37c10] walk_component at 9a291e83 #26 [b87c4df37c78] path_lookupat at 9a293619 #27 [b87c4df37cd8] filename_lookup at 9a2953af #28 [b87c4df37de8] user_path_at_empty at 9a295566 #29 [b87c4df37e10] vfs_statx at 9a289787 #30 [b87c4df37e70] SYSC_newlstat at 9a289d5d #31 [b87c4df37f18] sys_newlstat at 9a28a60e #32 [b87c4df37f28] do_syscall_64 at 9a003949 #33 [b87c4df37f50] entry_SYSCALL_64_after_hwframe at 9aa001ad RIP: 7f617a5f2905 RSP: 7f607334f838 RFLAGS: 0246 RAX: ffda RBX: 7f6064044b20 RCX: 7f617a5f2905 RDX: 7f6064044b20 RSI: 7f6064044b20 RDI: 7f6064005890 RBP: 7f6064044aa0 R8: 0030 R9: 011c R10: 0013 R11: 0246 R12: 7f606417e6d0 R13: 7f6064044aa0 R14: 7f6064044b10 R15: ORIG_RAX: 0006 CS: 0033 SS: 002b PID: 927TASK: 8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [b87c4df07b28] __schedule at 9a8678ec #1 [b87c4df07bc0] schedule at 9a867f06 #2 [b87c4df07bd8] schedule_preempt_disabled at 9a86825e #3 [b87c4df07be8] __mutex_lock at 9a869bcc #4 [b87c4df07ca0] __mutex_lock_slowpath at 9a86a013 #5 [b87c4df07cb0] mutex_lock at 9a86a04f #6 [b87c4df07cc8] kernfs_find_and_get_ns at 9a311d83 #7 [b87c4df07cf0] sysfs_notify at 9a314b3a #8 [b87c4df07d18] md_update_sb at 9a688696 #9 [b87c4df07d98] md_update_sb at 9a6886d5 #10 [b87c4df07da8] md_check_recovery at 9a68ad9c #11 [b87c4df07dd0] raid1d at c01f0375 [raid1] #12 [b87c4df07ea0] md_thread at 9a680348 #13 [b87c4df07f08] kthread at 9a0b8005 #14 [b87c4df07f50] ret_from_fork at 9aa00344 Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c| 39 ++- drivers/md/md.h| 7 ++- drivers/md/raid10.c| 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/drivers/m
[PATCH] md: fix deadlock causing by sysfs_notify
The following deadlock was captured. The first process is holding 'kernfs_mutex' and hung by io. The io was staging in 'r1conf.pending_bio_list' of raid1 device, this pending bio list would be flushed by second process 'md127_raid1', but it was hung by 'kernfs_mutex'. Using sysfs_notify_dirent_safe() to replace sysfs_notify() can fix it. There were other sysfs_notify() invoked from io path, removed all of them. PID: 40430 TASK: 8ee9c8c65c40 CPU: 29 COMMAND: "probe_file" #0 [b87c4df37260] __schedule at 9a8678ec #1 [b87c4df372f8] schedule at 9a867f06 #2 [b87c4df37310] io_schedule at 9a0c73e6 #3 [b87c4df37328] __dta___xfs_iunpin_wait_3443 at c03a4057 [xfs] #4 [b87c4df373a0] xfs_iunpin_wait at c03a6c79 [xfs] #5 [b87c4df373b0] __dta_xfs_reclaim_inode_3357 at c039a46c [xfs] #6 [b87c4df37400] xfs_reclaim_inodes_ag at c039a8b6 [xfs] #7 [b87c4df37590] xfs_reclaim_inodes_nr at c039bb33 [xfs] #8 [b87c4df375b0] xfs_fs_free_cached_objects at c03af0e9 [xfs] #9 [b87c4df375c0] super_cache_scan at 9a287ec7 #10 [b87c4df37618] shrink_slab at 9a1efd93 #11 [b87c4df37700] shrink_node at 9a1f5968 #12 [b87c4df37788] do_try_to_free_pages at 9a1f5ea2 #13 [b87c4df377f0] try_to_free_mem_cgroup_pages at 9a1f6445 #14 [b87c4df37880] try_charge at 9a26cc5f #15 [b87c4df37920] memcg_kmem_charge_memcg at 9a270f6a #16 [b87c4df37958] new_slab at 9a251430 #17 [b87c4df379c0] ___slab_alloc at 9a251c85 #18 [b87c4df37a80] __slab_alloc at 9a25635d #19 [b87c4df37ac0] kmem_cache_alloc at 9a251f89 #20 [b87c4df37b00] alloc_inode at 9a2a2b10 #21 [b87c4df37b20] iget_locked at 9a2a4854 #22 [b87c4df37b60] kernfs_get_inode at 9a311377 #23 [b87c4df37b80] kernfs_iop_lookup at 9a311e2b #24 [b87c4df37ba8] lookup_slow at 9a290118 #25 [b87c4df37c10] walk_component at 9a291e83 #26 [b87c4df37c78] path_lookupat at 9a293619 #27 [b87c4df37cd8] filename_lookup at 9a2953af #28 [b87c4df37de8] user_path_at_empty at 9a295566 #29 [b87c4df37e10] vfs_statx at 9a289787 #30 [b87c4df37e70] SYSC_newlstat at 9a289d5d #31 [b87c4df37f18] 
sys_newlstat at 9a28a60e #32 [b87c4df37f28] do_syscall_64 at 9a003949 #33 [b87c4df37f50] entry_SYSCALL_64_after_hwframe at 9aa001ad RIP: 7f617a5f2905 RSP: 7f607334f838 RFLAGS: 0246 RAX: ffda RBX: 7f6064044b20 RCX: 7f617a5f2905 RDX: 7f6064044b20 RSI: 7f6064044b20 RDI: 7f6064005890 RBP: 7f6064044aa0 R8: 0030 R9: 011c R10: 0013 R11: 0246 R12: 7f606417e6d0 R13: 7f6064044aa0 R14: 7f6064044b10 R15: ORIG_RAX: 0006 CS: 0033 SS: 002b PID: 927TASK: 8f15ac5dbd80 CPU: 42 COMMAND: "md127_raid1" #0 [b87c4df07b28] __schedule at 9a8678ec #1 [b87c4df07bc0] schedule at 9a867f06 #2 [b87c4df07bd8] schedule_preempt_disabled at 9a86825e #3 [b87c4df07be8] __mutex_lock at 9a869bcc #4 [b87c4df07ca0] __mutex_lock_slowpath at 9a86a013 #5 [b87c4df07cb0] mutex_lock at 9a86a04f #6 [b87c4df07cc8] kernfs_find_and_get_ns at 9a311d83 #7 [b87c4df07cf0] sysfs_notify at 9a314b3a #8 [b87c4df07d18] md_update_sb at 9a688696 #9 [b87c4df07d98] md_update_sb at 9a6886d5 #10 [b87c4df07da8] md_check_recovery at 9a68ad9c #11 [b87c4df07dd0] raid1d at c01f0375 [raid1] #12 [b87c4df07ea0] md_thread at 9a680348 #13 [b87c4df07f08] kthread at 9a0b8005 #14 [b87c4df07f50] ret_from_fork at 9aa00344 Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi --- drivers/md/md-bitmap.c | 2 +- drivers/md/md.c| 39 ++- drivers/md/md.h| 7 ++- drivers/md/raid10.c| 2 +- drivers/md/raid5.c | 6 +++--- 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 95a5f3757fa3..d61b524ae440 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -1631,7 +1631,7 @@ void md_bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector, bool force) s += blocks; } bitmap->last_end_sync = jiffies; - sysfs_notify(>mddev->kobj, NULL, "sync_completed"); + sysfs_notify_dirent_safe(bitmap->mddev->sysfs_completed); } EXPORT_SYMBOL(md_bitmap_cond_end_sync); diff --git a/drivers/md/md.c b/drivers/md/md.c index f567f536b529..42a0b5ceaaec 100644 --- a/dri
Re: [PATCH 4.19 114/131] ocfs2: avoid inode removal while nfsd is accessing it
On 7/2/20 3:24 PM, Linus Torvalds wrote: On Thu, Jul 2, 2020 at 2:17 PM Pavel Machek wrote: commit 4cd9973f9ff69e37dd0ba2bd6e6423f8179c329a upstream. Patch series "ocfs2: fix nfsd over ocfs2 issues", v2. This causes locking imbalance: This sems to be true upstream too. When ocfs2_nfs_sync_lock() returns error, caller can not know if the lock was taken or not. Right you are. And your patch looks sane: diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index c141b06811a6..8149fb6f1f0d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2867,9 +2867,15 @@ int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex) status = ocfs2_cluster_lock(osb, lockres, ex ? LKM_EXMODE : LKM_PRMODE, 0, 0); - if (status < 0) + if (status < 0) { mlog(ML_ERROR, "lock on nfs sync lock failed %d\n", status); + if (ex) + up_write(>nfs_sync_rwlock); + else + up_read(>nfs_sync_rwlock); + } + return status; } although the whole thing looks messy. If the issue is a lifetime thing (like that commit says), the proper model isn't a lock, but a refcount. Oh well. Junxiao? There is a block number embedded in nfs file handle, to verify it's an inode, need acquire this nfs_sync_lock global lock to avoid any inode removed from local node and other nodes in the cluster, before this verify done, seemed no way to use a refcount. Thanks, Junxiao. Linus
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/22/20 5:47 PM, Matthew Wilcox wrote: On Sun, Jun 21, 2020 at 10:15:39PM -0700, Junxiao Bi wrote: On 6/20/20 9:27 AM, Matthew Wilcox wrote: On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote: Junxiao Bi writes: Still high lock contention. Collect the following hot path. A different location this time. I know of at least exit_signal and exit_notify that take thread wide locks, and it looks like exit_mm is another. Those don't use the same locks as flushing proc. So I think you are simply seeing a result of the thundering herd of threads shutting down at once. Given that thread shutdown is fundamentally a slow path there is only so much that can be done. If you are up for a project to working through this thundering herd I expect I can help some. It will be a long process of cleaning up the entire thread exit process with an eye to performance. Wengang had some tests which produced wall-clock values for this problem, which I agree is more informative. I'm not entirely sure what the customer workload is that requires a highly threaded workload to also shut down quickly. To my mind, an overall workload is normally composed of highly-threaded tasks that run for a long time and only shut down rarely (thus performance of shutdown is not important) and single-threaded tasks that run for a short time. The real workload is a Java application working in server-agent mode, issue happened in agent side, all it do is waiting works dispatching from server and execute. To execute one work, agent will start lots of short live threads, there could be a lot of threads exit same time if there were a lots of work to execute, the contention on the exit path caused a high %sys time which impacted other workload. How about this for a micro? Executes in about ten seconds on my laptop. You might need to tweak it a bit to get better timing on a server. 
// gcc -pthread -O2 -g -W -Wall #include #include void *worker(void *arg) { int i = 0; int *p = arg; for (;;) { while (i < 1000 * 1000) { i += *p; } sleep(1); } } int main(int argc, char **argv) { pthread_t threads[20][100]; Tuning 100 to 1000 here and the following 2 loops. Test it on 2-socket server with 104 cpu. Perf is similar on v5.7 and v5.7 with Eric's fix. The spin lock was shifted to spin lock in futex, so the fix didn't help. 46.41% 0.11% perf_test [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe | --46.30%--entry_SYSCALL_64_after_hwframe | --46.12%--do_syscall_64 | |--30.47%--__x64_sys_futex | | | --30.45%--do_futex | | | |--18.04%--futex_wait | | | | | |--16.94%--futex_wait_setup | | | | | | | --16.61%--_raw_spin_lock | | | | | | | --16.30%--native_queued_spin_lock_slowpath | | | | | | | --0.81%--call_function_interrupt | | | | | | | --0.79%--smp_call_function_interrupt | | | | | | | --0.62%--generic_smp_call_function_single_interrupt | | | | | --1.04%--futex_wait_queue_me | | | | | --0.96%--schedule | | | | | --0.94%--__schedule | | | | | --0.51%--pick_next_task_fair | | | --12.38%--futex_wake | | | |--11
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/22/20 8:20 AM, ebied...@xmission.com wrote: If I understand correctly, the Java VM is not exiting. Just some of its threads. That is a very different problem to deal with. There are many optimizations that are possible when _all_ of the threads are exiting that are not possible when _many_ threads are exiting. Do you know if it is simply the cpu time or if it is the lock contention that is the problem? If it is simply the cpu time we should consider if some of the locks that can be highly contended should become mutexes. Or perhaps something like Matthew's cpu pinning idea. The problem is high %sys time. Thanks, Junxiao.
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/20/20 9:27 AM, Matthew Wilcox wrote: On Fri, Jun 19, 2020 at 05:42:45PM -0500, Eric W. Biederman wrote: Junxiao Bi writes: Still high lock contention. Collect the following hot path. A different location this time. I know of at least exit_signal and exit_notify that take thread wide locks, and it looks like exit_mm is another. Those don't use the same locks as flushing proc. So I think you are simply seeing a result of the thundering herd of threads shutting down at once. Given that thread shutdown is fundamentally a slow path there is only so much that can be done. If you are up for a project to working through this thundering herd I expect I can help some. It will be a long process of cleaning up the entire thread exit process with an eye to performance. Wengang had some tests which produced wall-clock values for this problem, which I agree is more informative. I'm not entirely sure what the customer workload is that requires a highly threaded workload to also shut down quickly. To my mind, an overall workload is normally composed of highly-threaded tasks that run for a long time and only shut down rarely (thus performance of shutdown is not important) and single-threaded tasks that run for a short time. The real workload is a Java application working in server-agent mode, issue happened in agent side, all it do is waiting works dispatching from server and execute. To execute one work, agent will start lots of short live threads, there could be a lot of threads exit same time if there were a lots of work to execute, the contention on the exit path caused a high %sys time which impacted other workload. Thanks, Junxiao. Understanding this workload is important to my next suggestion, which is that rather than searching for all the places in the exit path which contend on a single spinlock, we simply set the allowed CPUs for an exiting task to include only the CPU that this thread is running on. 
It will probably run faster to take the threads down in series on one CPU rather than take them down in parallel across many CPUs (or am I mistaken? Is there inherently a lot of parallelism in the thread exiting process?)
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
On 6/19/20 10:24 AM, ebied...@xmission.com wrote: Junxiao Bi writes: Hi Eric, The patch didn't improve lock contention. Which raises the question where is the lock contention coming from. Especially with my first variant. Only the last thread to be reaped would free up anything in the cache. Can you comment out the call to proc_flush_pid entirely? Still high lock contention. Collect the following hot path. 74.90% 0.01% proc_race [kernel.kallsyms] [k] entry_SYSCALL_64_after_hwframe | --74.89%--entry_SYSCALL_64_after_hwframe | --74.88%--do_syscall_64 | |--69.70%--exit_to_usermode_loop | | | --69.70%--do_signal | | | --69.69%--get_signal | | | |--56.30%--do_group_exit | | | | | --56.30%--do_exit | | | | | |--27.50%--_raw_write_lock_irq | | | | | | | --27.47%--queued_write_lock_slowpath | | | | | | | --27.18%--native_queued_spin_lock_slowpath | | | | | |--26.10%--release_task.part.20 | | | | | | | --25.60%--_raw_write_lock_irq | | | | | | | --25.56%--queued_write_lock_slowpath | | | | | | | --25.23%--native_queued_spin_lock_slowpath | | | | | --0.56%--mmput | | | | | --0.55%--exit_mmap | | | --13.31%--_raw_spin_lock_irq | | | --13.28%--native_queued_spin_lock_slowpath | Thanks, Junxiao. That will rule out the proc_flush_pid in d_invalidate entirely. The only candidate I can think of d_invalidate aka (proc_flush_pid) vs ps. Eric
Re: [PATCH] proc: Avoid a thundering herd of threads freeing proc dentries
Hi Eric, The patch didn't improve lock contention. PerfTop: 48925 irqs/sec kernel:95.6% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 104 CPUs) --- 69.66% [kernel] [k] native_queued_spin_lock_slowpath 1.93% [kernel] [k] _raw_spin_lock 1.24% [kernel] [k] page_counter_cancel 0.70% [kernel] [k] do_syscall_64 0.62% [kernel] [k] find_idlest_group.isra.96 0.57% [kernel] [k] queued_write_lock_slowpath 0.56% [kernel] [k] d_walk 0.45% [kernel] [k] clear_page_erms 0.44% [kernel] [k] syscall_return_via_sysret 0.40% [kernel] [k] entry_SYSCALL_64 0.38% [kernel] [k] refcount_dec_not_one 0.37% [kernel] [k] propagate_protected_usage 0.33% [kernel] [k] unmap_page_range 0.33% [kernel] [k] select_collect 0.32% [kernel] [k] memcpy_erms 0.30% [kernel] [k] proc_task_readdir 0.27% [kernel] [k] _raw_spin_lock_irqsave Thanks, Junxiao. On 6/19/20 7:09 AM, ebied...@xmission.com wrote: Junxiao Bi reported: When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Matthew Wilcox reported: We've looked at a few different ways of fixing this problem. The flushing of the proc dentries from the dcache is an optmization, and is not necessary for correctness. Eventually cache pressure will cause the dentries to be freed even if no flushing happens. Some light testing when I refactored the proc flushg[1] indicated that at least the memory footprint is easily measurable. An optimization that causes a performance problem due to a thundering herd of threads is no real optimization. Modify the code to only flush the /proc// directory when all threads in a process are killed at once. This continues to flush practically everything when the process is reaped as the threads live under /proc//task/. 
There is a rare possibility that a debugger will access /proc//, which this change will no longer flush, but I believe such accesses are sufficiently rare to not be observed in practice. [1] 7bc3e6e55acf ("proc: Use a list of inodes to flush from proc") Link: https://lkml.kernel.org/r/54091fc0-ca46-2186-97a8-d1f3c4f38...@oracle.com Reported-by: Masahiro Yamada Reported-by: Matthew Wilcox Signed-off-by: "Eric W. Biederman" --- I am still waiting for word on how this affects performance, but this is a clean version that should avoid the thundering herd problem in general. kernel/exit.c | 19 +++ 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index cebae77a9664..567354550d62 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -151,8 +151,8 @@ void put_task_struct_rcu_user(struct task_struct *task) void release_task(struct task_struct *p) { + struct pid *flush_pid = NULL; struct task_struct *leader; - struct pid *thread_pid; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and @@ -165,7 +165,16 @@ void release_task(struct task_struct *p) write_lock_irq(_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); + + /* +* When all of the threads are exiting wait until the end +* and flush everything. +*/ + if (thread_group_leader(p)) + flush_pid = get_pid(task_tgid(p)); + else if (!(p->signal->flags & SIGNAL_GROUP_EXIT)) + flush_pid = get_pid(task_pid(p)); + __exit_signal(p); /* @@ -188,8 +197,10 @@ void release_task(struct task_struct *p) } write_unlock_irq(_lock); - proc_flush_pid(thread_pid); - put_pid(thread_pid); + if (flush_pid) { + proc_flush_pid(flush_pid); + put_pid(flush_pid); + } release_thread(p); put_task_struct_rcu_user(p);
Re: severe proc dentry lock contention
On 6/18/20 5:02 PM, ebied...@xmission.com wrote: Matthew Wilcox writes: On Thu, Jun 18, 2020 at 03:17:33PM -0700, Junxiao Bi wrote: When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Check the following standalone test case that simulated the case and perf top result on v5.7 kernel. Any idea on how to fix this? Thanks, Junxiao. We've looked at a few different ways of fixing this problem. Even though the contention is within the dcache, it seems like a usecase that the dcache shouldn't be optimised for -- generally we do not have hundreds of CPUs removing dentries from a single directory in parallel. We could fix this within procfs. We don't have a great patch yet, but the current approach we're looking at allows only one thread at a time to call dput() on any /proc/*/task directory. We could also look at fixing this within the scheduler. Only allowing one CPU to run the threads of an exiting process would fix this particular problem, but might have other consequences. I was hoping that 7bc3e6e55acf would fix this, but that patch is in 5.7, so that hope is ruled out. Does anyone know if problem new in v5.7? I am wondering if I introduced this problem when I refactored the code or if I simply churned the code but the issue remains effectively the same. It's not new issue, we see it in old kernel like v4.14 Can you try only flushing entries when the last thread of the process is reaped? I think in practice we would want to be a little more sophisticated but it is a good test case to see if it solves the issue. Thank you. i will try and let you know. Thanks, Junxiao. 
diff --git a/kernel/exit.c b/kernel/exit.c index cebae77a9664..d56e4eb60bdd 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -152,7 +152,7 @@ void put_task_struct_rcu_user(struct task_struct *task) void release_task(struct task_struct *p) { struct task_struct *leader; - struct pid *thread_pid; + struct pid *thread_pid = NULL; int zap_leader; repeat: /* don't need to get the RCU readlock here - the process is dead and @@ -165,7 +165,8 @@ void release_task(struct task_struct *p) write_lock_irq(_lock); ptrace_release_task(p); - thread_pid = get_pid(p->thread_pid); + if (p == p->group_leader) + thread_pid = get_pid(p->thread_pid); __exit_signal(p); /* @@ -188,8 +189,10 @@ void release_task(struct task_struct *p) } write_unlock_irq(_lock); - proc_flush_pid(thread_pid); - put_pid(thread_pid); + if (thread_pid) { + proc_flush_pid(thread_pid); + put_pid(thread_pid); + } release_thread(p); put_task_struct_rcu_user(p);
severe proc dentry lock contention
Hi, When debugging some performance issue, i found that thousands of threads exit around same time could cause a severe spin lock contention on proc dentry "/proc/$parent_process_pid/task/", that's because threads needs to clean up their pid file from that dir when exit. Check the following standalone test case that simulated the case and perf top result on v5.7 kernel. Any idea on how to fix this? PerfTop: 48891 irqs/sec kernel:95.6% exact: 100.0% lost: 0/0 drop: 0/0 [4000Hz cycles], (all, 72 CPUs) --- 66.10% [kernel] [k] native_queued_spin_lock_slowpath 1.13% [kernel] [k] _raw_spin_lock 0.84% [kernel] [k] clear_page_erms 0.82% [kernel] [k] queued_write_lock_slowpath 0.64% [kernel] [k] proc_task_readdir 0.61% [kernel] [k] find_idlest_group.isra.95 0.61% [kernel] [k] syscall_return_via_sysret 0.55% [kernel] [k] entry_SYSCALL_64 0.49% [kernel] [k] memcpy_erms 0.46% [kernel] [k] update_cfs_group 0.41% [kernel] [k] get_pid_task 0.39% [kernel] [k] _raw_spin_lock_irqsave 0.37% [kernel] [k] __list_del_entry_valid 0.34% [kernel] [k] get_page_from_freelist 0.34% [kernel] [k] __d_lookup 0.32% [kernel] [k] update_load_avg 0.31% libc-2.17.so [.] get_next_seq 0.27% [kernel] [k] avc_has_perm_noaudit 0.26% [kernel] [k] __sched_text_start 0.25% [kernel] [k] selinux_inode_permission 0.25% [kernel] [k] __slab_free 0.24% [kernel] [k] detach_entity_cfs_rq 0.23% [kernel] [k] zap_pte_range 0.22% [kernel] [k] _find_next_bit.constprop.1 0.22% libc-2.17.so [.] vfprintf 0.20% libc-2.17.so [.] _int_malloc 0.19% [kernel] [k] _raw_spin_lock_irq 0.18% [kernel] [k] rb_erase 0.18% [kernel] [k] pid_revalidate 0.18% [kernel] [k] lockref_get_not_dead 0.18% [kernel] [k] __alloc_pages_nodemask 0.17% [kernel] [k] set_task_cpu 0.17% libc-2.17.so [.] __strcoll_l 0.17% [kernel] [k] do_syscall_64 0.17% [kernel] [k] __vmalloc_node_range 0.17% libc-2.17.so [.] 
_IO_vfscanf 0.17% [kernel] [k] refcount_dec_not_one 0.15% [kernel] [k] __task_pid_nr_ns 0.15% [kernel] [k] native_irq_return_iret 0.15% [kernel] [k] free_pcppages_bulk 0.14% [kernel] [k] kmem_cache_alloc 0.14% [kernel] [k] link_path_walk 0.14% libc-2.17.so [.] _int_free 0.14% [kernel] [k] __update_load_avg_cfs_rq 0.14% perf.5.7.0-master.20200601.ol7.x86_64 [.] 0x000eac29 0.13% [kernel] [k] kmem_cache_free 0.13% [kernel] [k] number 0.13% [kernel] [k] memset_erms 0.12% [kernel] [k] proc_pid_status 0.12% [kernel] [k] __d_lookup_rcu === runme.sh == #!/bin/bash threads=${1:-1} prog=proc_race while [ 1 ]; do ./$prog $threads; done & while [ 1 ]; do pid=`ps aux | grep $prog | grep -v grep| awk '{print $2}'` if [ -z $pid ]; then continue; fi threadnum=`ls -l /proc/$pid/task | wc -l` if [ $threadnum -gt $threads ]; then echo kill $pid kill -9 $pid fi done ===proc_race.c= #include #include #include #include #include #include #include #define handle_error_en(en, msg) \ do { errno = en; perror(msg); exit(EXIT_FAILURE); } while (0) #define
Re: [PATCH] block: fix RO partition with RW disk
Anybody could help review this bug? thanks, Junxiao. On 8/5/19 1:01 PM, Junxiao Bi wrote: When md raid1 was used with imsm metadata, during the boot stage, the raid device will first be set to readonly, then mdmon will set it read-write later. When there were some partitions in this device, the following race would make some partition left ro and fail to mount. CPU 1: CPU 2: add_partition()set_disk_ro() //set disk RW //disk was RO, so partition set to RO p->policy = get_disk_ro(disk); if (disk->part0.policy != flag) { set_disk_ro_uevent(disk, flag); // disk set to RW disk->part0.policy = flag; } // set all exit partition to RW while ((part = disk_part_iter_next())) part->policy = flag; // this part was not yet added, so it was still RO rcu_assign_pointer(ptbl->part[partno], p); Move RO status setting of partitions after they were added into partition table and introduce a mutex to sync RO status between disk and partitions. Signed-off-by: Junxiao Bi --- block/genhd.c | 3 +++ block/partition-generic.c | 5 - include/linux/genhd.h | 1 + 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 54f1f0d381f4..f3cce1d354cf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1479,6 +1479,7 @@ struct gendisk *__alloc_disk_node(int minors, int node_id) } ptbl = rcu_dereference_protected(disk->part_tbl, 1); rcu_assign_pointer(ptbl->part[0], >part0); + mutex_init(>part_lock); /* * set_capacity() and get_capacity() currently don't use @@ -1570,6 +1571,7 @@ void set_disk_ro(struct gendisk *disk, int flag) struct disk_part_iter piter; struct hd_struct *part; + mutex_lock(>part_lock); if (disk->part0.policy != flag) { set_disk_ro_uevent(disk, flag); disk->part0.policy = flag; @@ -1579,6 +1581,7 @@ void set_disk_ro(struct gendisk *disk, int flag) while ((part = disk_part_iter_next())) part->policy = flag; disk_part_iter_exit(); + mutex_unlock(>part_lock); } EXPORT_SYMBOL(set_disk_ro); diff --git a/block/partition-generic.c 
b/block/partition-generic.c index aee643ce13d1..63cb6fb996ff 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -345,7 +345,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, queue_limit_discard_alignment(>queue->limits, start); p->nr_sects = len; p->partno = partno; - p->policy = get_disk_ro(disk); if (info) { struct partition_meta_info *pinfo = alloc_part_info(disk); @@ -401,6 +400,10 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ rcu_assign_pointer(ptbl->part[partno], p); + mutex_lock(>part_lock); + p->policy = get_disk_ro(disk); + mutex_unlock(>part_lock); + /* suppress uevent if the disk suppresses it */ if (!dev_get_uevent_suppress(ddev)) kobject_uevent(>kobj, KOBJ_ADD); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 8b5330dd5ac0..df6ddca8a92c 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -201,6 +201,7 @@ struct gendisk { */ struct disk_part_tbl __rcu *part_tbl; struct hd_struct part0; + struct mutex part_lock; const struct block_device_operations *fops; struct request_queue *queue;
[PATCH RESEND] scsi: megaraid_sas: fix panic on loading firmware crashdump
While loading the FW crashdump in fw_crash_buffer_show(), the number of bytes left in one DMA chunk was not checked; if the copy size exceeds it, the resulting out-of-bounds access will cause a kernel panic. Signed-off-by: Junxiao Bi --- drivers/scsi/megaraid/megaraid_sas_base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 80ab9700f1de..3eef0858fa8e 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -3153,6 +3153,7 @@ fw_crash_buffer_show(struct device *cdev, (struct megasas_instance *) shost->hostdata; u32 size; unsigned long dmachunk = CRASH_DMA_BUF_SIZE; + unsigned long chunk_left_bytes; unsigned long src_addr; unsigned long flags; u32 buff_offset; @@ -3176,6 +3177,8 @@ fw_crash_buffer_show(struct device *cdev, } size = (instance->fw_crash_buffer_size * dmachunk) - buff_offset; + chunk_left_bytes = dmachunk - (buff_offset % dmachunk); + size = (size > chunk_left_bytes) ? chunk_left_bytes : size; size = (size >= PAGE_SIZE) ? (PAGE_SIZE - 1) : size; src_addr = (unsigned long)instance->crash_buf[buff_offset / dmachunk] + -- 2.17.1
Re: [PATCH v3 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/17/2017 02:30 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > precess already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi <jiangqi...@gmail.com> > and Junxiao Bi <junxiao...@oracle.com>. > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Do not inline functions whose bodies are not in scope, changed by: > Stephen Rothwell <s...@canb.auug.org.au>. > > Changes since v2: > - Wrap the tracking logic code of recursive locking into functions, > ocfs2_inode_lock_tracker() and ocfs2_inode_unlock_tracker(), > suggested by: Junxiao Bi. 
> > [s...@canb.auug.org.au remove some inlines] > Signed-off-by: Eric Ren <z...@suse.com> Reviewed-by: Junxiao Bi <junxiao...@oracle.com> > --- > fs/ocfs2/dlmglue.c | 105 > +++-- > fs/ocfs2/dlmglue.h | 18 + > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 121 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 77d1632..c75b9e9 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +/* > + * Keep a list of processes who have interest in a lockres. > + * Note: this is now only uesed for check recursive cluster locking. > + */ > +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_lock_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); > + > + spin_lock(>l_lock); > + list_add_tail(>oh_
Re: [PATCH v3 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/17/2017 02:30 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > precess already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi > and Junxiao Bi . > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Do not inline functions whose bodies are not in scope, changed by: > Stephen Rothwell . > > Changes since v2: > - Wrap the tracking logic code of recursive locking into functions, > ocfs2_inode_lock_tracker() and ocfs2_inode_unlock_tracker(), > suggested by: Junxiao Bi. > > [s...@canb.auug.org.au remove some inlines] > Signed-off-by: Eric Ren Reviewed-by: Junxiao Bi > --- > fs/ocfs2/dlmglue.c | 105 > +++-- > fs/ocfs2/dlmglue.h | 18 + > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 121 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 77d1632..c75b9e9 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,50 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +/* > + * Keep a list of processes who have interest in a lockres. 
> + * Note: this is now only uesed for check recursive cluster locking. > + */ > +static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_lock_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); > + > + spin_lock(>l_lock); > + list_add_tail(>oh_list, >l_holders); > + spin_unlock(>l_lock); > +} > + > +static inline void ocfs2_remove_holder(struct ocfs2_lock_res *l
Re: [PATCH v3 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/17/2017 02:30 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi <jiangqi...@gmail.com> > and Junxiao Bi <junxiao...@oracle.com>. > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi. > > Changes since v2: > - Use new wrappers of tracking logic code, suggested by: Junxiao Bi. 
> > Signed-off-by: Eric Ren <z...@suse.com> Reviewed-by: Junxiao Bi <junxiao...@oracle.com> > --- > fs/ocfs2/acl.c | 29 + > fs/ocfs2/file.c | 58 > - > 2 files changed, 58 insertions(+), 29 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..dc22ba8 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, > int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > - int status = 0; > + int status, had_lock; > + struct ocfs2_lock_holder oh; > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) { > - if (status != -ENOENT) > - mlog_errno(status); > - return status; > - } > + had_lock = ocfs2_inode_lock_tracker(inode, , 1, ); > + if (had_lock < 0) > + return had_lock; > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + ocfs2_inode_unlock_tracker(inode, 1, , had_lock); > brelse(bh); > return status; > } > @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct ocfs2_super *osb; > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > - int ret; > + int had_lock; > + struct ocfs2_lock_holder oh; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > - if (ret < 0) { > - if (ret != -ENOENT) > - mlog_errno(ret); > - return ERR_PTR(ret); > - } > + > + had_lock = ocfs2_inode_lock_tracker(inode, _bh, 0, ); > + if (had_lock < 0) > + return ERR_PTR(had_lock); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + ocfs2_inode_unlock_tracker(inode, 0, , had_lock); > brelse(di_bh); > return acl; > } > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..7b6a146 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > 
*attr) > handle_t *handle = NULL; > struct dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int had_lock; > + struct ocfs2_lock_holder oh; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > }
Re: [PATCH v3 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/17/2017 02:30 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > - Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi > and Junxiao Bi . > > - Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi. > > - Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi. > > Changes since v2: > - Use new wrappers of tracking logic code, suggested by: Junxiao Bi. 
> > Signed-off-by: Eric Ren Reviewed-by: Junxiao Bi > --- > fs/ocfs2/acl.c | 29 + > fs/ocfs2/file.c | 58 > - > 2 files changed, 58 insertions(+), 29 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..dc22ba8 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -283,16 +283,14 @@ int ocfs2_set_acl(handle_t *handle, > int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > - int status = 0; > + int status, had_lock; > + struct ocfs2_lock_holder oh; > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) { > - if (status != -ENOENT) > - mlog_errno(status); > - return status; > - } > + had_lock = ocfs2_inode_lock_tracker(inode, , 1, ); > + if (had_lock < 0) > + return had_lock; > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + ocfs2_inode_unlock_tracker(inode, 1, , had_lock); > brelse(bh); > return status; > } > @@ -302,21 +300,20 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct ocfs2_super *osb; > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > - int ret; > + int had_lock; > + struct ocfs2_lock_holder oh; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > - if (ret < 0) { > - if (ret != -ENOENT) > - mlog_errno(ret); > - return ERR_PTR(ret); > - } > + > + had_lock = ocfs2_inode_lock_tracker(inode, _bh, 0, ); > + if (had_lock < 0) > + return ERR_PTR(had_lock); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + ocfs2_inode_unlock_tracker(inode, 0, , had_lock); > brelse(di_bh); > return acl; > } > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..7b6a146 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > handle_t *handle = NULL; > struct 
dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int had_lock; > + struct ocfs2_lock_holder oh; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,11 +1175,30 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > } > } > > - status = ocfs2_inode_lock(inode, , 1); > - if (status < 0) {
Re: [PATCH v2 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/16/2017 02:42 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > 1. Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi <jiangqi...@gmail.com> > and Junxiao Bi <junxiao...@oracle.com>. > > 2. Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi <junxiao...@oracle.com>. > > 3. Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi <junxiao...@oracle.com>. 
> > Signed-off-by: Eric Ren <z...@suse.com> > --- > fs/ocfs2/acl.c | 39 + > fs/ocfs2/file.c | 76 > + > 2 files changed, 100 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..3e47262 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + Same code pattern showed here and *get_acl, can it be abstracted to one function? The same issue for *setattr and *permission. Sorry for not mention that in last review. Thanks, Junxiao. 
> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0);
Re: [PATCH v2 2/2] ocfs2: fix deadlock issue when taking inode lock at vfs entry points
On 01/16/2017 02:42 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). > > Changes since v1: > 1. Let ocfs2_is_locked_by_me() just return true/false to indicate if the > process gets the cluster lock - suggested by: Joseph Qi > and Junxiao Bi . > > 2. Change "struct ocfs2_holder" to a more meaningful name "ocfs2_lock_holder", > suggested by: Junxiao Bi . > > 3. Add debugging output at ocfs2_setattr() and ocfs2_permission() to > catch exceptional cases, suggested by: Junxiao Bi . 
> > Signed-off-by: Eric Ren > --- > fs/ocfs2/acl.c | 39 + > fs/ocfs2/file.c | 76 > + > 2 files changed, 100 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..3e47262 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + Same code pattern showed here and *get_acl, can it be abstracted to one function? The same issue for *setattr and *permission. Sorry for not mention that in last review. Thanks, Junxiao. 
> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_lock_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = ocfs2_is_locked_by_me(lockres); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0); > + } > brelse(di_bh); > + > return acl; > } > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > ind
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/16/2017 11:06 AM, Eric Ren wrote: > Hi Junxiao, > > On 01/16/2017 10:46 AM, Junxiao Bi wrote: >>>> If had_lock==true, it is a bug? I think we should BUG_ON for it, that >>>> can help us catch bug at the first time. >>> Good idea! But I'm not sure if "ocfs2_setattr" is always the first one >>> who takes the cluster lock. >>> It's harder for me to name all the possible paths;-/ >> The BUG_ON() can help catch the path where ocfs2_setattr is not the >> first one. > Yes, I understand. But, the problem is that the vfs entries calling > order is out of our control. > I don't want to place an assertion where I'm not 100% sure it's > absolutely right;-) If it is not the first one, is it another recursive locking bug? In this case, if you don't like BUG_ON(), you can dump the call trace and print some warning message. Thanks, Junxiao. > > Thanks, > Eric > >> >> Thanks, >> Junxiao. >> >>>> >>>>> +if (had_lock) >>>>> +arg_flags = OCFS2_META_LOCK_GETBH; >>>>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>>>>if (status < 0) { >>>>>if (status != -ENOENT) >>>>>mlog_errno(status); >>>>>goto bail_unlock_rw; >>>>>} >>>>> -inode_locked = 1; >>>>> +if (!had_lock) { >>>>> +ocfs2_add_holder(lockres, ); >>>>> +inode_locked = 1; >>>>> +} >>>>> if (size_change) { >>>>>status = inode_newsize_ok(inode, attr->ia_size); >>>>> @@ -1260,7 +1270,8 @@ int ocfs2_setattr(struct dentry *dentry, struct >>>>> iattr *attr) >>>>>bail_commit: >>>>>ocfs2_commit_trans(osb, handle); >>>>>bail_unlock: >>>>> -if (status) { >>>>> +if (status && inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>>inode_locked = 0; >>>>>} >>>>> @@ -1278,8 +1289,10 @@ int ocfs2_setattr(struct dentry *dentry, >>>>> struct iattr *attr) >>>>>if (status < 0) >>>>>mlog_errno(status); >>>>>} >>>>> -if (inode_locked) >>>>> +if (inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>> +} >>>>> brelse(bh); >>>>>return status; >>>>> @@ 
-1321,20 +1334,31 @@ int ocfs2_getattr(struct vfsmount *mnt, >>>>>int ocfs2_permission(struct inode *inode, int mask) >>>>>{ >>>>>int ret; >>>>> +int has_locked; >>>>> +struct ocfs2_holder oh; >>>>> +struct ocfs2_lock_res *lockres; >>>>> if (mask & MAY_NOT_BLOCK) >>>>>return -ECHILD; >>>>>-ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> -if (ret) { >>>>> -if (ret != -ENOENT) >>>>> -mlog_errno(ret); >>>>> -goto out; >>>>> +lockres = _I(inode)->ip_inode_lockres; >>>>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>>> The same thing as ocfs2_setattr. >>> OK. I will think over your suggestions! >>> >>> Thanks, >>> Eric >>> >>>> Thanks, >>>> Junxiao. >>>>> +if (!has_locked) { >>>>> +ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> +if (ret) { >>>>> +if (ret != -ENOENT) >>>>> +mlog_errno(ret); >>>>> +goto out; >>>>> +} >>>>> +ocfs2_add_holder(lockres, ); >>>>>} >>>>> ret = generic_permission(inode, mask); >>>>>-ocfs2_inode_unlock(inode, 0); >>>>> +if (!has_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>> +ocfs2_inode_unlock(inode, 0); >>>>> +} >>>>>out: >>>>>return ret; >>>>>} >>>>> >> >
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/16/2017 11:06 AM, Eric Ren wrote: > Hi Junxiao, > > On 01/16/2017 10:46 AM, Junxiao Bi wrote: >>>> If had_lock==true, it is a bug? I think we should BUG_ON for it, that >>>> can help us catch bug at the first time. >>> Good idea! But I'm not sure if "ocfs2_setattr" is always the first one >>> who takes the cluster lock. >>> It's harder for me to name all the possible paths;-/ >> The BUG_ON() can help catch the path where ocfs2_setattr is not the >> first one. > Yes, I understand. But, the problem is that the vfs entries calling > order is out of our control. > I don't want to place an assertion where I'm not 100% sure it's > absolutely right;-) If it is not the first one, is it another recursive locking bug? In this case, if you don't like BUG_ON(), you can dump the call trace and print some warning message. Thanks, Junxiao. > > Thanks, > Eric > >> >> Thanks, >> Junxiao. >> >>>> >>>>> +if (had_lock) >>>>> +arg_flags = OCFS2_META_LOCK_GETBH; >>>>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>>>>if (status < 0) { >>>>>if (status != -ENOENT) >>>>>mlog_errno(status); >>>>>goto bail_unlock_rw; >>>>>} >>>>> -inode_locked = 1; >>>>> +if (!had_lock) { >>>>> +ocfs2_add_holder(lockres, ); >>>>> +inode_locked = 1; >>>>> +} >>>>> if (size_change) { >>>>>status = inode_newsize_ok(inode, attr->ia_size); >>>>> @@ -1260,7 +1270,8 @@ int ocfs2_setattr(struct dentry *dentry, struct >>>>> iattr *attr) >>>>>bail_commit: >>>>>ocfs2_commit_trans(osb, handle); >>>>>bail_unlock: >>>>> -if (status) { >>>>> +if (status && inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>>inode_locked = 0; >>>>>} >>>>> @@ -1278,8 +1289,10 @@ int ocfs2_setattr(struct dentry *dentry, >>>>> struct iattr *attr) >>>>>if (status < 0) >>>>>mlog_errno(status); >>>>>} >>>>> -if (inode_locked) >>>>> +if (inode_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>>ocfs2_inode_unlock(inode, 1); >>>>> +} >>>>> brelse(bh); >>>>>return status; >>>>> @@ 
-1321,20 +1334,31 @@ int ocfs2_getattr(struct vfsmount *mnt, >>>>>int ocfs2_permission(struct inode *inode, int mask) >>>>>{ >>>>>int ret; >>>>> +int has_locked; >>>>> +struct ocfs2_holder oh; >>>>> +struct ocfs2_lock_res *lockres; >>>>> if (mask & MAY_NOT_BLOCK) >>>>>return -ECHILD; >>>>>-ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> -if (ret) { >>>>> -if (ret != -ENOENT) >>>>> -mlog_errno(ret); >>>>> -goto out; >>>>> +lockres = _I(inode)->ip_inode_lockres; >>>>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>>> The same thing as ocfs2_setattr. >>> OK. I will think over your suggestions! >>> >>> Thanks, >>> Eric >>> >>>> Thanks, >>>> Junxiao. >>>>> +if (!has_locked) { >>>>> +ret = ocfs2_inode_lock(inode, NULL, 0); >>>>> +if (ret) { >>>>> +if (ret != -ENOENT) >>>>> +mlog_errno(ret); >>>>> +goto out; >>>>> +} >>>>> +ocfs2_add_holder(lockres, ); >>>>>} >>>>> ret = generic_permission(inode, mask); >>>>>-ocfs2_inode_unlock(inode, 0); >>>>> +if (!has_locked) { >>>>> +ocfs2_remove_holder(lockres, ); >>>>> +ocfs2_inode_unlock(inode, 0); >>>>> +} >>>>>out: >>>>>return ret; >>>>>} >>>>> >> >
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/13/2017 02:19 PM, Eric Ren wrote: > Hi! > > On 01/13/2017 12:22 PM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> Commit 743b5f1434f5 ("ocfs2: take inode lock in >>> ocfs2_iop_set/get_acl()") >>> results in a deadlock, as the author "Tariq Saeed" realized shortly >>> after the patch was merged. The discussion happened here >>> (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). >>> >>> >>> The reason why taking cluster inode lock at vfs entry points opens up >>> a self deadlock window, is explained in the previous patch of this >>> series. >>> >>> So far, we have seen two different code paths that have this issue. >>> 1. do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== take PR >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== take PR >>> 2. fchmod|fchmodat >>> chmod_common >>> notify_change >>>ocfs2_setattr <=== take EX >>> posix_acl_chmod >>> get_acl >>> ocfs2_iop_get_acl <=== take PR >>> ocfs2_iop_set_acl <=== take EX >>> >>> Fixes them by adding the tracking logic (in the previous patch) for >>> these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), >>> ocfs2_setattr(). 
>>> >>> Signed-off-by: Eric Ren <z...@suse.com> >>> --- >>> fs/ocfs2/acl.c | 39 ++- >>> fs/ocfs2/file.c | 44 ++-- >>> 2 files changed, 68 insertions(+), 15 deletions(-) >>> >>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c >>> index bed1fcb..c539890 100644 >>> --- a/fs/ocfs2/acl.c >>> +++ b/fs/ocfs2/acl.c >>> @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, >>> struct posix_acl *acl, int type) >>> { >>> struct buffer_head *bh = NULL; >>> int status = 0; >>> - >>> -status = ocfs2_inode_lock(inode, , 1); >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>> if (status < 0) { >>> if (status != -ENOENT) >>> mlog_errno(status); >>> return status; >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> + >>> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); >>> -ocfs2_inode_unlock(inode, 1); >>> + >>> +if (!has_locked) { >>> +ocfs2_remove_holder(lockres, ); >>> +ocfs2_inode_unlock(inode, 1); >>> +} >>> brelse(bh); >>> + >>> return status; >>> } >>> @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct >>> inode *inode, int type) >>> struct buffer_head *di_bh = NULL; >>> struct posix_acl *acl; >>> int ret; >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> osb = OCFS2_SB(inode->i_sb); >>> if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) >>> return NULL; >>> -ret = ocfs2_inode_lock(inode, _bh, 0); >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); >>> if (ret < 0) { >>> if (ret != -ENOENT) >>> mlog_errno(ret); >>> 
return ERR_PTR(ret); >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> acl = ocfs2_get_acl_nolock(inode, type, di_bh); >>> -ocfs2_inode_unlock(inode, 0); >>> +if (!has_locked) {
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/13/2017 02:19 PM, Eric Ren wrote: > Hi! > > On 01/13/2017 12:22 PM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> Commit 743b5f1434f5 ("ocfs2: take inode lock in >>> ocfs2_iop_set/get_acl()") >>> results in a deadlock, as the author "Tariq Saeed" realized shortly >>> after the patch was merged. The discussion happened here >>> (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). >>> >>> >>> The reason why taking cluster inode lock at vfs entry points opens up >>> a self deadlock window, is explained in the previous patch of this >>> series. >>> >>> So far, we have seen two different code paths that have this issue. >>> 1. do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== take PR >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== take PR >>> 2. fchmod|fchmodat >>> chmod_common >>> notify_change >>>ocfs2_setattr <=== take EX >>> posix_acl_chmod >>> get_acl >>> ocfs2_iop_get_acl <=== take PR >>> ocfs2_iop_set_acl <=== take EX >>> >>> Fixes them by adding the tracking logic (in the previous patch) for >>> these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), >>> ocfs2_setattr(). 
>>> >>> Signed-off-by: Eric Ren >>> --- >>> fs/ocfs2/acl.c | 39 ++- >>> fs/ocfs2/file.c | 44 ++-- >>> 2 files changed, 68 insertions(+), 15 deletions(-) >>> >>> diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c >>> index bed1fcb..c539890 100644 >>> --- a/fs/ocfs2/acl.c >>> +++ b/fs/ocfs2/acl.c >>> @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, >>> struct posix_acl *acl, int type) >>> { >>> struct buffer_head *bh = NULL; >>> int status = 0; >>> - >>> -status = ocfs2_inode_lock(inode, , 1); >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +status = ocfs2_inode_lock_full(inode, , 1, arg_flags); >>> if (status < 0) { >>> if (status != -ENOENT) >>> mlog_errno(status); >>> return status; >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> + >>> status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); >>> -ocfs2_inode_unlock(inode, 1); >>> + >>> +if (!has_locked) { >>> +ocfs2_remove_holder(lockres, ); >>> +ocfs2_inode_unlock(inode, 1); >>> +} >>> brelse(bh); >>> + >>> return status; >>> } >>> @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct >>> inode *inode, int type) >>> struct buffer_head *di_bh = NULL; >>> struct posix_acl *acl; >>> int ret; >>> +int arg_flags = 0, has_locked; >>> +struct ocfs2_holder oh; >>> +struct ocfs2_lock_res *lockres; >>> osb = OCFS2_SB(inode->i_sb); >>> if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) >>> return NULL; >>> -ret = ocfs2_inode_lock(inode, _bh, 0); >>> + >>> +lockres = _I(inode)->ip_inode_lockres; >>> +has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); >>> +if (has_locked) >>> +arg_flags = OCFS2_META_LOCK_GETBH; >>> +ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); >>> if (ret < 0) { >>> if (ret != -ENOENT) >>> mlog_errno(ret); >>> return 
ERR_PTR(ret); >>> } >>> +if (!has_locked) >>> +ocfs2_add_holder(lockres, ); >>> acl = ocfs2_get_acl_nolock(inode, type, di_bh); >>> -ocfs2_inode_unlock(inode, 0); >>> +if (!has_locked) { >>> +
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/13/2017 02:12 PM, Eric Ren wrote: > Hi Junxiao! > > On 01/13/2017 11:59 AM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> We are in the situation that we have to avoid recursive cluster locking, >>> but there is no way to check if a cluster lock has been taken by a >>> precess already. >>> >>> Mostly, we can avoid recursive locking by writing code carefully. >>> However, we found that it's very hard to handle the routines that >>> are invoked directly by vfs code. For instance: >>> >>> const struct inode_operations ocfs2_file_iops = { >>> .permission = ocfs2_permission, >>> .get_acl= ocfs2_iop_get_acl, >>> .set_acl= ocfs2_iop_set_acl, >>> }; >>> >>> Both ocfs2_permission() and ocfs2_iop_get_acl() call >>> ocfs2_inode_lock(PR): >>> do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== first time >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== recursive one >>> >>> A deadlock will occur if a remote EX request comes in between two >>> of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: >>> >>> On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in >>> BAST(ocfs2_generic_handle_bast) when downconvert is started >>> on behalf of the remote EX lock request. Another hand, the recursive >>> cluster lock (the second one) will be blocked in in >>> __ocfs2_cluster_lock() >>> because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? >>> because there is no chance for the first cluster lock on this node to be >>> unlocked - we block ourselves in the code path. >>> >>> The idea to fix this issue is mostly taken from gfs2 code. >>> 1. introduce a new field: struct ocfs2_lock_res.l_holders, to >>> keep track of the processes' pid who has taken the cluster lock >>> of this lock resource; >>> 2. 
introduce a new flag for ocfs2_inode_lock_full: >>> OCFS2_META_LOCK_GETBH; >>> it means just getting back disk inode bh for us if we've got cluster >>> lock. >>> 3. export a helper: ocfs2_is_locked_by_me() is used to check if we >>> have got the cluster lock in the upper code path. >>> >>> The tracking logic should be used by some of the ocfs2 vfs's callbacks, >>> to solve the recursive locking issue cuased by the fact that vfs >>> routines >>> can call into each other. >>> >>> The performance penalty of processing the holder list should only be >>> seen >>> at a few cases where the tracking logic is used, such as get/set acl. >>> >>> You may ask what if the first time we got a PR lock, and the second time >>> we want a EX lock? fortunately, this case never happens in the real >>> world, >>> as far as I can see, including permission check, >>> (get|set)_(acl|attr), and >>> the gfs2 code also do so. >>> >>> Signed-off-by: Eric Ren <z...@suse.com> >>> --- >>> fs/ocfs2/dlmglue.c | 47 >>> --- >>> fs/ocfs2/dlmglue.h | 18 ++ >>> fs/ocfs2/ocfs2.h | 1 + >>> 3 files changed, 63 insertions(+), 3 deletions(-) >>> >>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c >>> index 83d576f..500bda4 100644 >>> --- a/fs/ocfs2/dlmglue.c >>> +++ b/fs/ocfs2/dlmglue.c >>> @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct >>> ocfs2_lock_res *res) >>> init_waitqueue_head(>l_event); >>> INIT_LIST_HEAD(>l_blocked_list); >>> INIT_LIST_HEAD(>l_mask_waiters); >>> +INIT_LIST_HEAD(>l_holders); >>> } >>> void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, >>> @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res >>> *res) >>> res->l_flags = 0UL; >>> } >>> +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, >>> + struct ocfs2_holder *oh) >>> +{ >>> +INIT_LIST_HEAD(>oh_list); >>> +oh->oh_owner_pid = get_pid(task_pid(current)); >> struct pid(oh->oh_owner_pid) looks complicated here, why not use >> task_struct(current) or pid_t(current->pid) directly? 
Also I didn't see >> the ref
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/13/2017 02:12 PM, Eric Ren wrote: > Hi Junxiao! > > On 01/13/2017 11:59 AM, Junxiao Bi wrote: >> On 01/05/2017 11:31 PM, Eric Ren wrote: >>> We are in the situation that we have to avoid recursive cluster locking, >>> but there is no way to check if a cluster lock has been taken by a >>> precess already. >>> >>> Mostly, we can avoid recursive locking by writing code carefully. >>> However, we found that it's very hard to handle the routines that >>> are invoked directly by vfs code. For instance: >>> >>> const struct inode_operations ocfs2_file_iops = { >>> .permission = ocfs2_permission, >>> .get_acl= ocfs2_iop_get_acl, >>> .set_acl= ocfs2_iop_set_acl, >>> }; >>> >>> Both ocfs2_permission() and ocfs2_iop_get_acl() call >>> ocfs2_inode_lock(PR): >>> do_sys_open >>> may_open >>>inode_permission >>> ocfs2_permission >>> ocfs2_inode_lock() <=== first time >>> generic_permission >>>get_acl >>> ocfs2_iop_get_acl >>> ocfs2_inode_lock() <=== recursive one >>> >>> A deadlock will occur if a remote EX request comes in between two >>> of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: >>> >>> On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in >>> BAST(ocfs2_generic_handle_bast) when downconvert is started >>> on behalf of the remote EX lock request. Another hand, the recursive >>> cluster lock (the second one) will be blocked in in >>> __ocfs2_cluster_lock() >>> because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? >>> because there is no chance for the first cluster lock on this node to be >>> unlocked - we block ourselves in the code path. >>> >>> The idea to fix this issue is mostly taken from gfs2 code. >>> 1. introduce a new field: struct ocfs2_lock_res.l_holders, to >>> keep track of the processes' pid who has taken the cluster lock >>> of this lock resource; >>> 2. 
introduce a new flag for ocfs2_inode_lock_full: >>> OCFS2_META_LOCK_GETBH; >>> it means just getting back disk inode bh for us if we've got cluster >>> lock. >>> 3. export a helper: ocfs2_is_locked_by_me() is used to check if we >>> have got the cluster lock in the upper code path. >>> >>> The tracking logic should be used by some of the ocfs2 vfs's callbacks, >>> to solve the recursive locking issue cuased by the fact that vfs >>> routines >>> can call into each other. >>> >>> The performance penalty of processing the holder list should only be >>> seen >>> at a few cases where the tracking logic is used, such as get/set acl. >>> >>> You may ask what if the first time we got a PR lock, and the second time >>> we want a EX lock? fortunately, this case never happens in the real >>> world, >>> as far as I can see, including permission check, >>> (get|set)_(acl|attr), and >>> the gfs2 code also do so. >>> >>> Signed-off-by: Eric Ren >>> --- >>> fs/ocfs2/dlmglue.c | 47 >>> --- >>> fs/ocfs2/dlmglue.h | 18 ++ >>> fs/ocfs2/ocfs2.h | 1 + >>> 3 files changed, 63 insertions(+), 3 deletions(-) >>> >>> diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c >>> index 83d576f..500bda4 100644 >>> --- a/fs/ocfs2/dlmglue.c >>> +++ b/fs/ocfs2/dlmglue.c >>> @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct >>> ocfs2_lock_res *res) >>> init_waitqueue_head(>l_event); >>> INIT_LIST_HEAD(>l_blocked_list); >>> INIT_LIST_HEAD(>l_mask_waiters); >>> +INIT_LIST_HEAD(>l_holders); >>> } >>> void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, >>> @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res >>> *res) >>> res->l_flags = 0UL; >>> } >>> +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, >>> + struct ocfs2_holder *oh) >>> +{ >>> +INIT_LIST_HEAD(>oh_list); >>> +oh->oh_owner_pid = get_pid(task_pid(current)); >> struct pid(oh->oh_owner_pid) looks complicated here, why not use >> task_struct(current) or pid_t(current->pid) directly? 
Also I didn't see >> the ref count needs
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/05/2017 11:31 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). 
> > Signed-off-by: Eric Ren> --- > fs/ocfs2/acl.c | 39 ++- > fs/ocfs2/file.c | 44 ++-- > 2 files changed, 68 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..c539890 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, 
di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0); > + } > brelse(di_bh); > + > return acl; > } > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..62be75d 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > handle_t *handle = NULL; > struct dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int arg_flags = 0, had_lock; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,13 +1176,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > } > } > > - status = ocfs2_inode_lock(inode, , 1); > + lockres = _I(inode)->ip_inode_lockres; > + had_lock = (ocfs2_is_locked_by_me(lockres) != NULL); If had_lock==true, it is a bug? I think we should BUG_ON for it, that can help us catch bug at the first time. > + if (had_lock) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { >
Re: [PATCH 2/2] ocfs2: fix deadlocks when taking inode lock at vfs entry points
On 01/05/2017 11:31 PM, Eric Ren wrote: > Commit 743b5f1434f5 ("ocfs2: take inode lock in ocfs2_iop_set/get_acl()") > results in a deadlock, as the author "Tariq Saeed" realized shortly > after the patch was merged. The discussion happened here > (https://oss.oracle.com/pipermail/ocfs2-devel/2015-September/011085.html). > > The reason why taking cluster inode lock at vfs entry points opens up > a self deadlock window, is explained in the previous patch of this > series. > > So far, we have seen two different code paths that have this issue. > 1. do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== take PR > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== take PR > 2. fchmod|fchmodat > chmod_common > notify_change > ocfs2_setattr <=== take EX >posix_acl_chmod > get_acl > ocfs2_iop_get_acl <=== take PR > ocfs2_iop_set_acl <=== take EX > > Fixes them by adding the tracking logic (in the previous patch) for > these funcs above, ocfs2_permission(), ocfs2_iop_[set|get]_acl(), > ocfs2_setattr(). 
> > Signed-off-by: Eric Ren > --- > fs/ocfs2/acl.c | 39 ++- > fs/ocfs2/file.c | 44 ++-- > 2 files changed, 68 insertions(+), 15 deletions(-) > > diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c > index bed1fcb..c539890 100644 > --- a/fs/ocfs2/acl.c > +++ b/fs/ocfs2/acl.c > @@ -284,16 +284,31 @@ int ocfs2_iop_set_acl(struct inode *inode, struct > posix_acl *acl, int type) > { > struct buffer_head *bh = NULL; > int status = 0; > - > - status = ocfs2_inode_lock(inode, , 1); > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if (status != -ENOENT) > mlog_errno(status); > return status; > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > + > status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL); > - ocfs2_inode_unlock(inode, 1); > + > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 1); > + } > brelse(bh); > + > return status; > } > > @@ -303,21 +318,35 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode > *inode, int type) > struct buffer_head *di_bh = NULL; > struct posix_acl *acl; > int ret; > + int arg_flags = 0, has_locked; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > osb = OCFS2_SB(inode->i_sb); > if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) > return NULL; > - ret = ocfs2_inode_lock(inode, _bh, 0); > + > + lockres = _I(inode)->ip_inode_lockres; > + has_locked = (ocfs2_is_locked_by_me(lockres) != NULL); > + if (has_locked) > + arg_flags = OCFS2_META_LOCK_GETBH; > + ret = ocfs2_inode_lock_full(inode, _bh, 0, arg_flags); > if (ret < 0) { > if (ret != -ENOENT) > mlog_errno(ret); > return ERR_PTR(ret); > } > + if (!has_locked) > + ocfs2_add_holder(lockres, ); > > acl = ocfs2_get_acl_nolock(inode, type, 
di_bh); > > - ocfs2_inode_unlock(inode, 0); > + if (!has_locked) { > + ocfs2_remove_holder(lockres, ); > + ocfs2_inode_unlock(inode, 0); > + } > brelse(di_bh); > + > return acl; > } > > diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c > index c488965..62be75d 100644 > --- a/fs/ocfs2/file.c > +++ b/fs/ocfs2/file.c > @@ -1138,6 +1138,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > handle_t *handle = NULL; > struct dquot *transfer_to[MAXQUOTAS] = { }; > int qtype; > + int arg_flags = 0, had_lock; > + struct ocfs2_holder oh; > + struct ocfs2_lock_res *lockres; > > trace_ocfs2_setattr(inode, dentry, > (unsigned long long)OCFS2_I(inode)->ip_blkno, > @@ -1173,13 +1176,20 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr > *attr) > } > } > > - status = ocfs2_inode_lock(inode, , 1); > + lockres = _I(inode)->ip_inode_lockres; > + had_lock = (ocfs2_is_locked_by_me(lockres) != NULL); If had_lock==true, it is a bug? I think we should BUG_ON for it, that can help us catch bug at the first time. > + if (had_lock) > + arg_flags = OCFS2_META_LOCK_GETBH; > + status = ocfs2_inode_lock_full(inode, , 1, arg_flags); > if (status < 0) { > if
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/05/2017 11:31 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > precess already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Signed-off-by: Eric Ren> --- > fs/ocfs2/dlmglue.c | 47 --- > fs/ocfs2/dlmglue.h | 18 ++ > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 63 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 83d576f..500bda4 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); struct pid(oh->oh_owner_pid) looks complicated here, why not use task_struct(current) or pid_t(current->pid) directly? Also i didn't see the ref count needs to be considered. 
> + > + spin_lock(>l_lock); > + list_add_tail(>oh_list, >l_holders); > + spin_unlock(>l_lock); > +} > + > +inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + spin_lock(>l_lock); > + list_del(>oh_list); > + spin_unlock(>l_lock); > + > + put_pid(oh->oh_owner_pid); same the above > +} > + > +inline struct ocfs2_holder *ocfs2_is_locked_by_me(struct ocfs2_lock_res > *lockres) Agree with Joseph, return bool looks better. I didn't see how that help debug since the return value is not used. > +{ > + struct ocfs2_holder *oh; > + struct pid *pid; > + > + /* look in the list of holders for one with the current task as owner */ > + spin_lock(>l_lock); > + pid = task_pid(current); > + list_for_each_entry(oh, >l_holders, oh_list) { > + if (oh->oh_owner_pid == pid) > + goto out; > + } > + oh = NULL; > +out: > + spin_unlock(>l_lock); > + return oh; > +} > + > static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, >int level) > { > @@ -2333,8 +2373,9 @@
Re: [PATCH 1/2] ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
On 01/05/2017 11:31 PM, Eric Ren wrote: > We are in the situation that we have to avoid recursive cluster locking, > but there is no way to check if a cluster lock has been taken by a > process already. > > Mostly, we can avoid recursive locking by writing code carefully. > However, we found that it's very hard to handle the routines that > are invoked directly by vfs code. For instance: > > const struct inode_operations ocfs2_file_iops = { > .permission = ocfs2_permission, > .get_acl= ocfs2_iop_get_acl, > .set_acl= ocfs2_iop_set_acl, > }; > > Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR): > do_sys_open > may_open > inode_permission >ocfs2_permission > ocfs2_inode_lock() <=== first time > generic_permission > get_acl >ocfs2_iop_get_acl > ocfs2_inode_lock() <=== recursive one > > A deadlock will occur if a remote EX request comes in between two > of ocfs2_inode_lock(). Briefly describe how the deadlock is formed: > > On one hand, OCFS2_LOCK_BLOCKED flag of this lockres is set in > BAST(ocfs2_generic_handle_bast) when downconvert is started > on behalf of the remote EX lock request. Another hand, the recursive > cluster lock (the second one) will be blocked in __ocfs2_cluster_lock() > because of OCFS2_LOCK_BLOCKED. But, the downconvert never complete, why? > because there is no chance for the first cluster lock on this node to be > unlocked - we block ourselves in the code path. > > The idea to fix this issue is mostly taken from gfs2 code. > 1. introduce a new field: struct ocfs2_lock_res.l_holders, to > keep track of the processes' pid who has taken the cluster lock > of this lock resource; > 2. introduce a new flag for ocfs2_inode_lock_full: OCFS2_META_LOCK_GETBH; > it means just getting back disk inode bh for us if we've got cluster lock. > 3. export a helper: ocfs2_is_locked_by_me() is used to check if we > have got the cluster lock in the upper code path. 
> > The tracking logic should be used by some of the ocfs2 vfs's callbacks, > to solve the recursive locking issue cuased by the fact that vfs routines > can call into each other. > > The performance penalty of processing the holder list should only be seen > at a few cases where the tracking logic is used, such as get/set acl. > > You may ask what if the first time we got a PR lock, and the second time > we want a EX lock? fortunately, this case never happens in the real world, > as far as I can see, including permission check, (get|set)_(acl|attr), and > the gfs2 code also do so. > > Signed-off-by: Eric Ren > --- > fs/ocfs2/dlmglue.c | 47 --- > fs/ocfs2/dlmglue.h | 18 ++ > fs/ocfs2/ocfs2.h | 1 + > 3 files changed, 63 insertions(+), 3 deletions(-) > > diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c > index 83d576f..500bda4 100644 > --- a/fs/ocfs2/dlmglue.c > +++ b/fs/ocfs2/dlmglue.c > @@ -532,6 +532,7 @@ void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res) > init_waitqueue_head(>l_event); > INIT_LIST_HEAD(>l_blocked_list); > INIT_LIST_HEAD(>l_mask_waiters); > + INIT_LIST_HEAD(>l_holders); > } > > void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, > @@ -749,6 +750,45 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res) > res->l_flags = 0UL; > } > > +inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + INIT_LIST_HEAD(>oh_list); > + oh->oh_owner_pid = get_pid(task_pid(current)); struct pid(oh->oh_owner_pid) looks complicated here, why not use task_struct(current) or pid_t(current->pid) directly? Also i didn't see the ref count needs to be considered. 
> + > + spin_lock(>l_lock); > + list_add_tail(>oh_list, >l_holders); > + spin_unlock(>l_lock); > +} > + > +inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, > +struct ocfs2_holder *oh) > +{ > + spin_lock(>l_lock); > + list_del(>oh_list); > + spin_unlock(>l_lock); > + > + put_pid(oh->oh_owner_pid); same the above > +} > + > +inline struct ocfs2_holder *ocfs2_is_locked_by_me(struct ocfs2_lock_res > *lockres) Agree with Joseph, return bool looks better. I didn't see how that help debug since the return value is not used. > +{ > + struct ocfs2_holder *oh; > + struct pid *pid; > + > + /* look in the list of holders for one with the current task as owner */ > + spin_lock(>l_lock); > + pid = task_pid(current); > + list_for_each_entry(oh, >l_holders, oh_list) { > + if (oh->oh_owner_pid == pid) > + goto out; > + } > + oh = NULL; > +out: > + spin_unlock(>l_lock); > + return oh; > +} > + > static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, >int level) > { > @@ -2333,8 +2373,9 @@ int
kernel panic on next-20160225
Hi, The following panic is triggered when run ocfs2 xattr test on linux-next-20160225. Did anybody ever see this? [ 254.604228] BUG: unable to handle kernel paging request at 0002000800c0 [ 254.605013] IP: [] kmem_cache_alloc+0x78/0x160 [ 254.605013] PGD 7bbe5067 PUD 0 [ 254.605013] Oops: [#1] SMP [ 254.605013] Modules linked in: ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront xen_netfront xen_fbfront xen_blkfront [ 254.605013] CPU: 2 PID: 4044 Comm: mpirun Not tainted 4.5.0-rc5-next-20160225 #1 [ 254.605013] Hardware name: Xen HVM domU, BIOS 4.3.1OVM 05/14/2014 [ 254.605013] task: 88007a521a80 ti: 88007aed task.ti: 88007aed [ 254.605013] RIP: 0010:[] [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP: 0018:88007aed3a48 EFLAGS: 00010282 [ 254.605013] RAX: RBX: RCX: 1991 [ 254.605013] RDX: 1990 RSI: 024000c0 RDI: 0001b330 [ 254.605013] RBP: 88007aed3a98 R08: 88007d29b330 R09: 0002000800c0 [ 254.605013] R10: 000c51376d87 R11: 8800792cac38 R12: 88007cc30f00 [ 254.605013] R13: 024000c0 R14: 811b053f R15: 88007aed3ce7 [ 254.605013] FS: () GS:88007d28() knlGS: [ 254.605013] CS: 0010 DS: ES: CR0: 80050033 [ 254.605013] CR2: 0002000800c0 CR3: 7aeb2000 CR4: 000406e0 [ 254.605013] Stack: [ 254.605013] 13082000 88007aed3d28 0079 0001 [ 254.605013] 2f2f2f2f 8800792cac00 88007aed3d38 0101 [ 254.605013] 88007a5e2000 88007aed3ce7 88007aed3b08 811b053f [ 254.605013] Call Trace: [ 254.605013] [] __d_alloc+0x2f/0x1a0 [ 254.605013] [] ? unlazy_walk+0xe2/0x160 [ 254.605013] [] d_alloc+0x17/0x80 [ 254.605013] [] lookup_dcache+0x8a/0xc0 [ 254.605013] [] ? __alloc_pages_nodemask+0x173/0xeb0 [ 254.605013] [] path_openat+0x3c3/0x1210 [ 254.605013] [] ? radix_tree_lookup_slot+0x13/0x30 [ 254.605013] [] ? find_get_entry+0x32/0xc0 [ 254.605013] [] ? atime_needs_update+0x55/0xe0 [ 254.605013] [] ? filemap_fault+0xd1/0x4b0 [ 254.605013] [] ? 
do_set_pte+0xb6/0x140 [ 254.605013] [] do_filp_open+0x80/0xe0 [ 254.605013] [] ? __alloc_fd+0x48/0x1a0 [ 254.605013] [] ? getname_flags+0x7a/0x1e0 [ 254.605013] [] do_sys_open+0x110/0x200 [ 254.605013] [] SyS_open+0x19/0x20 [ 254.605013] [] do_syscall_64+0x72/0x230 [ 254.605013] [] ? __do_page_fault+0x177/0x430 [ 254.605013] [] entry_SYSCALL64_slow_path+0x25/0x25 [ 254.605013] Code: 05 e6 77 e7 7e 4d 8b 08 49 8b 40 10 4d 85 c9 0f 84 dd 00 00 00 48 85 c0 0f 84 d4 00 00 00 49 63 44 24 20 49 8b 3c 24 48 8d 4a 01 <49> 8b 1c 01 4c 89 c8 65 48 0f c7 0f 0f 94 c0 3c 01 75 b6 49 63 [ 254.605013] RIP [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP [ 254.605013] CR2: 0002000800c0 [ 254.792273] ---[ end trace 823969e602e4aaac ]--- Thanks, Junxiao.
kernel panic on next-20160225
Hi, The following panic is triggered when run ocfs2 xattr test on linux-next-20160225. Did anybody ever see this? [ 254.604228] BUG: unable to handle kernel paging request at 0002000800c0 [ 254.605013] IP: [] kmem_cache_alloc+0x78/0x160 [ 254.605013] PGD 7bbe5067 PUD 0 [ 254.605013] Oops: [#1] SMP [ 254.605013] Modules linked in: ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm ocfs2_nodemanager ocfs2_stackglue iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront xen_netfront xen_fbfront xen_blkfront [ 254.605013] CPU: 2 PID: 4044 Comm: mpirun Not tainted 4.5.0-rc5-next-20160225 #1 [ 254.605013] Hardware name: Xen HVM domU, BIOS 4.3.1OVM 05/14/2014 [ 254.605013] task: 88007a521a80 ti: 88007aed task.ti: 88007aed [ 254.605013] RIP: 0010:[] [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP: 0018:88007aed3a48 EFLAGS: 00010282 [ 254.605013] RAX: RBX: RCX: 1991 [ 254.605013] RDX: 1990 RSI: 024000c0 RDI: 0001b330 [ 254.605013] RBP: 88007aed3a98 R08: 88007d29b330 R09: 0002000800c0 [ 254.605013] R10: 000c51376d87 R11: 8800792cac38 R12: 88007cc30f00 [ 254.605013] R13: 024000c0 R14: 811b053f R15: 88007aed3ce7 [ 254.605013] FS: () GS:88007d28() knlGS: [ 254.605013] CS: 0010 DS: ES: CR0: 80050033 [ 254.605013] CR2: 0002000800c0 CR3: 7aeb2000 CR4: 000406e0 [ 254.605013] Stack: [ 254.605013] 13082000 88007aed3d28 0079 0001 [ 254.605013] 2f2f2f2f 8800792cac00 88007aed3d38 0101 [ 254.605013] 88007a5e2000 88007aed3ce7 88007aed3b08 811b053f [ 254.605013] Call Trace: [ 254.605013] [] __d_alloc+0x2f/0x1a0 [ 254.605013] [] ? unlazy_walk+0xe2/0x160 [ 254.605013] [] d_alloc+0x17/0x80 [ 254.605013] [] lookup_dcache+0x8a/0xc0 [ 254.605013] [] ? __alloc_pages_nodemask+0x173/0xeb0 [ 254.605013] [] path_openat+0x3c3/0x1210 [ 254.605013] [] ? radix_tree_lookup_slot+0x13/0x30 [ 254.605013] [] ? find_get_entry+0x32/0xc0 [ 254.605013] [] ? atime_needs_update+0x55/0xe0 [ 254.605013] [] ? filemap_fault+0xd1/0x4b0 [ 254.605013] [] ? 
do_set_pte+0xb6/0x140 [ 254.605013] [] do_filp_open+0x80/0xe0 [ 254.605013] [] ? __alloc_fd+0x48/0x1a0 [ 254.605013] [] ? getname_flags+0x7a/0x1e0 [ 254.605013] [] do_sys_open+0x110/0x200 [ 254.605013] [] SyS_open+0x19/0x20 [ 254.605013] [] do_syscall_64+0x72/0x230 [ 254.605013] [] ? __do_page_fault+0x177/0x430 [ 254.605013] [] entry_SYSCALL64_slow_path+0x25/0x25 [ 254.605013] Code: 05 e6 77 e7 7e 4d 8b 08 49 8b 40 10 4d 85 c9 0f 84 dd 00 00 00 48 85 c0 0f 84 d4 00 00 00 49 63 44 24 20 49 8b 3c 24 48 8d 4a 01 <49> 8b 1c 01 4c 89 c8 65 48 0f c7 0f 0f 94 c0 3c 01 75 b6 49 63 [ 254.605013] RIP [] kmem_cache_alloc+0x78/0x160 [ 254.605013] RSP [ 254.605013] CR2: 0002000800c0 [ 254.792273] ---[ end trace 823969e602e4aaac ]--- Thanks, Junxiao.
Re: linux-next: kernel panic in ipv6_defrag
On 12/23/2015 04:59 PM, Florian Westphal wrote: > Junxiao Bi wrote: >> The following panic happened when I run ocfs2-test on linux-next. Kernel >> config is attached. >> >> [64910.905501] BUG: unable to handle kernel NULL pointer dereference at >> (null) >> [64910.906466] IP: [] nf_ct_frag6_gather+0x7ad/0x9c0 > [..] >> ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) >> iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront >> xen_netfront xen_fbfront xen_blkfront [last unloaded: ocfs2_stackglue] >> [64910.906466] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O >> 4.4.0-rc5-next-20151217 #1 > > Seems like this snapshot still lacks > > e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > ("netfilter: ipv6: nf_defrag: fix NULL deref panic"), > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/net/ipv6/netfilter/nf_conntrack_reasm.c?id=e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > > Its included starting with next-20151221. > > Please report back if it occurs with above commit present. Looks issue resolved with this fix. Thank you. > > Thanks. > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: kernel panic in ipv6_defrag
On 12/23/2015 04:59 PM, Florian Westphal wrote: > Junxiao Bi <junxiao...@oracle.com> wrote: >> The following panic happened when I run ocfs2-test on linux-next. Kernel >> config is attached. >> >> [64910.905501] BUG: unable to handle kernel NULL pointer dereference at >> (null) >> [64910.906466] IP: [] nf_ct_frag6_gather+0x7ad/0x9c0 > [..] >> ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) >> iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi xen_kbdfront >> xen_netfront xen_fbfront xen_blkfront [last unloaded: ocfs2_stackglue] >> [64910.906466] CPU: 0 PID: 0 Comm: swapper/0 Tainted: G O >> 4.4.0-rc5-next-20151217 #1 > > Seems like this snapshot still lacks > > e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > ("netfilter: ipv6: nf_defrag: fix NULL deref panic"), > https://git.kernel.org/cgit/linux/kernel/git/next/linux-next.git/commit/net/ipv6/netfilter/nf_conntrack_reasm.c?id=e97ac12859dbf4d3ee0eddb9798867541d1d1e1e > > Its included starting with next-20151221. > > Please report back if it occurs with above commit present. Looks issue resolved with this fix. Thank you. > > Thanks. > -- > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/25/2015 01:04 PM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> Hi Mark, >> >> On 11/25/2015 06:16 AM, Mark Fasheh wrote: >>> Hi Junxiao, >>> >>> On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >>>> Hi Gang, >>>> >>>> This is not like a right patch. >>>> First, online file check only checks inode's block number, valid flag, >>>> fs generation value, and meta ecc. I never see a real corruption >>>> happened only on this field, if these fields are corrupted, that means >>>> something bad may happen on other place. So fix this field may not help >>>> and even cause corruption more hard. >>> >>> I agree that these are rather uncommon, we might even consider removing the >>> VALID_FL fixup. I definitely don't think we're ready for anything more >>> complicated than this though either. We kind of have to start somewhere too. >>> >> Yes, the fix is too simple, and just a start, I think we'd better wait >> more useful parts done before merging it. > I agree, just remark VALID_FL flag to fix this field is too simple, we should > delay this field fix before > I have a flawless solution, I will remove these lines code in the first > version patches. In the future submits, > I also hope your guys to help review the code carefully, shout out your > comments when you doubt somewhere. Sure. > > > >>> >>>> Second, the repair way is wrong. In >>>> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >>>> match the ones in memory, the ones in memory are used to update the disk >>>> fields. The question is how do you know these field in memory are >>>> right(they may be the real corrupted ones)? >>> >>> Your second point (and the last part of your 1st point) makes a good >>> argument for why this shouldn't happen automatically. Some of these >>> corruptions might require a human to look at the log and decide what to do. >>> Especially as you point out, where we might not know where the source of the >>> corruption is. 
And if the human can't figure it out, then it's probably time >>> to unmount and fsck. >> The point is that the fix way is wrong, just flush memory info to disk >> is not right. I agree online fsck is good feature, but need carefully >> design, it should not involve more corruptions. A rough idea from mine >> is that maybe we need some "freeze" mechanism in fs, which can hang all >> fs ops and let fs stop at a safe area. After freeze fs, we can do some >> fsck work on it and these works should not cost lots time. What's your idea? > If we need to touch some global data structures, freezing fs can be > considered when we can't > get any way in case using the locks. > If we only handle some independent problem, we just need to lock the related > data structures. Hmm, I am not sure whether it's hard to decide an independent issue. Thanks, Junxiao. > >> >> Thanks, >> Junxiao. >> >>> >>> Thanks, >>> --Mark >>> >>> -- >>> Mark Fasheh >>> > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/25/2015 11:29 AM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >>> Hi Gang, >>> >>> On 11/03/2015 03:54 PM, Gang He wrote: >>>> Hi Junxiao, >>>> >>>> Thank for your reviewing. >>>> Current design, we use a sysfile as a interface to check/fix a file (via >> pass a ino number). >>>> But, this operation is manually triggered by user, instead of >>>> automatically >> fix in the kernel. >>>> Why? >>>> 1) we should let users make this decision, since some users do not want to >> fix when encountering a file system corruption, maybe they want to keep the >> file system unchanged for a further investigation. >>> If user don't want this, they should not use error=continue option, let >>> fs go after a corruption is very dangerous. >> >> Maybe we need another errors=XXX flag (maybe errors=fix)? >> >> You both make good points, here's what I gather from the conversation: >> >> - Some customers would be sad if they have to manually fix corruptions. >>This takes effort on their part, and if the FS can handle it >>automatically, it should. >> >> - There are valid concerns that automatically fixing things is a change in >>behavior that might not be welcome, or worse might lead to unforseeable >>circumstances. >> >> - I will add that fixing things automatically implies checking them >>automatically which could introduce some performance impact depending on >>how much checking we're doing. >> >> So if the user wants errors to be fixed automatically, they could mount with >> errros=fix, and everyone else would have no change in behavior unless they >> wanted to make use of the new feature. > That is what I want to say, add a mount option to let users to decide. Here, > I want to split "error=fix" > mount option task out from online file check feature, I think this part > should be a independent feature. 
> We can implement this feature after online file check is done, I want to > split the feature into some more > detailed features, implement them one by one. Do you agree this point? With error=fix, when a possible corruption is found, online fsck will start to check and fix things. So this doesn't looks like a independent feature. Thanks, Junxiao. > >> >> >>>> 2) frankly speaking, this feature will probably bring a second corruption >> if there is some error in the code, I do not suggest to use automatically >> fix >> by default in the first version. >>> I think if this feature could bring more corruption, then this should be >>> fixed first. >> >> Btw, I am pretty sure that Gang is referring to the feature being new and >> thus more likely to have problems. There is nothing I see in here that is >> file system corrupting. >> --Mark >> >> >> -- >> Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
On 11/25/2015 05:46 AM, Mark Fasheh wrote: > On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> On 11/03/2015 03:54 PM, Gang He wrote: >>> Hi Junxiao, >>> >>> Thank for your reviewing. >>> Current design, we use a sysfile as a interface to check/fix a file (via >>> pass a ino number). >>> But, this operation is manually triggered by user, instead of automatically >>> fix in the kernel. >>> Why? >>> 1) we should let users make this decision, since some users do not want to >>> fix when encountering a file system corruption, maybe they want to keep the >>> file system unchanged for a further investigation. >> If user don't want this, they should not use error=continue option, let >> fs go after a corruption is very dangerous. > > Maybe we need another errors=XXX flag (maybe errors=fix)? Sound great. This is a good option since user may have not enough knowledge whether to fix the found issue. Thanks, Junxiao. > > You both make good points, here's what I gather from the conversation: > > - Some customers would be sad if they have to manually fix corruptions. >This takes effort on their part, and if the FS can handle it >automatically, it should. > > - There are valid concerns that automatically fixing things is a change in >behavior that might not be welcome, or worse might lead to unforseeable >circumstances. > > - I will add that fixing things automatically implies checking them >automatically which could introduce some performance impact depending on >how much checking we're doing. > > So if the user wants errors to be fixed automatically, they could mount with > errros=fix, and everyone else would have no change in behavior unless they > wanted to make use of the new feature. > > >>> 2) frankly speaking, this feature will probably bring a second corruption >>> if there is some error in the code, I do not suggest to use automatically >>> fix by default in the first version. 
>> I think if this feature could bring more corruption, then this should be >> fixed first. > > Btw, I am pretty sure that Gang is referring to the feature being new and > thus more likely to have problems. There is nothing I see in here that is > file system corrupting. > --Mark > > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Mark, On 11/25/2015 06:16 AM, Mark Fasheh wrote: > Hi Junxiao, > > On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > > I agree that these are rather uncommon, we might even consider removing the > VALID_FL fixup. I definitely don't think we're ready for anything more > complicated than this though either. We kind of have to start somewhere too. > Yes, the fix is too simple, and just a start, I think we'd better wait more useful parts done before merging it. > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > > Your second point (and the last part of your 1st point) makes a good > argument for why this shouldn't happen automatically. Some of these > corruptions might require a human to look at the log and decide what to do. > Especially as you point out, where we might not know where the source of the > corruption is. And if the human can't figure it out, then it's probably time > to unmount and fsck. The point is that the fix way is wrong, just flush memory info to disk is not right. I agree online fsck is good feature, but need carefully design, it should not involve more corruptions. A rough idea from mine is that maybe we need some "freeze" mechanism in fs, which can hang all fs ops and let fs stop at a safe area. After freeze fs, we can do some fsck work on it and these works should not cost lots time. What's your idea? 
Thanks, Junxiao. > > Thanks, > --Mark > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Mark, On 11/25/2015 06:16 AM, Mark Fasheh wrote: > Hi Junxiao, > > On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > > I agree that these are rather uncommon, we might even consider removing the > VALID_FL fixup. I definitely don't think we're ready for anything more > complicated than this though either. We kind of have to start somewhere too. > Yes, the fix is too simple, and just a start, I think we'd better wait more useful parts done before merging it. > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > > Your second point (and the last part of your 1st point) makes a good > argument for why this shouldn't happen automatically. Some of these > corruptions might require a human to look at the log and decide what to do. > Especially as you point out, where we might not know where the source of the > corruption is. And if the human can't figure it out, then it's probably time > to unmount and fsck. The point is that the fix way is wrong, just flush memory info to disk is not right. I agree online fsck is good feature, but need carefully design, it should not involve more corruptions. A rough idea from mine is that maybe we need some "freeze" mechanism in fs, which can hang all fs ops and let fs stop at a safe area. After freeze fs, we can do some fsck work on it and these works should not cost lots time. What's your idea? 
Thanks, Junxiao. > > Thanks, > --Mark > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
On 11/25/2015 05:46 AM, Mark Fasheh wrote: > On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >> Hi Gang, >> >> On 11/03/2015 03:54 PM, Gang He wrote: >>> Hi Junxiao, >>> >>> Thank for your reviewing. >>> Current design, we use a sysfile as a interface to check/fix a file (via >>> pass a ino number). >>> But, this operation is manually triggered by user, instead of automatically >>> fix in the kernel. >>> Why? >>> 1) we should let users make this decision, since some users do not want to >>> fix when encountering a file system corruption, maybe they want to keep the >>> file system unchanged for a further investigation. >> If user don't want this, they should not use error=continue option, let >> fs go after a corruption is very dangerous. > > Maybe we need another errors=XXX flag (maybe errors=fix)? Sound great. This is a good option since user may have not enough knowledge whether to fix the found issue. Thanks, Junxiao. > > You both make good points, here's what I gather from the conversation: > > - Some customers would be sad if they have to manually fix corruptions. >This takes effort on their part, and if the FS can handle it >automatically, it should. > > - There are valid concerns that automatically fixing things is a change in >behavior that might not be welcome, or worse might lead to unforseeable >circumstances. > > - I will add that fixing things automatically implies checking them >automatically which could introduce some performance impact depending on >how much checking we're doing. > > So if the user wants errors to be fixed automatically, they could mount with > errros=fix, and everyone else would have no change in behavior unless they > wanted to make use of the new feature. > > >>> 2) frankly speaking, this feature will probably bring a second corruption >>> if there is some error in the code, I do not suggest to use automatically >>> fix by default in the first version. 
>> I think if this feature could bring more corruption, then this should be >> fixed first. > > Btw, I am pretty sure that Gang is referring to the feature being new and > thus more likely to have problems. There is nothing I see in here that is > file system corrupting. > --Mark > > > -- > Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/25/2015 11:29 AM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> On Tue, Nov 03, 2015 at 04:20:27PM +0800, Junxiao Bi wrote: >>> Hi Gang, >>> >>> On 11/03/2015 03:54 PM, Gang He wrote: >>>> Hi Junxiao, >>>> >>>> Thank for your reviewing. >>>> Current design, we use a sysfile as a interface to check/fix a file (via >> pass a ino number). >>>> But, this operation is manually triggered by user, instead of >>>> automatically >> fix in the kernel. >>>> Why? >>>> 1) we should let users make this decision, since some users do not want to >> fix when encountering a file system corruption, maybe they want to keep the >> file system unchanged for a further investigation. >>> If user don't want this, they should not use error=continue option, let >>> fs go after a corruption is very dangerous. >> >> Maybe we need another errors=XXX flag (maybe errors=fix)? >> >> You both make good points, here's what I gather from the conversation: >> >> - Some customers would be sad if they have to manually fix corruptions. >>This takes effort on their part, and if the FS can handle it >>automatically, it should. >> >> - There are valid concerns that automatically fixing things is a change in >>behavior that might not be welcome, or worse might lead to unforseeable >>circumstances. >> >> - I will add that fixing things automatically implies checking them >>automatically which could introduce some performance impact depending on >>how much checking we're doing. >> >> So if the user wants errors to be fixed automatically, they could mount with >> errros=fix, and everyone else would have no change in behavior unless they >> wanted to make use of the new feature. > That is what I want to say, add a mount option to let users to decide. Here, > I want to split "error=fix" > mount option task out from online file check feature, I think this part > should be a independent feature. 
> We can implement this feature after online file check is done, I want to > split the feature into some more > detailed features, implement them one by one. Do you agree this point? With error=fix, when a possible corruption is found, online fsck will start to check and fix things. So this doesn't looks like a independent feature. Thanks, Junxiao. > >> >> >>>> 2) frankly speaking, this feature will probably bring a second corruption >> if there is some error in the code, I do not suggest to use automatically >> fix >> by default in the first version. >>> I think if this feature could bring more corruption, then this should be >>> fixed first. >> >> Btw, I am pretty sure that Gang is referring to the feature being new and >> thus more likely to have problems. There is nothing I see in here that is >> file system corrupting. >> --Mark >> >> >> -- >> Mark Fasheh > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [Ocfs2-devel] [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/25/2015 01:04 PM, Gang He wrote: > Hi Mark and Junxiao, > > >>>> >> Hi Mark, >> >> On 11/25/2015 06:16 AM, Mark Fasheh wrote: >>> Hi Junxiao, >>> >>> On Tue, Nov 03, 2015 at 03:12:35PM +0800, Junxiao Bi wrote: >>>> Hi Gang, >>>> >>>> This is not like a right patch. >>>> First, online file check only checks inode's block number, valid flag, >>>> fs generation value, and meta ecc. I never see a real corruption >>>> happened only on this field, if these fields are corrupted, that means >>>> something bad may happen on other place. So fix this field may not help >>>> and even cause corruption more hard. >>> >>> I agree that these are rather uncommon, we might even consider removing the >>> VALID_FL fixup. I definitely don't think we're ready for anything more >>> complicated than this though either. We kind of have to start somewhere too. >>> >> Yes, the fix is too simple, and just a start, I think we'd better wait >> more useful parts done before merging it. > I agree, just remark VALID_FL flag to fix this field is too simple, we should > delay this field fix before > I have a flawless solution, I will remove these lines code in the first > version patches. In the future submits, > I also hope your guys to help review the code carefully, shout out your > comments when you doubt somewhere. Sure. > > > >>> >>>> Second, the repair way is wrong. In >>>> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >>>> match the ones in memory, the ones in memory are used to update the disk >>>> fields. The question is how do you know these field in memory are >>>> right(they may be the real corrupted ones)? >>> >>> Your second point (and the last part of your 1st point) makes a good >>> argument for why this shouldn't happen automatically. Some of these >>> corruptions might require a human to look at the log and decide what to do. >>> Especially as you point out, where we might not know where the source of the >>> corruption is. 
And if the human can't figure it out, then it's probably time >>> to unmount and fsck. >> The point is that the fix way is wrong, just flush memory info to disk >> is not right. I agree online fsck is good feature, but need carefully >> design, it should not involve more corruptions. A rough idea from mine >> is that maybe we need some "frezee" mechanism in fs, which can hung all >> fs op and let fs stop at a safe area. After freeze fs, we can do some >> fsck work on it and these works should not cost lots time. What's your idea? > If we need to touch some global data structures, freezing fs can be > considered when we can't > get any way in case using the locks. > If we only handle some independent problem, we just need to lock the related > data structures. Hmm, I am not sure whether it's hard to decide an independent issue. Thanks, Junxiao. > >> >> Thanks, >> Junxiao. >> >>> >>> Thanks, >>> --Mark >>> >>> -- >>> Mark Fasheh >>> > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:47 PM, Gang He wrote: > > > >> On 11/03/2015 04:15 PM, Gang He wrote: >>> Hello Junxiao, >>> >>> See my comments inline. >>> >>> >> Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. >>> This online file check/fix feature is used to check/fix some light file >>> meta >> block corruption, instead of turning a file system off and using fsck.ocfs2. >> What's light meta block corruption? Do you have a case about it? >>> e.g. meta ecc error, we really need not to use fsck.ocfs2. >>> of course, this feature does not replace fsck.ocfs2 and touch some >> complicated meta block problems, if there is some potential problem in some >> areas, we can discuss them one by one. >>> >>> >>> Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? >>> Here, if the inode block was corrupted, the file system is not able to load >> it into the memory. >> How do you know inode block corrupted? If bh for inode block is >> overwritten, i mean bh corrupted, the repair will corrupted a good inode >> block. > You know, the meta block is only validated when the file system loads the > block from disk to memory. > If the inode object is in the memory, we consider this inode block is OK. This assuming is not true as there are always bugs. Bugs can make inode object in memory bad and corrupted the fs when repair the inode. Thanks, Junxiao. 
> If the inode is not loaded by the file system via the normal way, the file > system will print a kernel error log to tell which ino is corrupted. > we will use ocfs2_filecheck_repair_inode_block() function to fix the inode > block before loading. > > Thanks > Gang > >> >> Thanks, >> Junxiao. >> >>> ocfs2_filecheck_repair_inode_block() will able to load it into the memory, >> since it try to fix these light-level problem before loading. >>> if the fix is OK, the changed meta-block can pass the block-validate >>> function >> and load into the memory as a inode object. >>> Since the file system is under a cluster environment, we have to use some >> existing function and code path to keep these block operation under a >> cluster >> lock. >>> >>> >>> Thanks >>> Gang >>> Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + 
mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if (le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > +
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:15 PM, Gang He wrote: > Hello Junxiao, > > See my comments inline. > > >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > This online file check/fix feature is used to check/fix some light file meta > block corruption, instead of turning a file system off and using fsck.ocfs2. What's light meta block corruption? Do you have a case about it? > e.g. meta ecc error, we really need not to use fsck.ocfs2. > of course, this feature does not replace fsck.ocfs2 and touch some > complicated meta block problems, if there is some potential problem in some > areas, we can discuss them one by one. > > > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > Here, if the inode block was corrupted, the file system is not able to load > it into the memory. How do you know inode block corrupted? If bh for inode block is overwritten, i mean bh corrupted, the repair will corrupted a good inode block. Thanks, Junxiao. > ocfs2_filecheck_repair_inode_block() will able to load it into the memory, > since it try to fix these light-level problem before loading. > if the fix is OK, the changed meta-block can pass the block-validate function > and load into the memory as a inode object. > Since the file system is under a cluster environment, we have to use some > existing function and code path to keep these block operation under a cluster > lock. > > > Thanks > Gang > >> >> Thanks, >> Junxiao. 
>> On 10/28/2015 02:26 PM, Gang He wrote: >>> +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, >>> + struct buffer_head *bh) >>> +{ >>> + int rc; >>> + int changed = 0; >>> + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; >>> + >>> + rc = ocfs2_filecheck_validate_inode_block(sb, bh); >>> + /* Can't fix invalid inode block */ >>> + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) >>> + return rc; >>> + >>> + trace_ocfs2_filecheck_repair_inode_block( >>> + (unsigned long long)bh->b_blocknr); >>> + >>> + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || >>> + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { >>> + mlog(ML_ERROR, >>> + "Filecheck: try to repair dinode #%llu on readonly >>> filesystem\n", >>> + (unsigned long long)bh->b_blocknr); >>> + return -OCFS2_FILECHECK_ERR_READONLY; >>> + } >>> + >>> + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { >>> + di->i_blkno = cpu_to_le64(bh->b_blocknr); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", >>> + (unsigned long long)bh->b_blocknr, >>> + (unsigned long long)le64_to_cpu(di->i_blkno)); >>> + } >>> + >>> + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { >>> + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is >>> set\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + if (le32_to_cpu(di->i_fs_generation) != >>> + OCFS2_SB(sb)->fs_generation) { >>> + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: fs_generation to %u\n", >>> + (unsigned long long)bh->b_blocknr, >>> + le32_to_cpu(di->i_fs_generation)); >>> + } >>> + >>> + if (changed || >>> + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { >>> + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); >>> + mark_buffer_dirty(bh); >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: compute meta 
ecc\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + return 0; >>> +} > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/03/2015 03:54 PM, Gang He wrote: > Hi Junxiao, > > Thank for your reviewing. > Current design, we use a sysfile as a interface to check/fix a file (via pass > a ino number). > But, this operation is manually triggered by user, instead of automatically > fix in the kernel. > Why? > 1) we should let users make this decision, since some users do not want to > fix when encountering a file system corruption, maybe they want to keep the > file system unchanged for a further investigation. If user don't want this, they should not use error=continue option, let fs go after a corruption is very dangerous. > 2) frankly speaking, this feature will probably bring a second corruption if > there is some error in the code, I do not suggest to use automatically fix by > default in the first version. I think if this feature could bring more corruption, then this should be fixed first. Thanks, Junxiao > 3) in the future, if this feature is well proved, we can add a mount option > to make this automatically fix enabled. > > > Thanks > Gang > > > >> Hi Gang, >> >> I didn't see a need to add a sysfs file for the check and repair. This >> leaves a hard problem for customer to decide. How they decide whether >> they should repair the bad inode since this may cause corruption even >> harder? >> I think the error should be fixed by this feature automaticlly if repair >> helps, of course this can be done only when error=continue is enabled or >> add some mount option for it. >> >> Thanks, >> Junxiao. >> >> On 10/28/2015 02:25 PM, Gang He wrote: >>> Implement online file check sysfile interfaces, e.g. >>> how to create the related sysfile according to device name, >>> how to display/handle file check request from the sysfile. 
>>> >>> Signed-off-by: Gang He >>> --- >>> fs/ocfs2/Makefile| 3 +- >>> fs/ocfs2/filecheck.c | 566 >> +++ >>> fs/ocfs2/filecheck.h | 48 + >>> fs/ocfs2/inode.h | 3 + >>> 4 files changed, 619 insertions(+), 1 deletion(-) >>> create mode 100644 fs/ocfs2/filecheck.c >>> create mode 100644 fs/ocfs2/filecheck.h >>> >>> diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile >>> index ce210d4..e27e652 100644 >>> --- a/fs/ocfs2/Makefile >>> +++ b/fs/ocfs2/Makefile >>> @@ -41,7 +41,8 @@ ocfs2-objs := \ >>> quota_local.o \ >>> quota_global.o \ >>> xattr.o \ >>> - acl.o >>> + acl.o \ >>> + filecheck.o >>> >>> ocfs2_stackglue-objs := stackglue.o >>> ocfs2_stack_o2cb-objs := stack_o2cb.o >>> diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c >>> new file mode 100644 >>> index 000..f12ed1f >>> --- /dev/null >>> +++ b/fs/ocfs2/filecheck.c >>> @@ -0,0 +1,566 @@ >>> +/* -*- mode: c; c-basic-offset: 8; -*- >>> + * vim: noexpandtab sw=8 ts=8 sts=0: >>> + * >>> + * filecheck.c >>> + * >>> + * Code which implements online file check. >>> + * >>> + * Copyright (C) 2015 Novell. All rights reserved. >>> + * >>> + * This program is free software; you can redistribute it and/or >>> + * modify it under the terms of the GNU General Public >>> + * License as published by the Free Software Foundation, version 2. >>> + * >>> + * This program is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + * General Public License for more details. >>> + */ >>> + >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> + >>> +#include "ocfs2.h" >>> +#include "ocfs2_fs.h" >>> +#include "stackglue.h" >>> +#include "inode.h" >>> + >>> +#include "filecheck.h" >>> + >>> + >>> +/* File check error strings, >>> + * must correspond with error number in header file. 
>>> + */ >>> +static const char * const ocfs2_filecheck_errs[] = { >>> + "SUCCESS", >>> + "FAILED", >>> + "INPROGRESS", >>> + "READONLY", >>> + "INVALIDINO", >>> + "BLOCKECC", >>> + "BLOCKNO", >>> + "VALIDFLAG", >>> + "GENERATION", >>> + "UNSUPPORTED" >>> +}; >>> + >>> +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); >>> +static LIST_HEAD(ocfs2_filecheck_sysfs_list); >>> + >>> +struct ocfs2_filecheck { >>> + struct list_head fc_head; /* File check entry list head */ >>> + spinlock_t fc_lock; >>> + unsigned int fc_max;/* Maximum number of entry in list */ >>> + unsigned int fc_size; /* Current entry count in list */ >>> + unsigned int fc_done; /* File check entries are done in list */ >>> +}; >>> + >>> +struct ocfs2_filecheck_sysfs_entry { >>> + struct list_head fs_list; >>> + atomic_t fs_count; >>> + struct super_block *fs_sb; >>> + struct kset *fs_kset; >>> + struct ocfs2_filecheck *fs_fcheck; >>> +}; >>> + >>> +#define OCFS2_FILECHECK_MAXSIZE100 >>> +#define OCFS2_FILECHECK_MINSIZE10 >>> + >>> +/* File
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:47 PM, Gang He wrote: > > > >> On 11/03/2015 04:15 PM, Gang He wrote: >>> Hello Junxiao, >>> >>> See my comments inline. >>> >>> >> Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. >>> This online file check/fix feature is used to check/fix some light file >>> meta >> block corruption, instead of turning a file system off and using fsck.ocfs2. >> What's light meta block corruption? Do you have a case about it? >>> e.g. meta ecc error, we really need not to use fsck.ocfs2. >>> of course, this feature does not replace fsck.ocfs2 and touch some >> complicated meta block problems, if there is some potential problem in some >> areas, we can discuss them one by one. >>> >>> >>> Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? >>> Here, if the inode block was corrupted, the file system is not able to load >> it into the memory. >> How do you know inode block corrupted? If bh for inode block is >> overwritten, i mean bh corrupted, the repair will corrupted a good inode >> block. > You know, the meta block is only validated when the file system loads the > block from disk to memory. > If the inode object is in the memory, we consider this inode block is OK. This assuming is not true as there are always bugs. Bugs can make inode object in memory bad and corrupted the fs when repair the inode. Thanks, Junxiao. 
> If the inode is not loaded by the file system via the normal way, the file > system will print a kernel error log to tell which ino is corrupted. > we will use ocfs2_filecheck_repair_inode_block() function to fix the inode > block before loading. > > Thanks > Gang > >> >> Thanks, >> Junxiao. >> >>> ocfs2_filecheck_repair_inode_block() will able to load it into the memory, >> since it try to fix these light-level problem before loading. >>> if the fix is OK, the changed meta-block can pass the block-validate >>> function >> and load into the memory as a inode object. >>> Since the file system is under a cluster environment, we have to use some >> existing function and code path to keep these block operation under a >> cluster >> lock. >>> >>> >>> Thanks >>> Gang >>> Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + 
mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if (le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > +
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, On 11/03/2015 03:54 PM, Gang He wrote: > Hi Junxiao, > > Thank for your reviewing. > Current design, we use a sysfile as a interface to check/fix a file (via pass > a ino number). > But, this operation is manually triggered by user, instead of automatically > fix in the kernel. > Why? > 1) we should let users make this decision, since some users do not want to > fix when encountering a file system corruption, maybe they want to keep the > file system unchanged for a further investigation. If user don't want this, they should not use error=continue option, let fs go after a corruption is very dangerous. > 2) frankly speaking, this feature will probably bring a second corruption if > there is some error in the code, I do not suggest to use automatically fix by > default in the first version. I think if this feature could bring more corruption, then this should be fixed first. Thanks, Junxiao > 3) in the future, if this feature is well proved, we can add a mount option > to make this automatically fix enabled. > > > Thanks > Gang > > > >> Hi Gang, >> >> I didn't see a need to add a sysfs file for the check and repair. This >> leaves a hard problem for customer to decide. How they decide whether >> they should repair the bad inode since this may cause corruption even >> harder? >> I think the error should be fixed by this feature automaticlly if repair >> helps, of course this can be done only when error=continue is enabled or >> add some mount option for it. >> >> Thanks, >> Junxiao. >> >> On 10/28/2015 02:25 PM, Gang He wrote: >>> Implement online file check sysfile interfaces, e.g. >>> how to create the related sysfile according to device name, >>> how to display/handle file check request from the sysfile. 
>>> >>> Signed-off-by: Gang He>>> --- >>> fs/ocfs2/Makefile| 3 +- >>> fs/ocfs2/filecheck.c | 566 >> +++ >>> fs/ocfs2/filecheck.h | 48 + >>> fs/ocfs2/inode.h | 3 + >>> 4 files changed, 619 insertions(+), 1 deletion(-) >>> create mode 100644 fs/ocfs2/filecheck.c >>> create mode 100644 fs/ocfs2/filecheck.h >>> >>> diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile >>> index ce210d4..e27e652 100644 >>> --- a/fs/ocfs2/Makefile >>> +++ b/fs/ocfs2/Makefile >>> @@ -41,7 +41,8 @@ ocfs2-objs := \ >>> quota_local.o \ >>> quota_global.o \ >>> xattr.o \ >>> - acl.o >>> + acl.o \ >>> + filecheck.o >>> >>> ocfs2_stackglue-objs := stackglue.o >>> ocfs2_stack_o2cb-objs := stack_o2cb.o >>> diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c >>> new file mode 100644 >>> index 000..f12ed1f >>> --- /dev/null >>> +++ b/fs/ocfs2/filecheck.c >>> @@ -0,0 +1,566 @@ >>> +/* -*- mode: c; c-basic-offset: 8; -*- >>> + * vim: noexpandtab sw=8 ts=8 sts=0: >>> + * >>> + * filecheck.c >>> + * >>> + * Code which implements online file check. >>> + * >>> + * Copyright (C) 2015 Novell. All rights reserved. >>> + * >>> + * This program is free software; you can redistribute it and/or >>> + * modify it under the terms of the GNU General Public >>> + * License as published by the Free Software Foundation, version 2. >>> + * >>> + * This program is distributed in the hope that it will be useful, >>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >>> + * General Public License for more details. >>> + */ >>> + >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> +#include >>> + >>> +#include "ocfs2.h" >>> +#include "ocfs2_fs.h" >>> +#include "stackglue.h" >>> +#include "inode.h" >>> + >>> +#include "filecheck.h" >>> + >>> + >>> +/* File check error strings, >>> + * must correspond with error number in header file. 
>>> + */ >>> +static const char * const ocfs2_filecheck_errs[] = { >>> + "SUCCESS", >>> + "FAILED", >>> + "INPROGRESS", >>> + "READONLY", >>> + "INVALIDINO", >>> + "BLOCKECC", >>> + "BLOCKNO", >>> + "VALIDFLAG", >>> + "GENERATION", >>> + "UNSUPPORTED" >>> +}; >>> + >>> +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); >>> +static LIST_HEAD(ocfs2_filecheck_sysfs_list); >>> + >>> +struct ocfs2_filecheck { >>> + struct list_head fc_head; /* File check entry list head */ >>> + spinlock_t fc_lock; >>> + unsigned int fc_max;/* Maximum number of entry in list */ >>> + unsigned int fc_size; /* Current entry count in list */ >>> + unsigned int fc_done; /* File check entries are done in list */ >>> +}; >>> + >>> +struct ocfs2_filecheck_sysfs_entry { >>> + struct list_head fs_list; >>> + atomic_t fs_count; >>> + struct super_block *fs_sb; >>> + struct kset *fs_kset; >>> + struct ocfs2_filecheck *fs_fcheck; >>> +}; >>> + >>> +#define OCFS2_FILECHECK_MAXSIZE100 >>> +#define OCFS2_FILECHECK_MINSIZE10 >>>
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
On 11/03/2015 04:15 PM, Gang He wrote: > Hello Junxiao, > > See my comments inline. > > >> Hi Gang, >> >> This is not like a right patch. >> First, online file check only checks inode's block number, valid flag, >> fs generation value, and meta ecc. I never see a real corruption >> happened only on this field, if these fields are corrupted, that means >> something bad may happen on other place. So fix this field may not help >> and even cause corruption more hard. > This online file check/fix feature is used to check/fix some light file meta > block corruption, instead of turning a file system off and using fsck.ocfs2. What's light meta block corruption? Do you have a case about it? > e.g. meta ecc error, we really need not to use fsck.ocfs2. > of course, this feature does not replace fsck.ocfs2 and touch some > complicated meta block problems, if there is some potential problem in some > areas, we can discuss them one by one. > > > >> Second, the repair way is wrong. In >> ocfs2_filecheck_repair_inode_block(), if these fields in disk don't >> match the ones in memory, the ones in memory are used to update the disk >> fields. The question is how do you know these field in memory are >> right(they may be the real corrupted ones)? > Here, if the inode block was corrupted, the file system is not able to load > it into the memory. How do you know inode block corrupted? If bh for inode block is overwritten, i mean bh corrupted, the repair will corrupted a good inode block. Thanks, Junxiao. > ocfs2_filecheck_repair_inode_block() will able to load it into the memory, > since it try to fix these light-level problem before loading. > if the fix is OK, the changed meta-block can pass the block-validate function > and load into the memory as a inode object. > Since the file system is under a cluster environment, we have to use some > existing function and code path to keep these block operation under a cluster > lock. > > > Thanks > Gang > >> >> Thanks, >> Junxiao. 
>> On 10/28/2015 02:26 PM, Gang He wrote: >>> +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, >>> + struct buffer_head *bh) >>> +{ >>> + int rc; >>> + int changed = 0; >>> + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; >>> + >>> + rc = ocfs2_filecheck_validate_inode_block(sb, bh); >>> + /* Can't fix invalid inode block */ >>> + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) >>> + return rc; >>> + >>> + trace_ocfs2_filecheck_repair_inode_block( >>> + (unsigned long long)bh->b_blocknr); >>> + >>> + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || >>> + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { >>> + mlog(ML_ERROR, >>> + "Filecheck: try to repair dinode #%llu on readonly >>> filesystem\n", >>> + (unsigned long long)bh->b_blocknr); >>> + return -OCFS2_FILECHECK_ERR_READONLY; >>> + } >>> + >>> + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { >>> + di->i_blkno = cpu_to_le64(bh->b_blocknr); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", >>> + (unsigned long long)bh->b_blocknr, >>> + (unsigned long long)le64_to_cpu(di->i_blkno)); >>> + } >>> + >>> + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { >>> + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is >>> set\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + if (le32_to_cpu(di->i_fs_generation) != >>> + OCFS2_SB(sb)->fs_generation) { >>> + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); >>> + changed = 1; >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: fs_generation to %u\n", >>> + (unsigned long long)bh->b_blocknr, >>> + le32_to_cpu(di->i_fs_generation)); >>> + } >>> + >>> + if (changed || >>> + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { >>> + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); >>> + mark_buffer_dirty(bh); >>> + mlog(ML_ERROR, >>> + "Filecheck: reset dinode #%llu: compute meta 
ecc\n", >>> + (unsigned long long)bh->b_blocknr); >>> + } >>> + >>> + return 0; >>> +} > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, I didn't see a need to add a sysfs file for the check and repair. This leaves a hard problem for customer to decide. How they decide whether they should repair the bad inode since this may cause corruption even harder? I think the error should be fixed by this feature automaticlly if repair helps, of course this can be done only when error=continue is enabled or add some mount option for it. Thanks, Junxiao. On 10/28/2015 02:25 PM, Gang He wrote: > Implement online file check sysfile interfaces, e.g. > how to create the related sysfile according to device name, > how to display/handle file check request from the sysfile. > > Signed-off-by: Gang He > --- > fs/ocfs2/Makefile| 3 +- > fs/ocfs2/filecheck.c | 566 > +++ > fs/ocfs2/filecheck.h | 48 + > fs/ocfs2/inode.h | 3 + > 4 files changed, 619 insertions(+), 1 deletion(-) > create mode 100644 fs/ocfs2/filecheck.c > create mode 100644 fs/ocfs2/filecheck.h > > diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile > index ce210d4..e27e652 100644 > --- a/fs/ocfs2/Makefile > +++ b/fs/ocfs2/Makefile > @@ -41,7 +41,8 @@ ocfs2-objs := \ > quota_local.o \ > quota_global.o \ > xattr.o \ > - acl.o > + acl.o \ > + filecheck.o > > ocfs2_stackglue-objs := stackglue.o > ocfs2_stack_o2cb-objs := stack_o2cb.o > diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c > new file mode 100644 > index 000..f12ed1f > --- /dev/null > +++ b/fs/ocfs2/filecheck.c > @@ -0,0 +1,566 @@ > +/* -*- mode: c; c-basic-offset: 8; -*- > + * vim: noexpandtab sw=8 ts=8 sts=0: > + * > + * filecheck.c > + * > + * Code which implements online file check. > + * > + * Copyright (C) 2015 Novell. All rights reserved. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License as published by the Free Software Foundation, version 2. 
> + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "ocfs2.h" > +#include "ocfs2_fs.h" > +#include "stackglue.h" > +#include "inode.h" > + > +#include "filecheck.h" > + > + > +/* File check error strings, > + * must correspond with error number in header file. > + */ > +static const char * const ocfs2_filecheck_errs[] = { > + "SUCCESS", > + "FAILED", > + "INPROGRESS", > + "READONLY", > + "INVALIDINO", > + "BLOCKECC", > + "BLOCKNO", > + "VALIDFLAG", > + "GENERATION", > + "UNSUPPORTED" > +}; > + > +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); > +static LIST_HEAD(ocfs2_filecheck_sysfs_list); > + > +struct ocfs2_filecheck { > + struct list_head fc_head; /* File check entry list head */ > + spinlock_t fc_lock; > + unsigned int fc_max;/* Maximum number of entry in list */ > + unsigned int fc_size; /* Current entry count in list */ > + unsigned int fc_done; /* File check entries are done in list */ > +}; > + > +struct ocfs2_filecheck_sysfs_entry { > + struct list_head fs_list; > + atomic_t fs_count; > + struct super_block *fs_sb; > + struct kset *fs_kset; > + struct ocfs2_filecheck *fs_fcheck; > +}; > + > +#define OCFS2_FILECHECK_MAXSIZE 100 > +#define OCFS2_FILECHECK_MINSIZE 10 > + > +/* File check operation type */ > +enum { > + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file */ > + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file */ > + OCFS2_FILECHECK_TYPE_SET = 100 /* Set file check options */ > +}; > + > +struct ocfs2_filecheck_entry { > + struct list_head fe_list; > + unsigned long fe_ino; > + unsigned int fe_type; > + unsigned short fe_done:1; > + unsigned short fe_status:15; > +}; > + > +struct ocfs2_filecheck_args 
{ > + unsigned int fa_type; > + union { > + unsigned long fa_ino; > + unsigned int fa_len; > + }; > +}; > + > +static const char * > +ocfs2_filecheck_error(int errno) > +{ > + if (!errno) > + return ocfs2_filecheck_errs[errno]; > + > + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || > + errno > OCFS2_FILECHECK_ERR_END); > + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; > +} > + > +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf); > +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, > + struct kobj_attribute *attr, > +
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if 
(le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { > + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); > + mark_buffer_dirty(bh); > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: compute meta ecc\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + return 0; > +} -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 4/4] ocfs2: check/fix inode block for online file check
Hi Gang, This is not like a right patch. First, online file check only checks inode's block number, valid flag, fs generation value, and meta ecc. I never see a real corruption happened only on this field, if these fields are corrupted, that means something bad may happen on other place. So fix this field may not help and even cause corruption more hard. Second, the repair way is wrong. In ocfs2_filecheck_repair_inode_block(), if these fields in disk don't match the ones in memory, the ones in memory are used to update the disk fields. The question is how do you know these field in memory are right(they may be the real corrupted ones)? Thanks, Junxiao. On 10/28/2015 02:26 PM, Gang He wrote: > +static int ocfs2_filecheck_repair_inode_block(struct super_block *sb, > +struct buffer_head *bh) > +{ > + int rc; > + int changed = 0; > + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; > + > + rc = ocfs2_filecheck_validate_inode_block(sb, bh); > + /* Can't fix invalid inode block */ > + if (!rc || rc == -OCFS2_FILECHECK_ERR_INVALIDINO) > + return rc; > + > + trace_ocfs2_filecheck_repair_inode_block( > + (unsigned long long)bh->b_blocknr); > + > + if (ocfs2_is_hard_readonly(OCFS2_SB(sb)) || > + ocfs2_is_soft_readonly(OCFS2_SB(sb))) { > + mlog(ML_ERROR, > + "Filecheck: try to repair dinode #%llu on readonly > filesystem\n", > + (unsigned long long)bh->b_blocknr); > + return -OCFS2_FILECHECK_ERR_READONLY; > + } > + > + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { > + di->i_blkno = cpu_to_le64(bh->b_blocknr); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: i_blkno to %llu\n", > + (unsigned long long)bh->b_blocknr, > + (unsigned long long)le64_to_cpu(di->i_blkno)); > + } > + > + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { > + di->i_flags |= cpu_to_le32(OCFS2_VALID_FL); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: OCFS2_VALID_FL is > set\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + if 
(le32_to_cpu(di->i_fs_generation) != > + OCFS2_SB(sb)->fs_generation) { > + di->i_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation); > + changed = 1; > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: fs_generation to %u\n", > + (unsigned long long)bh->b_blocknr, > + le32_to_cpu(di->i_fs_generation)); > + } > + > + if (changed || > + ocfs2_validate_meta_ecc(sb, bh->b_data, >i_check)) { > + ocfs2_compute_meta_ecc(sb, bh->b_data, >i_check); > + mark_buffer_dirty(bh); > + mlog(ML_ERROR, > + "Filecheck: reset dinode #%llu: compute meta ecc\n", > + (unsigned long long)bh->b_blocknr); > + } > + > + return 0; > +} -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH v2 2/4] ocfs2: sysfile interfaces for online file check
Hi Gang, I didn't see a need to add a sysfs file for the check and repair. This leaves a hard problem for customer to decide. How they decide whether they should repair the bad inode since this may cause corruption even harder? I think the error should be fixed by this feature automaticlly if repair helps, of course this can be done only when error=continue is enabled or add some mount option for it. Thanks, Junxiao. On 10/28/2015 02:25 PM, Gang He wrote: > Implement online file check sysfile interfaces, e.g. > how to create the related sysfile according to device name, > how to display/handle file check request from the sysfile. > > Signed-off-by: Gang He> --- > fs/ocfs2/Makefile| 3 +- > fs/ocfs2/filecheck.c | 566 > +++ > fs/ocfs2/filecheck.h | 48 + > fs/ocfs2/inode.h | 3 + > 4 files changed, 619 insertions(+), 1 deletion(-) > create mode 100644 fs/ocfs2/filecheck.c > create mode 100644 fs/ocfs2/filecheck.h > > diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile > index ce210d4..e27e652 100644 > --- a/fs/ocfs2/Makefile > +++ b/fs/ocfs2/Makefile > @@ -41,7 +41,8 @@ ocfs2-objs := \ > quota_local.o \ > quota_global.o \ > xattr.o \ > - acl.o > + acl.o \ > + filecheck.o > > ocfs2_stackglue-objs := stackglue.o > ocfs2_stack_o2cb-objs := stack_o2cb.o > diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c > new file mode 100644 > index 000..f12ed1f > --- /dev/null > +++ b/fs/ocfs2/filecheck.c > @@ -0,0 +1,566 @@ > +/* -*- mode: c; c-basic-offset: 8; -*- > + * vim: noexpandtab sw=8 ts=8 sts=0: > + * > + * filecheck.c > + * > + * Code which implements online file check. > + * > + * Copyright (C) 2015 Novell. All rights reserved. > + * > + * This program is free software; you can redistribute it and/or > + * modify it under the terms of the GNU General Public > + * License as published by the Free Software Foundation, version 2. 
> + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + */ > + > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > +#include > + > +#include "ocfs2.h" > +#include "ocfs2_fs.h" > +#include "stackglue.h" > +#include "inode.h" > + > +#include "filecheck.h" > + > + > +/* File check error strings, > + * must correspond with error number in header file. > + */ > +static const char * const ocfs2_filecheck_errs[] = { > + "SUCCESS", > + "FAILED", > + "INPROGRESS", > + "READONLY", > + "INVALIDINO", > + "BLOCKECC", > + "BLOCKNO", > + "VALIDFLAG", > + "GENERATION", > + "UNSUPPORTED" > +}; > + > +static DEFINE_SPINLOCK(ocfs2_filecheck_sysfs_lock); > +static LIST_HEAD(ocfs2_filecheck_sysfs_list); > + > +struct ocfs2_filecheck { > + struct list_head fc_head; /* File check entry list head */ > + spinlock_t fc_lock; > + unsigned int fc_max;/* Maximum number of entry in list */ > + unsigned int fc_size; /* Current entry count in list */ > + unsigned int fc_done; /* File check entries are done in list */ > +}; > + > +struct ocfs2_filecheck_sysfs_entry { > + struct list_head fs_list; > + atomic_t fs_count; > + struct super_block *fs_sb; > + struct kset *fs_kset; > + struct ocfs2_filecheck *fs_fcheck; > +}; > + > +#define OCFS2_FILECHECK_MAXSIZE 100 > +#define OCFS2_FILECHECK_MINSIZE 10 > + > +/* File check operation type */ > +enum { > + OCFS2_FILECHECK_TYPE_CHK = 0, /* Check a file */ > + OCFS2_FILECHECK_TYPE_FIX, /* Fix a file */ > + OCFS2_FILECHECK_TYPE_SET = 100 /* Set file check options */ > +}; > + > +struct ocfs2_filecheck_entry { > + struct list_head fe_list; > + unsigned long fe_ino; > + unsigned int fe_type; > + unsigned short fe_done:1; > + unsigned short fe_status:15; > +}; > + > +struct ocfs2_filecheck_args 
{ > + unsigned int fa_type; > + union { > + unsigned long fa_ino; > + unsigned int fa_len; > + }; > +}; > + > +static const char * > +ocfs2_filecheck_error(int errno) > +{ > + if (!errno) > + return ocfs2_filecheck_errs[errno]; > + > + BUG_ON(errno < OCFS2_FILECHECK_ERR_START || > + errno > OCFS2_FILECHECK_ERR_END); > + return ocfs2_filecheck_errs[errno - OCFS2_FILECHECK_ERR_START + 1]; > +} > + > +static ssize_t ocfs2_filecheck_show(struct kobject *kobj, > + struct kobj_attribute *attr, > + char *buf); > +static ssize_t ocfs2_filecheck_store(struct kobject *kobj, > + struct kobj_attribute *attr, > +
[PATCH v2] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O during memory allocation") introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. And this will make the kernel run into the deadlock case described in that commit. See Dave Chinner's comment about io in superblock shrinker: Filesystem shrinkers do indeed perform IO from the superblock shrinker and have for years. Even clean inodes can require IO before they can be freed - e.g. on an orphan list, need truncation of post-eof blocks, need to wait for ordered operations to complete before it can be freed, etc. IOWs, Ext4, btrfs and XFS all can issue and/or block on arbitrary amounts of IO in the superblock shrinker context. XFS, in particular, has been doing transactions and IO from the VFS inode cache shrinker since it was first introduced Fix this by clearing __GFP_FS in memalloc_noio_flags(), this function has masked all the gfp_mask that will be passed into fs for the processes setting PF_MEMALLOC_NOIO in the direct reclaim path. v1 thread at: https://lkml.org/lkml/2014/9/3/32 v2 changes: patch log update to make the issue more clear. 
Signed-off-by: Junxiao Bi Cc: Dave Chinner Cc: joyce.xue Cc: Ming Lei Cc: --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~__GFP_IO; + flags &= ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH v2] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. And this will make the kernel run into the deadlock case described in that commit. See Dave Chinner's comment about io in superblock shrinker: Filesystem shrinkers do indeed perform IO from the superblock shrinker and have for years. Even clean inodes can require IO before they can be freed - e.g. on an orphan list, need truncation of post-eof blocks, need to wait for ordered operations to complete before it can be freed, etc. IOWs, Ext4, btrfs and XFS all can issue and/or block on arbitrary amounts of IO in the superblock shrinker context. XFS, in particular, has been doing transactions and IO from the VFS inode cache shrinker since it was first introduced Fix this by clearing __GFP_FS in memalloc_noio_flags(), this function has masked all the gfp_mask that will be passed into fs for the processes setting PF_MEMALLOC_NOIO in the direct reclaim path. v1 thread at: https://lkml.org/lkml/2014/9/3/32 v2 changes: patch log update to make the issue more clear. 
Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: Dave Chinner da...@fromorbit.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com Cc: sta...@vger.kernel.org --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/05/2014 10:32 AM, Junxiao Bi wrote: > On 09/04/2014 05:23 PM, Dave Chinner wrote: >> On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: >>> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >>> during memory allocation") >>> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >>> allocation, __GFP_IO is cleared >>> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >>> cleared. Or it may still >>> run into I/O, like in superblock shrinker. >>> >>> Signed-off-by: Junxiao Bi >>> Cc: joyce.xue >>> Cc: Ming Lei >>> --- >>> include/linux/sched.h |6 -- >>> 1 file changed, 4 insertions(+), 2 deletions(-) >>> >>> diff --git a/include/linux/sched.h b/include/linux/sched.h >>> index 5c2c885..2fb2c47 100644 >>> --- a/include/linux/sched.h >>> +++ b/include/linux/sched.h >>> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >>> task_struct *p, cputime_t *ut, >>> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >>> #define used_math() tsk_used_math(current) >>> >>> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >>> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >>> + * __GFP_FS is also cleared as it implies __GFP_IO. >>> + */ >>> static inline gfp_t memalloc_noio_flags(gfp_t flags) >>> { >>> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >>> - flags &= ~__GFP_IO; >>> + flags &= ~(__GFP_IO | __GFP_FS); >>> return flags; >>> } >> >> You also need to mask all the shrink_control->gfp_mask >> initialisations in mm/vmscan.c. The current code only masks the page >> reclaim gfp_mask, not those that are passed to the shrinkers. > Yes, there are some shrink_control->gfp_mask not masked in vmscan.c in > the following functions. Beside this, all seemed be masked from direct > reclaim path by memalloc_noio_flags(). 
> > -reclaim_clean_pages_from_list() > used by alloc_contig_range(), this function is invoked in hugetlb and > cma, for hugetlb, it should be safe as only userspace use it. I am not > sure about the cma. > David & Andrew, may you share your idea about whether cma is affected? > Look at CMA, it's used for device which doesn't support scatter/gather dma and mainly used for embedded device like camera, this should not be the case of the block device. So i think this gfp_mask doesn't need be masked. Thanks, Junxiao. > -mem_cgroup_shrink_node_zone() > -try_to_free_mem_cgroup_pages() > These two are used by mem cgroup, as no kernel thread can be assigned > into such cgroup, so i think, no need mask. > > -balance_pgdat() > used by kswapd, no need mask. > > -shrink_all_memory() > used by hibernate, should be safe with GFP_FS/IO. > > Thanks, > Junxiao. >> >> Cheers, >> >> Dave. >> > > -- > To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in > the body of a message to majord...@vger.kernel.org > More majordomo info at http://vger.kernel.org/majordomo-info.html > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 05:23 PM, Dave Chinner wrote: > On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: >> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >> during memory allocation") >> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >> allocation, __GFP_IO is cleared >> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >> cleared. Or it may still >> run into I/O, like in superblock shrinker. >> >> Signed-off-by: Junxiao Bi >> Cc: joyce.xue >> Cc: Ming Lei >> --- >> include/linux/sched.h |6 -- >> 1 file changed, 4 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 5c2c885..2fb2c47 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >> task_struct *p, cputime_t *ut, >> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >> #define used_math() tsk_used_math(current) >> >> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >> + * __GFP_FS is also cleared as it implies __GFP_IO. >> + */ >> static inline gfp_t memalloc_noio_flags(gfp_t flags) >> { >> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >> -flags &= ~__GFP_IO; >> +flags &= ~(__GFP_IO | __GFP_FS); >> return flags; >> } > > You also need to mask all the shrink_control->gfp_mask > initialisations in mm/vmscan.c. The current code only masks the page > reclaim gfp_mask, not those that are passed to the shrinkers. Yes, there are some shrink_control->gfp_mask not masked in vmscan.c in the following functions. Beside this, all seemed be masked from direct reclaim path by memalloc_noio_flags(). -reclaim_clean_pages_from_list() used by alloc_contig_range(), this function is invoked in hugetlb and cma, for hugetlb, it should be safe as only userspace use it. I am not sure about the cma. 
David & Andrew, may you share your idea about whether cma is affected? -mem_cgroup_shrink_node_zone() -try_to_free_mem_cgroup_pages() These two are used by mem cgroup, as no kernel thread can be assigned into such cgroup, so i think, no need mask. -balance_pgdat() used by kswapd, no need mask. -shrink_all_memory() used by hibernate, should be safe with GFP_FS/IO. Thanks, Junxiao. > > Cheers, > > Dave. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 05:23 PM, Dave Chinner wrote: On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) -flags = ~__GFP_IO; +flags = ~(__GFP_IO | __GFP_FS); return flags; } You also need to mask all the shrink_control-gfp_mask initialisations in mm/vmscan.c. The current code only masks the page reclaim gfp_mask, not those that are passed to the shrinkers. Yes, there are some shrink_control-gfp_mask not masked in vmscan.c in the following functions. Beside this, all seemed be masked from direct reclaim path by memalloc_noio_flags(). -reclaim_clean_pages_from_list() used by alloc_contig_range(), this function is invoked in hugetlb and cma, for hugetlb, it should be safe as only userspace use it. I am not sure about the cma. David Andrew, may you share your idea about whether cma is affected? 
-mem_cgroup_shrink_node_zone() -try_to_free_mem_cgroup_pages() These two are used by mem cgroup, as no kernel thread can be assigned into such cgroup, so i think, no need mask. -balance_pgdat() used by kswapd, no need mask. -shrink_all_memory() used by hibernate, should be safe with GFP_FS/IO. Thanks, Junxiao. Cheers, Dave. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/05/2014 10:32 AM, Junxiao Bi wrote: On 09/04/2014 05:23 PM, Dave Chinner wrote: On Wed, Sep 03, 2014 at 01:54:54PM +0800, Junxiao Bi wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } You also need to mask all the shrink_control-gfp_mask initialisations in mm/vmscan.c. The current code only masks the page reclaim gfp_mask, not those that are passed to the shrinkers. Yes, there are some shrink_control-gfp_mask not masked in vmscan.c in the following functions. Beside this, all seemed be masked from direct reclaim path by memalloc_noio_flags(). -reclaim_clean_pages_from_list() used by alloc_contig_range(), this function is invoked in hugetlb and cma, for hugetlb, it should be safe as only userspace use it. I am not sure about the cma. 
David Andrew, may you share your idea about whether cma is affected? Look at CMA, it's used for device which doesn't support scatter/gather dma and mainly used for embedded device like camera, this should not be the case of the block device. So i think this gfp_mask doesn't need be masked. Thanks, Junxiao. -mem_cgroup_shrink_node_zone() -try_to_free_mem_cgroup_pages() These two are used by mem cgroup, as no kernel thread can be assigned into such cgroup, so i think, no need mask. -balance_pgdat() used by kswapd, no need mask. -shrink_all_memory() used by hibernate, should be safe with GFP_FS/IO. Thanks, Junxiao. Cheers, Dave. -- To unsubscribe from this list: send the line unsubscribe linux-fsdevel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 10:30 AM, Andrew Morton wrote: > On Thu, 04 Sep 2014 10:08:09 +0800 Junxiao Bi wrote: > >> On 09/04/2014 07:10 AM, Andrew Morton wrote: >>> On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi wrote: >>> >>>> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >>>> during memory allocation") >>>> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >>>> allocation, __GFP_IO is cleared >>>> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >>>> cleared. Or it may still >>>> run into I/O, like in superblock shrinker. >>> >>> Is there an actual bug which inspired this fix? If so, please describe >>> it. >>> >> Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in >> ocfs2 who is for building tcp connections and processing ocfs2 message. >> Like when an new node is up in ocfs2 cluster, the workqueue will try to >> build the connections to it, since there are some common code in >> networking like sock_alloc() using GFP_KERNEL to allocate memory, direct >> reclaim will be triggered and call into superblock shrinker if available >> memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To >> shrink the inode cache, ocfs2 needs release cluster lock and this >> depends on workqueue to do it, so cause the deadlock. Not sure whether >> there are similar issue for other cluster fs, like nfs, it is possible >> rpciod hung like the ocfs2 workqueue? > > All this info should be in the changelog. > >> >>> I don't think it's accurate to say that __GFP_FS implies __GFP_IO. >>> Where did that info come from? >> __GFP_FS allowed callback into fs during memory allocation, and fs may >> do io whatever __GFP_IO is set? > > __GFP_FS and __GFP_IO are (or were) for communicating to vmscan: don't > enter the fs for writepage, don't write back swapcache. > > I guess those concepts have grown over time without a ton of thought > going into it. 
Yes, I suppose that if a filesystem's writepage is > called (for example) it expects that it will be able to perform > writeback and it won't check (or even be passed) the __GFP_IO setting. > > So I guess we could say that !__GFP_FS && GFP_IO is not implemented and > shouldn't occur. > > That being said, it still seems quite bad to disable VFS cache > shrinking for PF_MEMALLOC_NOIO allocation attempts. Even without this ocfs2 deadlock bug, the implementation of PF_MEMALLOC_NOIO is wrong. See the deadlock case described in its log below. Let's see the case "block device runtime resume", since __GFP_FS is not cleared, it could run into fs writepage and cause deadlock. >From 21caf2fc1931b485483ddd254b634fa8f0099963 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 22 Feb 2013 16:34:08 -0800 Subject: [PATCH] mm: teach mm by current context info to not do I/O during memory allocation This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of 'struct task_struct'), so that the flag can be set by one task to avoid doing I/O inside memory allocation in the task's context. The patch tries to solve one deadlock problem caused by block device, and the problem may happen at least in the below situations: - during block device runtime resume, if memory allocation with GFP_KERNEL is called inside runtime resume callback of any one of its ancestors(or the block device itself), the deadlock may be triggered inside the memory allocation since it might not complete until the block device becomes active and the involved page I/O finishes. The situation is pointed out first by Alan Stern. It is not a good approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because several subsystems may be involved(for example, PCI, USB and SCSI may be involved for usb mass storage device, network devices involved too in the iSCSI case) - during block device runtime suspend, because runtime resume needs to wait for completion of concurrent runtime suspend. 
- during error handling of usb mass storage device, USB bus reset will be put on the device, so there shouldn't be any memory allocation with GFP_KERNEL during USB bus reset, otherwise the deadlock similar with above may be triggered. Unfortunately, any usb device may include one mass storage interface in theory, so it requires all usb interface drivers to handle the situation. In fact, most usb drivers don't know how to handle bus reset on the device and don't provide .pre_reset() and .post_reset() callback at all, so USB core has to unbind and bind driver for these devices. So it is still not practical to resort to GFP_NOIO for solving the problem. Thanks, Junxiao.
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/03/2014 08:20 PM, Trond Myklebust wrote: > On Wed, Sep 3, 2014 at 1:54 AM, Junxiao Bi wrote: >> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >> during memory allocation") >> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >> allocation, __GFP_IO is cleared >> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >> cleared. Or it may still >> run into I/O, like in superblock shrinker. >> >> Signed-off-by: Junxiao Bi >> Cc: joyce.xue >> Cc: Ming Lei >> --- >> include/linux/sched.h |6 -- >> 1 file changed, 4 insertions(+), 2 deletions(-) >> >> diff --git a/include/linux/sched.h b/include/linux/sched.h >> index 5c2c885..2fb2c47 100644 >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >> task_struct *p, cputime_t *ut, >> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >> #define used_math() tsk_used_math(current) >> >> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >> + * __GFP_FS is also cleared as it implies __GFP_IO. >> + */ >> static inline gfp_t memalloc_noio_flags(gfp_t flags) >> { >> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >> - flags &= ~__GFP_IO; >> + flags &= ~(__GFP_IO | __GFP_FS); >> return flags; >> } >> > > Shouldn't this be a stable fix? If it is needed, then it will affect > all kernels that define PF_MEMALLOC_NOIO. Yes, should be. An ocfs2 deadlock bug related to this. Thanks, Junxiao. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 07:10 AM, Andrew Morton wrote: > On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi wrote: > >> commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O >> during memory allocation") >> introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory >> allocation, __GFP_IO is cleared >> when this flag is set, but __GFP_FS implies __GFP_IO, it should also be >> cleared. Or it may still >> run into I/O, like in superblock shrinker. > > Is there an actual bug which inspired this fix? If so, please describe > it. > Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in ocfs2 who is for building tcp connections and processing ocfs2 message. Like when an new node is up in ocfs2 cluster, the workqueue will try to build the connections to it, since there are some common code in networking like sock_alloc() using GFP_KERNEL to allocate memory, direct reclaim will be triggered and call into superblock shrinker if available memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To shrink the inode cache, ocfs2 needs release cluster lock and this depends on workqueue to do it, so cause the deadlock. Not sure whether there are similar issue for other cluster fs, like nfs, it is possible rpciod hung like the ocfs2 workqueue? > I don't think it's accurate to say that __GFP_FS implies __GFP_IO. > Where did that info come from? __GFP_FS allowed callback into fs during memory allocation, and fs may do io whatever __GFP_IO is set? > > And the superblock shrinker is a good example of why this shouldn't be > the case. The main thing that code does is to reclaim clean fs objects > without performing IO. AFAICT the proposed patch will significantly > weaken PF_MEMALLOC_NOIO allocation attempts by needlessly preventing > the kernel from reclaiming such objects? Even fs didn't do io in superblock shrinker, it is possible for a fs process who is not convenient to set GFP_NOFS holding some fs lock and call back fs again? 
PF_MEMALLOC_NOIO is only set for some special processes. I think it won't affect much. Thanks, Junxiao. > >> --- a/include/linux/sched.h >> +++ b/include/linux/sched.h >> @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct >> task_struct *p, cputime_t *ut, >> #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) >> #define used_math() tsk_used_math(current) >> >> -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ >> +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags >> + * __GFP_FS is also cleared as it implies __GFP_IO. >> + */ >> static inline gfp_t memalloc_noio_flags(gfp_t flags) >> { >> if (unlikely(current->flags & PF_MEMALLOC_NOIO)) >> -flags &= ~__GFP_IO; >> +flags &= ~(__GFP_IO | __GFP_FS); >> return flags; >> } > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 07:10 AM, Andrew Morton wrote: On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi junxiao...@oracle.com wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Is there an actual bug which inspired this fix? If so, please describe it. Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in ocfs2 who is for building tcp connections and processing ocfs2 message. Like when an new node is up in ocfs2 cluster, the workqueue will try to build the connections to it, since there are some common code in networking like sock_alloc() using GFP_KERNEL to allocate memory, direct reclaim will be triggered and call into superblock shrinker if available memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To shrink the inode cache, ocfs2 needs release cluster lock and this depends on workqueue to do it, so cause the deadlock. Not sure whether there are similar issue for other cluster fs, like nfs, it is possible rpciod hung like the ocfs2 workqueue? I don't think it's accurate to say that __GFP_FS implies __GFP_IO. Where did that info come from? __GFP_FS allowed callback into fs during memory allocation, and fs may do io whatever __GFP_IO is set? And the superblock shrinker is a good example of why this shouldn't be the case. The main thing that code does is to reclaim clean fs objects without performing IO. AFAICT the proposed patch will significantly weaken PF_MEMALLOC_NOIO allocation attempts by needlessly preventing the kernel from reclaiming such objects? Even fs didn't do io in superblock shrinker, it is possible for a fs process who is not convenient to set GFP_NOFS holding some fs lock and call back fs again? 
PF_MEMALLOC_NOIO is only set for some special processes. I think it won't affect much. Thanks, Junxiao. --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) -flags = ~__GFP_IO; +flags = ~(__GFP_IO | __GFP_FS); return flags; } -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/03/2014 08:20 PM, Trond Myklebust wrote: On Wed, Sep 3, 2014 at 1:54 AM, Junxiao Bi junxiao...@oracle.com wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } Shouldn't this be a stable fix? If it is needed, then it will affect all kernels that define PF_MEMALLOC_NOIO. Yes, should be. An ocfs2 deadlock bug related to this. Thanks, Junxiao. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
On 09/04/2014 10:30 AM, Andrew Morton wrote: On Thu, 04 Sep 2014 10:08:09 +0800 Junxiao Bi junxiao...@oracle.com wrote: On 09/04/2014 07:10 AM, Andrew Morton wrote: On Wed, 3 Sep 2014 13:54:54 +0800 Junxiao Bi junxiao...@oracle.com wrote: commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Is there an actual bug which inspired this fix? If so, please describe it. Yes, an ocfs2 deadlock bug is related to this, there is a workqueue in ocfs2 who is for building tcp connections and processing ocfs2 message. Like when an new node is up in ocfs2 cluster, the workqueue will try to build the connections to it, since there are some common code in networking like sock_alloc() using GFP_KERNEL to allocate memory, direct reclaim will be triggered and call into superblock shrinker if available memory is not enough even set PF_MEMALLOC_NOIO for the workqueue. To shrink the inode cache, ocfs2 needs release cluster lock and this depends on workqueue to do it, so cause the deadlock. Not sure whether there are similar issue for other cluster fs, like nfs, it is possible rpciod hung like the ocfs2 workqueue? All this info should be in the changelog. I don't think it's accurate to say that __GFP_FS implies __GFP_IO. Where did that info come from? __GFP_FS allowed callback into fs during memory allocation, and fs may do io whatever __GFP_IO is set? __GFP_FS and __GFP_IO are (or were) for communicating to vmscan: don't enter the fs for writepage, don't write back swapcache. I guess those concepts have grown over time without a ton of thought going into it. 
Yes, I suppose that if a filesystem's writepage is called (for example) it expects that it will be able to perform writeback and it won't check (or even be passed) the __GFP_IO setting. So I guess we could say that !__GFP_FS GFP_IO is not implemented and shouldn't occur. That being said, it still seems quite bad to disable VFS cache shrinking for PF_MEMALLOC_NOIO allocation attempts. Even without this ocfs2 deadlock bug, the implement of PF_MEMALLOC_NOIO is wrong. See the deadlock case described in its log below. Let see the case block device runtime resume, since __GFP_FS is not cleared, it could run into fs writepage and cause deadlock. From 21caf2fc1931b485483ddd254b634fa8f0099963 Mon Sep 17 00:00:00 2001 From: Ming Lei ming@canonical.com Date: Fri, 22 Feb 2013 16:34:08 -0800 Subject: [PATCH] mm: teach mm by current context info to not do I/O during memory allocation This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of 'struct task_struct'), so that the flag can be set by one task to avoid doing I/O inside memory allocation in the task's context. The patch trys to solve one deadlock problem caused by block device, and the problem may happen at least in the below situations: - during block device runtime resume, if memory allocation with GFP_KERNEL is called inside runtime resume callback of any one of its ancestors(or the block device itself), the deadlock may be triggered inside the memory allocation since it might not complete until the block device becomes active and the involed page I/O finishes. The situation is pointed out first by Alan Stern. It is not a good approach to convert all GFP_KERNEL[1] in the path into GFP_NOIO because several subsystems may be involved(for example, PCI, USB and SCSI may be involved for usb mass stoarage device, network devices involved too in the iSCSI case) - during block device runtime suspend, because runtime resume need to wait for completion of concurrent runtime suspend. 
- during error handling of usb mass storage deivce, USB bus reset will be put on the device, so there shouldn't have any memory allocation with GFP_KERNEL during USB bus reset, otherwise the deadlock similar with above may be triggered. Unfortunately, any usb device may include one mass storage interface in theory, so it requires all usb interface drivers to handle the situation. In fact, most usb drivers don't know how to handle bus reset on the device and don't provide .pre_set() and .post_reset() callback at all, so USB core has to unbind and bind driver for these devices. So it is still not practical to resort to GFP_NOIO for solving the problem. Thanks, Junxiao. And the superblock shrinker is a good example of why this shouldn't be the case. The main thing that code does is to reclaim clean fs objects without performing IO. AFAICT the proposed patch will significantly weaken PF_MEMALLOC_NOIO allocation attempts by needlessly preventing the kernel from reclaiming such objects? Even fs didn't do io in superblock shrinker
[PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O during memory allocation") introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi Cc: joyce.xue Cc: Ming Lei --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)->flags & PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current->flags & PF_MEMALLOC_NOIO)) - flags &= ~__GFP_IO; + flags &= ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
On 09/03/2014 11:10 AM, Dave Chinner wrote: > On Wed, Sep 03, 2014 at 09:38:31AM +0800, Junxiao Bi wrote: >> Hi Jiufei, >> >> On 09/02/2014 05:03 PM, Xue jiufei wrote: >>> Hi, Dave >>> On 2014/9/2 7:51, Dave Chinner wrote: >>>> On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: >>>>> The patch trys to solve one deadlock problem caused by cluster >>>>> fs, like ocfs2. And the problem may happen at least in the below >>>>> situations: >>>>> 1)Receiving a connect message from other nodes, node queues a >>>>> work_struct o2net_listen_work. >>>>> 2)o2net_wq processes this work and calls sock_alloc() to allocate >>>>> memory for a new socket. >>>>> 3)It would do direct memory reclaim when available memory is not >>>>> enough and trigger the inode cleanup. That inode being cleaned up >>>>> is happened to be ocfs2 inode, so call evict()->ocfs2_evict_inode() >>>>> ->ocfs2_drop_lock()->dlmunlock()->o2net_send_message_vec(), >>>>> and wait for the unlock response from master. >>>>> 4)tcp layer received the response, call o2net_data_ready() and >>>>> queue sc_rx_work, waiting o2net_wq to process this work. >>>>> 5)o2net_wq is a single thread workqueue, it process the work one by >>>>> one. Right now it is still doing o2net_listen_work and cannot handle >>>>> sc_rx_work. so we deadlock. >>>>> >>>>> It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). >>>>> So we use PF_FSTRANS to avoid the task reentering filesystem when >>>>> available memory is not enough. >>>>> >>>>> Signed-off-by: joyce.xue >>>> >>>> For the second time: use memalloc_noio_save/memalloc_noio_restore. >>>> And please put a great big comment in the code explaining why you >>>> need to do this special thing with memory reclaim flags. >>>> >>>> Cheers, >>>> >>>> Dave. >>>> >>> Thanks for your reply. But I am afraid that memalloc_noio_save/ >>> memalloc_noio_restore can not solve my problem. 
__GFP_IO is cleared >>> if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory >>> reclaim. However, __GFP_FS is still set that can not avoid pruning >>> dcache and icache in memory allocation, resulting in the deadlock I >>> described. >> >> You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in >> ocfs2 and check it in sb shrinker. > > No changes to the superblock shrinker, please. The flag should > modify the gfp_mask in the struct shrink_control passed to the > shrinker, just like the noio flag is used in the rest of the mm > code. __GFP_FS seemed imply __GFP_IO, can superblock shrinker check !(sc->gfp_mask & __GFP_IO) and stop? Thanks, Junxiao. > > Cheers, > > Dave. > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
Hi Jiufei, On 09/02/2014 05:03 PM, Xue jiufei wrote: > Hi, Dave > On 2014/9/2 7:51, Dave Chinner wrote: >> On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: >>> The patch trys to solve one deadlock problem caused by cluster >>> fs, like ocfs2. And the problem may happen at least in the below >>> situations: >>> 1)Receiving a connect message from other nodes, node queues a >>> work_struct o2net_listen_work. >>> 2)o2net_wq processes this work and calls sock_alloc() to allocate >>> memory for a new socket. >>> 3)It would do direct memory reclaim when available memory is not >>> enough and trigger the inode cleanup. That inode being cleaned up >>> is happened to be ocfs2 inode, so call evict()->ocfs2_evict_inode() >>> ->ocfs2_drop_lock()->dlmunlock()->o2net_send_message_vec(), >>> and wait for the unlock response from master. >>> 4)tcp layer received the response, call o2net_data_ready() and >>> queue sc_rx_work, waiting o2net_wq to process this work. >>> 5)o2net_wq is a single thread workqueue, it process the work one by >>> one. Right now it is still doing o2net_listen_work and cannot handle >>> sc_rx_work. so we deadlock. >>> >>> It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). >>> So we use PF_FSTRANS to avoid the task reentering filesystem when >>> available memory is not enough. >>> >>> Signed-off-by: joyce.xue >> >> For the second time: use memalloc_noio_save/memalloc_noio_restore. >> And please put a great big comment in the code explaining why you >> need to do this special thing with memory reclaim flags. >> >> Cheers, >> >> Dave. >> > Thanks for your reply. But I am afraid that memalloc_noio_save/ > memalloc_noio_restore can not solve my problem. __GFP_IO is cleared > if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory > reclaim. However, __GFP_FS is still set that can not avoid pruning > dcache and icache in memory allocation, resulting in the deadlock I > described. 
You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in ocfs2 and check it in sb shrinker. Thanks, Junxiao. > > Thanks. > XueJiufei > > > -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
Hi Jiufei, On 09/02/2014 05:03 PM, Xue jiufei wrote: Hi, Dave On 2014/9/2 7:51, Dave Chinner wrote: On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: The patch trys to solve one deadlock problem caused by cluster fs, like ocfs2. And the problem may happen at least in the below situations: 1)Receiving a connect message from other nodes, node queues a work_struct o2net_listen_work. 2)o2net_wq processes this work and calls sock_alloc() to allocate memory for a new socket. 3)It would do direct memory reclaim when available memory is not enough and trigger the inode cleanup. That inode being cleaned up is happened to be ocfs2 inode, so call evict()-ocfs2_evict_inode() -ocfs2_drop_lock()-dlmunlock()-o2net_send_message_vec(), and wait for the unlock response from master. 4)tcp layer received the response, call o2net_data_ready() and queue sc_rx_work, waiting o2net_wq to process this work. 5)o2net_wq is a single thread workqueue, it process the work one by one. Right now it is still doing o2net_listen_work and cannot handle sc_rx_work. so we deadlock. It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). So we use PF_FSTRANS to avoid the task reentering filesystem when available memory is not enough. Signed-off-by: joyce.xue xuejiu...@huawei.com For the second time: use memalloc_noio_save/memalloc_noio_restore. And please put a great big comment in the code explaining why you need to do this special thing with memory reclaim flags. Cheers, Dave. Thanks for your reply. But I am afraid that memalloc_noio_save/ memalloc_noio_restore can not solve my problem. __GFP_IO is cleared if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory reclaim. However, __GFP_FS is still set that can not avoid pruning dcache and icache in memory allocation, resulting in the deadlock I described. You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in ocfs2 and check it in sb shrinker. Thanks, Junxiao. Thanks. 
XueJiufei -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] fs/super.c: do not shrink fs slab during direct memory reclaim
On 09/03/2014 11:10 AM, Dave Chinner wrote: On Wed, Sep 03, 2014 at 09:38:31AM +0800, Junxiao Bi wrote: Hi Jiufei, On 09/02/2014 05:03 PM, Xue jiufei wrote: Hi, Dave On 2014/9/2 7:51, Dave Chinner wrote: On Fri, Aug 29, 2014 at 05:57:22PM +0800, Xue jiufei wrote: The patch trys to solve one deadlock problem caused by cluster fs, like ocfs2. And the problem may happen at least in the below situations: 1)Receiving a connect message from other nodes, node queues a work_struct o2net_listen_work. 2)o2net_wq processes this work and calls sock_alloc() to allocate memory for a new socket. 3)It would do direct memory reclaim when available memory is not enough and trigger the inode cleanup. That inode being cleaned up is happened to be ocfs2 inode, so call evict()-ocfs2_evict_inode() -ocfs2_drop_lock()-dlmunlock()-o2net_send_message_vec(), and wait for the unlock response from master. 4)tcp layer received the response, call o2net_data_ready() and queue sc_rx_work, waiting o2net_wq to process this work. 5)o2net_wq is a single thread workqueue, it process the work one by one. Right now it is still doing o2net_listen_work and cannot handle sc_rx_work. so we deadlock. It is impossible to set GFP_NOFS for memory allocation in sock_alloc(). So we use PF_FSTRANS to avoid the task reentering filesystem when available memory is not enough. Signed-off-by: joyce.xue xuejiu...@huawei.com For the second time: use memalloc_noio_save/memalloc_noio_restore. And please put a great big comment in the code explaining why you need to do this special thing with memory reclaim flags. Cheers, Dave. Thanks for your reply. But I am afraid that memalloc_noio_save/ memalloc_noio_restore can not solve my problem. __GFP_IO is cleared if PF_MEMALLOC_NOIO is set and can avoid doing IO in direct memory reclaim. However, __GFP_FS is still set that can not avoid pruning dcache and icache in memory allocation, resulting in the deadlock I described. 
You can use PF_MEMALLOC_NOIO to replace PF_FSTRANS, set this flag in ocfs2 and check it in sb shrinker. No changes to the superblock shrinker, please. The flag should modify the gfp_mask in the struct shrink_control passed to the shrinker, just like the noio flag is used in the rest of the mm code. __GFP_FS seemed imply __GFP_IO, can superblock shrinker check !(sc-gfp_mask __GFP_IO) and stop? Thanks, Junxiao. Cheers, Dave. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 (mm: teach mm by current context info to not do I/O during memory allocation) introduces PF_MEMALLOC_NOIO flag to avoid doing I/O inside memory allocation, __GFP_IO is cleared when this flag is set, but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still run into I/O, like in superblock shrinker. Signed-off-by: Junxiao Bi junxiao...@oracle.com Cc: joyce.xue xuejiu...@huawei.com Cc: Ming Lei ming@canonical.com --- include/linux/sched.h |6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c2c885..2fb2c47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,11 +1936,13 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define tsk_used_math(p) ((p)-flags PF_USED_MATH) #define used_math() tsk_used_math(current) -/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags */ +/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current-flags + * __GFP_FS is also cleared as it implies __GFP_IO. + */ static inline gfp_t memalloc_noio_flags(gfp_t flags) { if (unlikely(current-flags PF_MEMALLOC_NOIO)) - flags = ~__GFP_IO; + flags = ~(__GFP_IO | __GFP_FS); return flags; } -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: fix request->__data_len overflow
blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req) is used to check whether a bio can be merged into an exist request. If can, req->__data_len += bio->bio_size. Since req->__data_len is a 32bit uint, if blk_rq_get_max_sectors(req) > (UINT_MAX >> 9), req->__date_len may overflow when merging a new bio. This probably happen for discard request. In xen blkfront driver, its max_discard_sectors is set to the whole disk sector size, see xlvbd_init_blk_queue(). So issuing discrad requests to a xen virtual disk with a size over 4G is very possible to trigger the overflow. This overflow will cause kernel panic in blk_end_request_all() due to BUG() triggered. The following is a call trace we saw in 3.0.69. Upstream kernel also suffer this issue. @ __end_that: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ blk_update_request: bio idx 0 >= vcnt 0 @ request botched: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ [ cut here ] @ kernel BUG at block/blk-core.c:2394! @ invalid opcode: [#1] SMP @ CPU 0 @ Modules linked in: nfs fscache auth_rpcgss nfs_acl autofs4 i2c_dev i2c_core @ lockd sunrpc(U) ksplice_x773z34q_vmlinux_new(U) ksplice_x773z34q(U) @ ksplice_bj7y22gc_vmlinux_new(U) ksplice_bj7y22gc_ipv6_new(U) @ ksplice_bj7y22gc(U) @ . @ ksplice_i1o46065(U) ksplice_5gqtkuvt_vmlinux_new(U) ksplice_5gqtkuvt(U) @ ksplice_2bcv8td6(U) ksplice_v5bs54bz_vmlinux_new(U) ksplice_v5bs54bz(U) @ ksplice_l7s0dhx6(U) ksplice_aur7sgvi(U) ksplice_ckie4cpv(U) @ nf_conntrack_netbios_ns @ . @ nf_conntrack_broadcast ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_state @ nf_conntrack xt_comment iptable_filter ip_tables be2iscsi iscsi_boot_sysfs @ ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp bnx2i cnic @ uio ipv6 @ . 
@ cxgb3i libcxgbi cxgb3 mdio libiscsi_tcp libiscsi scsi_transport_iscsi @ parport_pc lp parport snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq @ snd_seq_device snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd soundcore @ snd_page_alloc @ . @ pcspkr xen_netfront dm_snapshot dm_zero dm_mirror dm_region_hash dm_log @ dm_mod xen_blkfront ext3 jbd mbcache sd_mod crc_t10dif [last unloaded: @ ksplice_x773z34q_vmlinux_old] @ . @ Pid: 0, comm: swapper Not tainted 2.6.39-400.212.1.el5uek #1 @ RIP: e030:[] [] @ __blk_end_request_all+0x2a/0x40 @ RSP: e02b:8803ffc03df8 EFLAGS: 00010002 @ RAX: 0001 RBX: 8803db3c8000 RCX: 8803d8cf32c0 @ RDX: 0001 RSI: 8803d8cf3080 RDI: 8803daed08d8 @ RBP: 8803ffc03df8 R08: R09: 8803daed08d8 @ R10: R11: 000a R12: @ R13: 8803dad5e3c0 R14: 0001 R15: 0029 @ FS: 7f1f34a32940() GS:8803ffc0() knlGS: @ CS: e033 DS: ES: CR0: 8005003b @ CR2: 020c6148 CR3: 0003c6492000 CR4: 2660 @ DR0: DR1: DR2: @ DR3: DR6: 0ff0 DR7: 0400 @ Process swapper (pid: 0, threadinfo 81794000, task 8179f020) @ Stack: @ 8803ffc03e48 a005c56a 8803da57a8d0 0028810d99ee @ 8803db1ea7c0 8803db1beec0 005e @ 0001 8803ffc03e98 810d735d @ Call Trace: @ @ [] blkif_interrupt+0x20a/0x3a0 [xen_blkfront] @ [] handle_irq_event_percpu+0x5d/0x1a0 @ [] handle_irq_event+0x4f/0x80 @ [] handle_edge_irq+0xa5/0x100 @ [] __xen_evtchn_do_upcall+0x218/0x310 @ [] xen_evtchn_do_upcall+0x2f/0x50 @ [] xen_do_hypervisor_callback+0x1e/0x30 @ @ [] ? xen_hypercall_sched_op+0xa/0x20 @ [] ? xen_hypercall_sched_op+0xa/0x20 @ [] ? xen_safe_halt+0x10/0x20 @ [] ? default_idle+0x5b/0x170 @ [] ? cpu_idle+0xc6/0xf0 @ [] ? rest_init+0x72/0x80 @ [] ? start_kernel+0x2aa/0x390 @ [] ? x86_64_start_reservations+0x6a/0xa0 @ [] ? 
xen_start_kernel+0x315/0x440 @ Code: 00 55 48 89 e5 0f 1f 44 00 00 48 8b 87 60 01 00 00 31 c9 48 85 c0 75 0e @ 8b 57 54 e8 91 ff ff ff 84 c0 75 07 c9 c3 8b 48 54 eb ed <0f> 0b 0f 1f 40 00 @ eb fa 0f 1f 80 00 00 00 00 0f 1f 80 00 00 00 @ RIP [] __blk_end_request_all+0x2a/0x40 @ RSP @ ---[ end trace b09ff97496363201 ]--- Signed-off-by: Junxiao Bi --- block/blk-merge.c | 29 +++-- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..ae4f4c8 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,30 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req,
[PATCH] block: fix request-__data_len overflow
blk_rq_sectors(req) + bio_sectors(bio) blk_rq_get_max_sectors(req) is used to check whether a bio can be merged into an exist request. If can, req-__data_len += bio-bio_size. Since req-__data_len is a 32bit uint, if blk_rq_get_max_sectors(req) (UINT_MAX 9), req-__date_len may overflow when merging a new bio. This probably happen for discard request. In xen blkfront driver, its max_discard_sectors is set to the whole disk sector size, see xlvbd_init_blk_queue(). So issuing discrad requests to a xen virtual disk with a size over 4G is very possible to trigger the overflow. This overflow will cause kernel panic in blk_end_request_all() due to BUG() triggered. The following is a call trace we saw in 3.0.69. Upstream kernel also suffer this issue. @ __end_that: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ blk_update_request: bio idx 0 = vcnt 0 @ request botched: dev xvdg: type=1, flags=2224441 @ sector 0, nr/cnr 8378368/4294959104 @ bio 8803d8cf3080, biotail 8803d8cf32c0, buffer (null), @ len 4289724416 @ [ cut here ] @ kernel BUG at block/blk-core.c:2394! @ invalid opcode: [#1] SMP @ CPU 0 @ Modules linked in: nfs fscache auth_rpcgss nfs_acl autofs4 i2c_dev i2c_core @ lockd sunrpc(U) ksplice_x773z34q_vmlinux_new(U) ksplice_x773z34q(U) @ ksplice_bj7y22gc_vmlinux_new(U) ksplice_bj7y22gc_ipv6_new(U) @ ksplice_bj7y22gc(U) @ . @ ksplice_i1o46065(U) ksplice_5gqtkuvt_vmlinux_new(U) ksplice_5gqtkuvt(U) @ ksplice_2bcv8td6(U) ksplice_v5bs54bz_vmlinux_new(U) ksplice_v5bs54bz(U) @ ksplice_l7s0dhx6(U) ksplice_aur7sgvi(U) ksplice_ckie4cpv(U) @ nf_conntrack_netbios_ns @ . @ nf_conntrack_broadcast ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 xt_state @ nf_conntrack xt_comment iptable_filter ip_tables be2iscsi iscsi_boot_sysfs @ ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp bnx2i cnic @ uio ipv6 @ . 
@ cxgb3i libcxgbi cxgb3 mdio libiscsi_tcp libiscsi scsi_transport_iscsi @ parport_pc lp parport snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq @ snd_seq_device snd_pcm_oss snd_mixer_oss snd_pcm snd_timer snd soundcore @ snd_page_alloc @ . @ pcspkr xen_netfront dm_snapshot dm_zero dm_mirror dm_region_hash dm_log @ dm_mod xen_blkfront ext3 jbd mbcache sd_mod crc_t10dif [last unloaded: @ ksplice_x773z34q_vmlinux_old] @ . @ Pid: 0, comm: swapper Not tainted 2.6.39-400.212.1.el5uek #1 @ RIP: e030:[8123757a] [8123757a] @ __blk_end_request_all+0x2a/0x40 @ RSP: e02b:8803ffc03df8 EFLAGS: 00010002 @ RAX: 0001 RBX: 8803db3c8000 RCX: 8803d8cf32c0 @ RDX: 0001 RSI: 8803d8cf3080 RDI: 8803daed08d8 @ RBP: 8803ffc03df8 R08: R09: 8803daed08d8 @ R10: R11: 000a R12: @ R13: 8803dad5e3c0 R14: 0001 R15: 0029 @ FS: 7f1f34a32940() GS:8803ffc0() knlGS: @ CS: e033 DS: ES: CR0: 8005003b @ CR2: 020c6148 CR3: 0003c6492000 CR4: 2660 @ DR0: DR1: DR2: @ DR3: DR6: 0ff0 DR7: 0400 @ Process swapper (pid: 0, threadinfo 81794000, task 8179f020) @ Stack: @ 8803ffc03e48 a005c56a 8803da57a8d0 0028810d99ee @ 8803db1ea7c0 8803db1beec0 005e @ 0001 8803ffc03e98 810d735d @ Call Trace: @ IRQ @ [a005c56a] blkif_interrupt+0x20a/0x3a0 [xen_blkfront] @ [810d735d] handle_irq_event_percpu+0x5d/0x1a0 @ [810d74ef] handle_irq_event+0x4f/0x80 @ [810d9e25] handle_edge_irq+0xa5/0x100 @ [812f7cc8] __xen_evtchn_do_upcall+0x218/0x310 @ [812f7e7f] xen_evtchn_do_upcall+0x2f/0x50 @ [8151168e] xen_do_hypervisor_callback+0x1e/0x30 @ EOI @ [810013aa] ? xen_hypercall_sched_op+0xa/0x20 @ [810013aa] ? xen_hypercall_sched_op+0xa/0x20 @ [8100a2b0] ? xen_safe_halt+0x10/0x20 @ [8101dffb] ? default_idle+0x5b/0x170 @ [81014ac6] ? cpu_idle+0xc6/0xf0 @ [814eab62] ? rest_init+0x72/0x80 @ [819c902a] ? start_kernel+0x2aa/0x390 @ [819c832a] ? x86_64_start_reservations+0x6a/0xa0 @ [819cc9b5] ? 
xen_start_kernel+0x315/0x440 @ Code: 00 55 48 89 e5 0f 1f 44 00 00 48 8b 87 60 01 00 00 31 c9 48 85 c0 75 0e @ 8b 57 54 e8 91 ff ff ff 84 c0 75 07 c9 c3 8b 48 54 eb ed 0f 0b 0f 1f 40 00 @ eb fa 0f 1f 80 00 00 00 00 0f 1f 80 00 00 00 @ RIP [8123757a] __blk_end_request_all+0x2a/0x40 @ RSP 8803ffc03df8 @ ---[ end trace b09ff97496363201 ]--- Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-merge.c | 29 +++-- 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c
Re: [PATCH] block: fix uint overflow when merging io requests
On 06/27/2014 03:24 PM, Junxiao Bi wrote: > This uint overflow will cause req->__data_len < req->bio->bi_size, > this will confuse block layer and device driver. > > I watched a panic caused by this when mkfs.ext4 a volume of a large > virtual disk on vm guest, blkdev_issue_discard() issue two bio with > a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't > take affect due to the overflow and they were merged into one request. > After the request is done, in blk_end_request_all(), BUG_ON(pending) > was triggered and kernel panic. "pending" is true is because > blk_update_request() return ture when req->__data_len is less > than req->bio->bi_size. Any body help review this patch? blk_rq_sectors(), bio_sectors(), blk_rq_get_max_sectors() are all uint. blk_rq_sectors(req) + bio_sectors(bio) > blk_rq_get_max_sectors(req) This checking is bypassed when overflow happen. It will cause an io request's length less than its child bio's size. Thanks, Junxiao. > > Signed-off-by: Junxiao Bi > --- > block/blk-merge.c | 40 ++-- > 1 file changed, 34 insertions(+), 6 deletions(-) > > diff --git a/block/blk-merge.c b/block/blk-merge.c > index b3bf0df..340c0a7 100644 > --- a/block/blk-merge.c > +++ b/block/blk-merge.c > @@ -325,11 +325,41 @@ no_merge: > return 0; > } > > -int ll_back_merge_fn(struct request_queue *q, struct request *req, > +static inline bool ll_allow_merge_bio(struct request *req, >struct bio *bio) > { > if (blk_rq_sectors(req) + bio_sectors(bio) > > - blk_rq_get_max_sectors(req)) { > + blk_rq_get_max_sectors(req)) > + return false; > + > + /* check uint overflow */ > + if (blk_rq_sectors(req) + bio_sectors(bio) < blk_rq_sectors(req) > + || blk_rq_sectors(req) + bio_sectors(bio) < bio_sectors(bio)) > + return false; > + > + return true; > +} > + > +static inline bool ll_allow_merge_req(struct request *req, > + struct request *next) > +{ > + if (blk_rq_sectors(req) + blk_rq_sectors(next) > > + blk_rq_get_max_sectors(req)) > + return false; > + > + 
/* check uint overflow */ > + if (blk_rq_sectors(req) + blk_rq_sectors(next) < blk_rq_sectors(req) > + || blk_rq_sectors(req) + blk_rq_sectors(next) < > + blk_rq_sectors(next)) > + return false; > + > + return true; > +} > + > +int ll_back_merge_fn(struct request_queue *q, struct request *req, > + struct bio *bio) > +{ > + if (!ll_allow_merge_bio(req, bio)) { > req->cmd_flags |= REQ_NOMERGE; > if (req == q->last_merge) > q->last_merge = NULL; > @@ -346,8 +376,7 @@ int ll_back_merge_fn(struct request_queue *q, struct > request *req, > int ll_front_merge_fn(struct request_queue *q, struct request *req, > struct bio *bio) > { > - if (blk_rq_sectors(req) + bio_sectors(bio) > > - blk_rq_get_max_sectors(req)) { > + if (!ll_allow_merge_bio(req, bio)) { > req->cmd_flags |= REQ_NOMERGE; > if (req == q->last_merge) > q->last_merge = NULL; > @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, > struct request *req, > /* >* Will it become too large? >*/ > - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > > - blk_rq_get_max_sectors(req)) > + if (!ll_allow_merge_req(req, next)) > return 0; > > total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: fix uint overflow when merging io requests
On 06/27/2014 03:24 PM, Junxiao Bi wrote: This uint overflow will cause req-__data_len req-bio-bi_size, this will confuse block layer and device driver. I watched a panic caused by this when mkfs.ext4 a volume of a large virtual disk on vm guest, blkdev_issue_discard() issue two bio with a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't take affect due to the overflow and they were merged into one request. After the request is done, in blk_end_request_all(), BUG_ON(pending) was triggered and kernel panic. pending is true is because blk_update_request() return ture when req-__data_len is less than req-bio-bi_size. Any body help review this patch? blk_rq_sectors(), bio_sectors(), blk_rq_get_max_sectors() are all uint. blk_rq_sectors(req) + bio_sectors(bio) blk_rq_get_max_sectors(req) This checking is bypassed when overflow happen. It will cause an io request's length less than its child bio's size. Thanks, Junxiao. Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-merge.c | 40 ++-- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..340c0a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,41 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req, struct bio *bio) { if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + bio_sectors(bio) blk_rq_sectors(req) + || blk_rq_sectors(req) + bio_sectors(bio) bio_sectors(bio)) + return false; + + return true; +} + +static inline bool ll_allow_merge_req(struct request *req, + struct request *next) +{ + if (blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + blk_rq_sectors(next) blk_rq_sectors(req) + 
|| blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_sectors(next)) + return false; + + return true; +} + +int ll_back_merge_fn(struct request_queue *q, struct request *req, + struct bio *bio) +{ + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -346,8 +376,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) - blk_rq_get_max_sectors(req)) + if (!ll_allow_merge_req(req, next)) return 0; total_phys_segments = req-nr_phys_segments + next-nr_phys_segments; -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: fix uint overflow when merging io requests
This uint overflow will cause req->__data_len < req->bio->bi_size, this will confuse block layer and device driver. I watched a panic caused by this when mkfs.ext4 a volume of a large virtual disk on vm guest, blkdev_issue_discard() issue two bio with a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't take affect due to the overflow and they were merged into one request. After the request is done, in blk_end_request_all(), BUG_ON(pending) was triggered and kernel panic. "pending" is true is because blk_update_request() return ture when req->__data_len is less than req->bio->bi_size. Signed-off-by: Junxiao Bi --- block/blk-merge.c | 40 ++-- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..340c0a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,41 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req, struct bio *bio) { if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req)) { + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + bio_sectors(bio) < blk_rq_sectors(req) + || blk_rq_sectors(req) + bio_sectors(bio) < bio_sectors(bio)) + return false; + + return true; +} + +static inline bool ll_allow_merge_req(struct request *req, +struct request *next) +{ + if (blk_rq_sectors(req) + blk_rq_sectors(next) > + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + blk_rq_sectors(next) < blk_rq_sectors(req) + || blk_rq_sectors(req) + blk_rq_sectors(next) < + blk_rq_sectors(next)) + return false; + + return true; +} + +int ll_back_merge_fn(struct request_queue *q, struct request *req, +struct bio *bio) +{ + if (!ll_allow_merge_bio(req, bio)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -346,8 +376,7 @@ int 
ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req)) { + if (!ll_allow_merge_bio(req, bio)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > - blk_rq_get_max_sectors(req)) + if (!ll_allow_merge_req(req, next)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: fix uint overflow when merging io requests
This uint overflow will cause req-__data_len req-bio-bi_size, this will confuse block layer and device driver. I watched a panic caused by this when mkfs.ext4 a volume of a large virtual disk on vm guest, blkdev_issue_discard() issue two bio with a total size over UINT_MAX, but the check in ll_back_merge_fn() didn't take affect due to the overflow and they were merged into one request. After the request is done, in blk_end_request_all(), BUG_ON(pending) was triggered and kernel panic. pending is true is because blk_update_request() return ture when req-__data_len is less than req-bio-bi_size. Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-merge.c | 40 ++-- 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index b3bf0df..340c0a7 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -325,11 +325,41 @@ no_merge: return 0; } -int ll_back_merge_fn(struct request_queue *q, struct request *req, +static inline bool ll_allow_merge_bio(struct request *req, struct bio *bio) { if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + bio_sectors(bio) blk_rq_sectors(req) + || blk_rq_sectors(req) + bio_sectors(bio) bio_sectors(bio)) + return false; + + return true; +} + +static inline bool ll_allow_merge_req(struct request *req, +struct request *next) +{ + if (blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_get_max_sectors(req)) + return false; + + /* check uint overflow */ + if (blk_rq_sectors(req) + blk_rq_sectors(next) blk_rq_sectors(req) + || blk_rq_sectors(req) + blk_rq_sectors(next) + blk_rq_sectors(next)) + return false; + + return true; +} + +int ll_back_merge_fn(struct request_queue *q, struct request *req, +struct bio *bio) +{ + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -346,8 +376,7 @@ int 
ll_back_merge_fn(struct request_queue *q, struct request *req, int ll_front_merge_fn(struct request_queue *q, struct request *req, struct bio *bio) { - if (blk_rq_sectors(req) + bio_sectors(bio) - blk_rq_get_max_sectors(req)) { + if (!ll_allow_merge_bio(req, bio)) { req-cmd_flags |= REQ_NOMERGE; if (req == q-last_merge) q-last_merge = NULL; @@ -389,8 +418,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) - blk_rq_get_max_sectors(req)) + if (!ll_allow_merge_req(req, next)) return 0; total_phys_segments = req-nr_phys_segments + next-nr_phys_segments; -- 1.7.9.5 -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 11:12 AM, Jens Axboe wrote: > On 2014-06-09 20:50, Junxiao Bi wrote: >> On 06/10/2014 10:41 AM, Jens Axboe wrote: >>> On 2014-06-09 20:31, Junxiao Bi wrote: >>>> commit 7b5a3522 (loop: Limit the number of requests in the bio list) >>>> limit >>>> the request number in loop queue to not over 128. Since the >>>> "request_fn" of >>>> loop device is null, the requests number is not allowed tuned. Make >>>> it tunable >>>> from sysfs can improve performance. >>>> >>>> The following test is done on a machine with 512M memory. The >>>> backend of >>>> /dev/loop1 is a nfs file. >>>> >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 128 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s >>>> [root@bijx mnt]# >>>> [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 1024 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s >>>> >>>> Signed-off-by: Junxiao Bi >>>> --- >>>>block/blk-core.c |6 ++ >>>>block/blk-sysfs.c |9 +++-- >>>>2 files changed, 9 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/block/blk-core.c b/block/blk-core.c >>>> index 40d6548..58c4bd4 100644 >>>> --- a/block/blk-core.c >>>> +++ b/block/blk-core.c >>>> @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue >>>> *q, unsigned int nr) >>>>q->nr_requests = nr; >>>>blk_queue_congestion_threshold(q); >>>> >>>> +/* for loop device, return after set its nr_requests */ >>>> +if (!q->request_fn) { >>>> +spin_unlock_irq(q->queue_lock); >>>> +return 0; >>>> +} >>> >>> It'd be prettier to split this differently - something ala: >>> >>> if (request_fn) >>> blk_update_congestion_thresholds(q); >> The congestion threshholds is needed in commit 7b5a3522 (loop: 
Limit the >> number of requests in the bio list). So I think it needs be set even >> request_fn is null. > > I mean the request list thresholds, the part below where you currently > just exit. > >>> But I think you have a larger issue here... For the request lists, we >>> update the congestion thresholds and wakeup anyone waiting, if we need >>> to. There's no way to do that for loop, since the waitqueue is >>> internal to loop. >> Loop do the congestion control by itself, in loop_make_request() / >> loop_thread(). > > Yes, that is my point! You update nr_congestion_off, but you don't > wake anyone currently sitting in wait_event_lock_irq() on that value. > See what the code below where you just exit does for request list > based devices. Jens, do you have an idea to resolve it? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 11:12 AM, Jens Axboe wrote: On 2014-06-09 20:50, Junxiao Bi wrote: On 06/10/2014 10:41 AM, Jens Axboe wrote: On 2014-06-09 20:31, Junxiao Bi wrote: commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Since the request_fn of loop device is null, the requests number is not allowed tuned. Make it tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi junxiao...@oracle.com --- block/blk-core.c |6 ++ block/blk-sysfs.c |9 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d6548..58c4bd4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) q-nr_requests = nr; blk_queue_congestion_threshold(q); +/* for loop device, return after set its nr_requests */ +if (!q-request_fn) { +spin_unlock_irq(q-queue_lock); +return 0; +} It'd be prettier to split this differently - something ala: if (request_fn) blk_update_congestion_thresholds(q); The congestion threshholds is needed in commit 7b5a3522 (loop: Limit the number of requests in the bio list). So I think it needs be set even request_fn is null. I mean the request list thresholds, the part below where you currently just exit. But I think you have a larger issue here... 
For the request lists, we update the congestion thresholds and wakeup anyone waiting, if we need to. There's no way to do that for loop, since the waitqueue is internal to loop. Loop do the congestion control by itself, in loop_make_request() / loop_thread(). Yes, that is my point! You update nr_congestion_off, but you don't wake anyone currently sitting in wait_event_lock_irq() on that value. See what the code below where you just exit does for request list based devices. Jens, do you have an idea to resolve it? -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 11:12 AM, Jens Axboe wrote: > On 2014-06-09 20:50, Junxiao Bi wrote: >> On 06/10/2014 10:41 AM, Jens Axboe wrote: >>> On 2014-06-09 20:31, Junxiao Bi wrote: >>>> commit 7b5a3522 (loop: Limit the number of requests in the bio list) >>>> limit >>>> the request number in loop queue to not over 128. Since the >>>> "request_fn" of >>>> loop device is null, the requests number is not allowed tuned. Make >>>> it tunable >>>> from sysfs can improve performance. >>>> >>>> The following test is done on a machine with 512M memory. The >>>> backend of >>>> /dev/loop1 is a nfs file. >>>> >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 128 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s >>>> [root@bijx mnt]# >>>> [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests >>>> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >>>> 1024 >>>> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >>>> 5000+0 records in >>>> 5000+0 records out >>>> 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s >>>> >>>> Signed-off-by: Junxiao Bi >>>> --- >>>>block/blk-core.c |6 ++ >>>>block/blk-sysfs.c |9 +++-- >>>>2 files changed, 9 insertions(+), 6 deletions(-) >>>> >>>> diff --git a/block/blk-core.c b/block/blk-core.c >>>> index 40d6548..58c4bd4 100644 >>>> --- a/block/blk-core.c >>>> +++ b/block/blk-core.c >>>> @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue >>>> *q, unsigned int nr) >>>>q->nr_requests = nr; >>>>blk_queue_congestion_threshold(q); >>>> >>>> +/* for loop device, return after set its nr_requests */ >>>> +if (!q->request_fn) { >>>> +spin_unlock_irq(q->queue_lock); >>>> +return 0; >>>> +} >>> >>> It'd be prettier to split this differently - something ala: >>> >>> if (request_fn) >>> blk_update_congestion_thresholds(q); >> The congestion threshholds is needed in commit 7b5a3522 (loop: 
Limit the >> number of requests in the bio list). So I think it needs be set even >> request_fn is null. > > I mean the request list thresholds, the part below where you currently > just exit. > >>> But I think you have a larger issue here... For the request lists, we >>> update the congestion thresholds and wakeup anyone waiting, if we need >>> to. There's no way to do that for loop, since the waitqueue is >>> internal to loop. >> Loop do the congestion control by itself, in loop_make_request() / >> loop_thread(). > > Yes, that is my point! You update nr_congestion_off, but you don't > wake anyone currently sitting in wait_event_lock_irq() on that value. > See what the code below where you just exit does for request list > based devices. Ah, i see. It can't be wake up once nr_congestion_off is updated. But after a little delay, loop_thread will consume the requests in list and wake up it. Is this OK? -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH V2] block: make nr_requests tunable for loop
On 06/10/2014 10:41 AM, Jens Axboe wrote: > On 2014-06-09 20:31, Junxiao Bi wrote: >> commit 7b5a3522 (loop: Limit the number of requests in the bio list) >> limit >> the request number in loop queue to not over 128. Since the >> "request_fn" of >> loop device is null, the requests number is not allowed tuned. Make >> it tunable >> from sysfs can improve performance. >> >> The following test is done on a machine with 512M memory. The backend of >> /dev/loop1 is a nfs file. >> >> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >> 128 >> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >> 5000+0 records in >> 5000+0 records out >> 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s >> [root@bijx mnt]# >> [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests >> [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests >> 1024 >> [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 >> 5000+0 records in >> 5000+0 records out >> 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s >> >> Signed-off-by: Junxiao Bi >> --- >> block/blk-core.c |6 ++ >> block/blk-sysfs.c |9 +++-- >> 2 files changed, 9 insertions(+), 6 deletions(-) >> >> diff --git a/block/blk-core.c b/block/blk-core.c >> index 40d6548..58c4bd4 100644 >> --- a/block/blk-core.c >> +++ b/block/blk-core.c >> @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue >> *q, unsigned int nr) >> q->nr_requests = nr; >> blk_queue_congestion_threshold(q); >> >> +/* for loop device, return after set its nr_requests */ >> +if (!q->request_fn) { >> +spin_unlock_irq(q->queue_lock); >> +return 0; >> +} > > It'd be prettier to split this differently - something ala: > > if (request_fn) > blk_update_congestion_thresholds(q); The congestion threshholds is needed in commit 7b5a3522 (loop: Limit the number of requests in the bio list). So I think it needs be set even request_fn is null. > > But I think you have a larger issue here... 
For the request lists, we > update the congestion thresholds and wakeup anyone waiting, if we need > to. There's no way to do that for loop, since the waitqueue is > internal to loop. Loop do the congestion control by itself, in loop_make_request() / loop_thread(). -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH V2] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Since the "request_fn" of loop device is null, the requests number is not allowed tuned. Make it tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi --- block/blk-core.c |6 ++ block/blk-sysfs.c |9 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d6548..58c4bd4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) q->nr_requests = nr; blk_queue_congestion_threshold(q); + /* for loop device, return after set its nr_requests */ + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return 0; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = >root_rl; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 23321fb..c5456a5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -51,9 +51,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->request_fn && !q->mq_ops) - return -EINVAL; - ret = queue_var_store(, page, count); if (ret < 0) return ret; @@ -61,10 +58,10 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = 
BLKDEV_MIN_RQ; - if (q->request_fn) - err = blk_update_nr_requests(q, nr); - else + if (q->mq_ops) err = blk_mq_update_nr_requests(q, nr); + else + err = blk_update_nr_requests(q, nr); if (err) return err; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: make nr_requests tunable for loop
On 06/09/2014 11:53 PM, Jens Axboe wrote: > On 2014-06-09 01:29, Andreas Mohr wrote: >> Hi, >> >> having had a look at current mainline sources, >> frankly I've (well, initially...) got trouble understanding >> what this patch is doing. >> >> It's replacing an aggressive error-type bail-out (-EINVAL) for NULL >> request_fn >> with an inoccuous-looking "return ret;", yet that ret content currently >> *implicitly* is a >= 0 value (resulting from processing by earlier code >> which may or may not get incomprehensibly rewritten in future). >> I don't understand the reasons for this huge change in return value >> handling >> (since it's now not assigning a specific return value >> for this modified bail-out case). >> >> OK, well... you could say that since all this function ever was >> interested in is the result value of queue_var_store() >> (except for error bail-out cases), doing an interim "return ret;" >> (which is exactly what the function tail is also doing) >> is exactly right. >> >> But still simple textual appearance of the resulting patch hunks >> seems strangely asymmetric >> which may easily be a canary for structurally wrong layering of this >> function. >> Not to mention the now required extra spin_unlock_irq() >> in interim return handler... >> >> >> Well, after further analysis I would come to the conclusion >> that in general queue_requests_store() does a LOT more than it should - >> since blk-sysfs.c's only (expected!) purpose is >> to do parameterization of request_queue behaviour as gathered >> from sysfs attribute space, >> all that function should ever be concerned with is parsing that sysfs >> value >> and then calling a blk helper for configuration of that very >> attribute value >> which would *internally* do all the strange internal queue magic >> that is currently being updated *open-coded* >> at this supposedly *sysfs*-specific place. Ugh. 
>> Main question here: what would one do if one decided to rip out sysfs >> and use something entirely different for parameterization? >> Yeah indeed - thought so... >> >> >> So yeah, I'd definitely say that that function is lacking some cleanup >> which would possibly then even lead (or: would have led ;) >> to a much more nicely symmetric textual appearance >> of the patch hunk of the small but quite likely useful change >> that you currently intend to have here. > > If you are done ranting, look at the current tree where it has been > split out. There was no reason to have it split before, since the > sysfs entry point was the only place where we updated nr_requests. If > that code has been duplicated, there would have been a justified > reason for writing two pages about it. Yes, agree, this is the only place updating nr_requests, we can split it as a separated function if it needs updating at some other places in future. -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Making the number tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi --- block/blk-sysfs.c |8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f87..193ad8a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -52,9 +52,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret; - if (!q->request_fn) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; @@ -66,6 +63,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return ret; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = &q->root_rl; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Making the number tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi <junxiao...@oracle.com> --- block/blk-sysfs.c |8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 7500f87..193ad8a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -52,9 +52,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret; - if (!q->request_fn) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; @@ -66,6 +63,11 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) q->nr_requests = nr; blk_queue_congestion_threshold(q); + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return ret; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = &q->root_rl; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] block: make nr_requests tunable for loop
On 06/09/2014 11:53 PM, Jens Axboe wrote: On 2014-06-09 01:29, Andreas Mohr wrote: Hi, having had a look at current mainline sources, frankly I've (well, initially...) got trouble understanding what this patch is doing. It's replacing an aggressive error-type bail-out (-EINVAL) for NULL request_fn with an innocuous-looking "return ret;", yet that ret content currently *implicitly* is a >= 0 value (resulting from processing by earlier code which may or may not get incomprehensibly rewritten in future). I don't understand the reasons for this huge change in return value handling (since it's now not assigning a specific return value for this modified bail-out case). OK, well... you could say that since all this function ever was interested in is the result value of queue_var_store() (except for error bail-out cases), doing an interim "return ret;" (which is exactly what the function tail is also doing) is exactly right. But still simple textual appearance of the resulting patch hunks seems strangely asymmetric which may easily be a canary for structurally wrong layering of this function. Not to mention the now required extra spin_unlock_irq() in interim return handler... Well, after further analysis I would come to the conclusion that in general queue_requests_store() does a LOT more than it should - since blk-sysfs.c's only (expected!) purpose is to do parameterization of request_queue behaviour as gathered from sysfs attribute space, all that function should ever be concerned with is parsing that sysfs value and then calling a blk helper for configuration of that very attribute value which would *internally* do all the strange internal queue magic that is currently being updated *open-coded* at this supposedly *sysfs*-specific place. Ugh. Main question here: what would one do if one decided to rip out sysfs and use something entirely different for parameterization? Yeah indeed - thought so... 
So yeah, I'd definitely say that that function is lacking some cleanup which would possibly then even lead (or: would have led ;) to a much more nicely symmetric textual appearance of the patch hunk of the small but quite likely useful change that you currently intend to have here. If you are done ranting, look at the current tree where it has been split out. There was no reason to have it split before, since the sysfs entry point was the only place where we updated nr_requests. If that code has been duplicated, there would have been a justified reason for writing two pages about it. Yes, agree, this is the only place updating nr_requests, we can split it as a separated function if it needs updating at some other places in future. -- To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[PATCH V2] block: make nr_requests tunable for loop
commit 7b5a3522 (loop: Limit the number of requests in the bio list) limit the request number in loop queue to not over 128. Since the "request_fn" of loop device is null, the requests number is not allowed to be tuned. Making it tunable from sysfs can improve performance. The following test is done on a machine with 512M memory. The backend of /dev/loop1 is a nfs file. [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 128 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 501.572 s, 10.5 MB/s [root@bijx mnt]# [root@bijx mnt]# echo 1024 > /sys/block/loop0/queue/nr_requests [root@bijx mnt]# cat /sys/block/loop0/queue/nr_requests 1024 [root@bijx mnt]# dd if=/dev/zero of=/dev/loop0 bs=1M count=5000 5000+0 records in 5000+0 records out 524288 bytes (5.2 GB) copied, 464.481 s, 11.3 MB/s Signed-off-by: Junxiao Bi <junxiao...@oracle.com> --- block/blk-core.c |6 ++ block/blk-sysfs.c |9 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 40d6548..58c4bd4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -851,6 +851,12 @@ int blk_update_nr_requests(struct request_queue *q, unsigned int nr) q->nr_requests = nr; blk_queue_congestion_threshold(q); + /* for loop device, return after set its nr_requests */ + if (!q->request_fn) { + spin_unlock_irq(q->queue_lock); + return 0; + } + /* congestion isn't cgroup aware and follows root blkcg for now */ rl = &q->root_rl; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 23321fb..c5456a5 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -51,9 +51,6 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->request_fn && !q->mq_ops) - return -EINVAL; - ret = queue_var_store(&nr, page, count); if (ret < 0) return ret; @@ -61,10 +58,10 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = 
BLKDEV_MIN_RQ; - if (q->request_fn) - err = blk_update_nr_requests(q, nr); - else + if (q->mq_ops) err = blk_mq_update_nr_requests(q, nr); + else + err = blk_update_nr_requests(q, nr); if (err) return err; -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/