[Ocfs2-devel] [PATCH] ocfs2: Fix quota file corruption
Global quota files are accessed from different nodes. Thus we cannot cache offset of quota structure in the quota file after we drop our node reference count to it because after that moment quota structure may be freed and reallocated elsewhere by a different node resulting in corruption of quota file. Fix the problem by clearing dq_off when we are releasing dquot structure. We also remove the DB_READ_B handling because it is useless - DQ_ACTIVE_B is set iff DQ_READ_B is set. CC: sta...@vger.kernel.org CC: Goldwyn Rodrigues rgold...@suse.de CC: Mark Fasheh mfas...@suse.de Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/quota_global.c | 27 +-- fs/ocfs2/quota_local.c | 4 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aaa50611ec66..d7b5108789e2 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -717,6 +717,12 @@ static int ocfs2_release_dquot(struct dquot *dquot) */ if (status 0) mlog_errno(status); + /* +* Clear dq_off so that we search for the structure in quota file next +* time we acquire it. The structure might be deleted and reallocated +* elsewhere by another node while our dquot structure is on freelist. +*/ + dquot-dq_off = 0; clear_bit(DQ_ACTIVE_B, dquot-dq_flags); out_trans: ocfs2_commit_trans(osb, handle); @@ -756,16 +762,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(info, 1); if (status 0) goto out; - if (!test_bit(DQ_READ_B, dquot-dq_flags)) { - status = ocfs2_qinfo_lock(info, 0); - if (status 0) - goto out_dq; - status = qtree_read_dquot(info-dqi_gi, dquot); - ocfs2_qinfo_unlock(info, 0); - if (status 0) - goto out_dq; - } - set_bit(DQ_READ_B, dquot-dq_flags); + status = ocfs2_qinfo_lock(info, 0); + if (status 0) + goto out_dq; + /* +* We always want to read dquot structure from disk because we don't +* know what happened with it while it was on freelist. 
+*/ + status = qtree_read_dquot(info-dqi_gi, dquot); + ocfs2_qinfo_unlock(info, 0); + if (status 0) + goto out_dq; OCFS2_DQUOT(dquot)-dq_use_count++; OCFS2_DQUOT(dquot)-dq_origspace = dquot-dq_dqb.dqb_curspace; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2e4344be3b96..2001862bf2b1 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot) ocfs2_journal_dirty(handle, od-dq_chunk-qc_headerbh); out: - /* Clear the read bit so that next time someone uses this -* dquot he reads fresh info from disk and allocates local -* dquot structure */ - clear_bit(DQ_READ_B, dquot-dq_flags); return status; } -- 1.8.1.4 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH V2] ocfs2: dlm: fix recovery hang
There is a race window in dlm_do_recovery() between dlm_remaster_locks() and dlm_reset_recovery() when the recovery master nearly finish the recovery process for a dead node. After the master sends FINALIZE_RECO message in dlm_remaster_locks(), another node may become the recovery master for another dead node, and then send the BEGIN_RECO message to all the nodes included the old master, in the handler of this message dlm_begin_reco_handler() of old master, dlm-reco.dead_node and dlm-reco.new_master will be set to the second dead node and the new master, then in dlm_reset_recovery(), these two variables will be reset to default value. This will cause new recovery master can not finish the recovery process and hung, at last the whole cluster will hung for recovery. old recovery master: new recovery master: dlm_remaster_locks() become recovery master for another dead node. dlm_send_begin_reco_message() dlm_begin_reco_handler() { if (dlm-reco.state DLM_RECO_STATE_FINALIZE) { return -EAGAIN; } dlm_set_reco_master(dlm, br-node_idx); dlm_set_reco_dead_node(dlm, br-dead_node); } dlm_reset_recovery() { dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } will hung in dlm_remaster_locks() for request dlm locks info Before send FINALIZE_RECO message, recovery master should set DLM_RECO_STATE_FINALIZE for itself and clear it after the recovery done, this can break the race windows as the BEGIN_RECO messages will not be handled before DLM_RECO_STATE_FINALIZE flag is cleared. A similar race may happen between new recovery master and normal node which is in dlm_finalize_reco_handler(), also fix it. 
Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com Reviewed-by: Wengang Wang wen.gang.w...@oracle.com Cc: sta...@vger.kernel.org Signed-off-by: Junxiao Bi junxiao...@oracle.com --- fs/ocfs2/dlm/dlmrecovery.c | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 7035af0..8179bd9 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -537,7 +537,10 @@ master_here: /* success! see if any other nodes need recovery */ mlog(0, DONE mastering recovery of %s:%u here(this=%u)!\n, dlm-name, dlm-reco.dead_node, dlm-node_num); - dlm_reset_recovery(dlm); + spin_lock(dlm-spinlock); + __dlm_reset_recovery(dlm); + dlm-reco.state = ~DLM_RECO_STATE_FINALIZE; + spin_unlock(dlm-spinlock); } dlm_end_recovery(dlm); @@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) if (all_nodes_done) { int ret; + /* Set this flag on recovery master to avoid +* a new recovery for another dead node start +* before the recovery is not done. That may +* cause recovery hung.*/ + spin_lock(dlm-spinlock); + dlm-reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(dlm-spinlock); + /* all nodes are now in DLM_RECO_NODE_DATA_DONE state * just send a finalize message to everyone and * clean up */ @@ -2882,8 +2893,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } dlm-reco.state = ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); spin_unlock(dlm-spinlock); - dlm_reset_recovery(dlm); dlm_kick_recovery_thread(dlm); break; default: -- 1.7.9.5 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH] fsck.ocfs2: Do not complain about oversized quota files
Quota files can have blocks beyond i_size. This is correct because quota code reserves these blocks to allow for extension of quota files without the need for allocation. So make fsck.ocfs2 not complain about them. Signed-off-by: Jan Kara j...@suse.cz --- fsck.ocfs2/pass1.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/fsck.ocfs2/pass1.c b/fsck.ocfs2/pass1.c index 22dafdf50647..5118e6c630c4 100644 --- a/fsck.ocfs2/pass1.c +++ b/fsck.ocfs2/pass1.c @@ -832,6 +832,12 @@ out: return ret; } +static int is_quota_file(uint64_t ino) +{ + return ino == USER_QUOTA_SYSTEM_INODE || + ino == GROUP_QUOTA_SYSTEM_INODE; +} + /* * this verifies i_size and i_clusters for inodes that use i_list to * reference extents of data. @@ -1002,6 +1008,7 @@ size_cluster_check: * i_clusters, even on a sparsed filesystem */ if (!S_ISLNK(di-i_mode) !S_ISDIR(di-i_mode) + !is_quota_file(di-i_blkno) di-i_size = unexpected prompt(ost, PY, PR_INODE_SPARSE_SIZE, Inode %PRIu64 has a size of %PRIu64 but has %PRIu64 -- 1.8.1.4 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH 6/6] ocfs2: Revert iput deferring code in ocfs2_drop_dentry_lock
From: Goldwyn Rodrigues rgold...@suse.de The following patches are reverted in this patch because these patches caused performance regression in the remote unlink() calls. ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping of dentry lock to ocfs2_wq f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount 5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in ocfs2_kill_sb on a failed mount Previous patches in this series removed the possible deadlocks from downconvert thread so the above patches shouldn't be needed anymore. The regression is caused because these patches delay the iput() in case of dentry unlocks. This also delays the unlocking of the open lockres. The open lockresource is required to test if the inode can be wiped from disk or not. When the deleting node does not get the open lock, it marks it as orphan (even though it is not in use by another node/process) and causes a journal checkpoint. This delays operations following the inode eviction. This also moves the inode to the orphaned inode which further causes more I/O and a lot of unneccessary orphans. 
The following script can be used to generate the load causing issues: declare -a create declare -a remove declare -a iterations=(1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384) unique=`mktemp -u X` script=/tmp/idontknow-${unique}.sh cat EOF ${script} for n in {1..8}; do mkdir -p test/dir\${n} eval touch test/dir\${n}/foo{1..\$1} done EOF chmod 700 ${script} function fcreate () { exec 21 /usr/bin/time --format=%E ${script} $1 } function fremove () { exec 21 /usr/bin/time --format=%E ssh node2 cd `pwd`; rm -Rf test* } function fcp () { exec 21 /usr/bin/time --format=%E ssh node3 cd `pwd`; cp -R test test.new } echo - echo | # files | create #s | copy #s | remove #s | echo - for ((x=0; x ${#iterations[*]} ; x++)) do create[$x]=`fcreate ${iterations[$x]}` copy[$x]=`fcp ${iterations[$x]}` remove[$x]=`fremove` printf | %8d | %9s | %9s | %9s |\n ${iterations[$x]} ${create[$x]} ${copy[$x]} ${remove[$x]} done rm ${script} echo Signed-off-by: Srinivas Eeda srinivas.e...@oracle.com Signed-off-by: Goldwyn Rodrigues rgold...@suse.com Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/dcache.c | 61 +++ fs/ocfs2/dcache.h | 12 +-- fs/ocfs2/ocfs2.h | 28 - fs/ocfs2/super.c | 30 +-- 4 files changed, 9 insertions(+), 122 deletions(-) diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index 0d3a97d2d5f6..e2e05a106beb 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -37,7 +37,6 @@ #include dlmglue.h #include file.h #include inode.h -#include super.h #include ocfs2_trace.h void ocfs2_dentry_attach_gen(struct dentry *dentry) @@ -346,52 +345,6 @@ out_attach: return ret; } -DEFINE_SPINLOCK(dentry_list_lock); - -/* We limit the number of dentry locks to drop in one go. We have - * this limit so that we don't starve other users of ocfs2_wq. 
*/ -#define DL_INODE_DROP_COUNT 64 - -/* Drop inode references from dentry locks */ -static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) -{ - struct ocfs2_dentry_lock *dl; - - spin_lock(dentry_list_lock); - while (osb-dentry_lock_list (drop_count 0 || drop_count--)) { - dl = osb-dentry_lock_list; - osb-dentry_lock_list = dl-dl_next; - spin_unlock(dentry_list_lock); - iput(dl-dl_inode); - kfree(dl); - spin_lock(dentry_list_lock); - } - spin_unlock(dentry_list_lock); -} - -void ocfs2_drop_dl_inodes(struct work_struct *work) -{ - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); - - __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); - /* -* Don't queue dropping if umount is in progress. We flush the -* list in ocfs2_dismount_volume -*/ - spin_lock(dentry_list_lock); - if (osb-dentry_lock_list - !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) - queue_work(ocfs2_wq, osb-dentry_lock_work); - spin_unlock(dentry_list_lock); -} - -/* Flush the whole work queue */ -void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) -{ - __ocfs2_drop_dl_inodes(osb, -1); -} - /* * ocfs2_dentry_iput() and friends. * @@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl) { + iput(dl-dl_inode); ocfs2_simple_drop_lockres(osb, dl-dl_lockres); ocfs2_lock_res_free(dl-dl_lockres); - - /* We leave dropping of inode reference to
[Ocfs2-devel] [PATCH 0/6 v2] ocfs2: Avoid pending orphaned inodes
Hello, here is a second version of my patchset to solve deadlocks when we do not defer dropping of an inode reference from the downconvert worker. I have tested the patches (also with lockdep enabled) and they seem to work fine. Comments are welcome! Honza ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH 4/6] ocfs2: Implement delayed dropping of last dquot reference
We cannot drop last dquot reference from downconvert thread as that creates the following deadlock: NODE 1 NODE2 holds dentry lock for 'foo' holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE dquot_initialize(bar) ocfs2_dquot_acquire() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) ... downconvert thread (triggered from another node or a different process from NODE2) ocfs2_dentry_post_unlock() ... iput(foo) ocfs2_evict_inode(foo) ocfs2_clear_inode(foo) dquot_drop(inode) ... ocfs2_dquot_release() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) - blocks finds we need more space in quota file ... ocfs2_extend_no_holes() ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) - deadlocks waiting for downconvert thread We solve the problem by postponing dropping of the last dquot reference to a workqueue if it happens from the downconvert thread. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/ocfs2.h| 5 + fs/ocfs2/quota.h| 2 ++ fs/ocfs2/quota_global.c | 35 +++ fs/ocfs2/super.c| 8 4 files changed, 50 insertions(+) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 553f53cc73ae..64c02239ba46 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@ #include linux/sched.h #include linux/wait.h #include linux/list.h +#include linux/llist.h #include linux/rbtree.h #include linux/workqueue.h #include linux/kref.h @@ -419,6 +420,10 @@ struct ocfs2_super struct ocfs2_dentry_lock *dentry_lock_list; struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; + wait_queue_head_t osb_mount_event; /* Truncate log info */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c5..f266d67df3c6 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last 
globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aaa50611ec66..7921e209c64b 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@ #include linux/jiffies.h #include linux/writeback.h #include linux/workqueue.h +#include linux/llist.h #include cluster/masklog.h @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(osb-dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(odquot-dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). 
+ */ static int ocfs2_release_dquot(struct dquot *dquot) { handle_t *handle; @@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot) /* Check whether we are not racing with some other dqget() */ if (atomic_read(dquot-dq_count)
[Ocfs2-devel] [PATCH 3/6] quota: Provide function to grab quota structure reference
Provide dqgrab() function to get quota structure reference when we are sure it already has at least one active reference. Make use of this function inside quota code. Signed-off-by: Jan Kara j...@suse.cz --- fs/quota/dquot.c | 4 ++-- include/linux/quotaops.h | 8 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 831d49a4111f..e3f09e34d0b2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -528,7 +528,7 @@ restart: if (atomic_read(dquot-dq_count)) { DEFINE_WAIT(wait); - atomic_inc(dquot-dq_count); + dqgrab(dquot); prepare_to_wait(dquot-dq_wait_unused, wait, TASK_UNINTERRUPTIBLE); spin_unlock(dq_list_lock); @@ -624,7 +624,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ - atomic_inc(dquot-dq_count); + dqgrab(dquot); spin_unlock(dq_list_lock); dqstats_inc(DQST_LOOKUPS); err = sb-dq_op-write_dquot(dquot); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 6965fe394c3b..1d3eee594cd6 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -46,6 +46,14 @@ void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); void dquot_initialize(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); +static inline struct dquot *dqgrab(struct dquot *dquot) +{ + /* Make sure someone else has active reference to dquot */ + WARN_ON_ONCE(!atomic_read(dquot-dq_count)); + WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, dquot-dq_flags)); + atomic_inc(dquot-dq_count); + return dquot; +} void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), -- 1.8.1.4 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH 1/6] ocfs2: Remove OCFS2_INODE_SKIP_DELETE flag
The flag was never set, delete it. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/inode.c | 6 -- fs/ocfs2/inode.h | 8 +++- fs/ocfs2/journal.c | 6 -- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f29a90fde619..b4baaefe4dd4 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -822,12 +822,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it -* will be marked here so we can safely skip it. Recovery will -* cleanup any inodes we might inadvertently skip here. */ - if (oi-ip_flags OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(oi-ip_lock); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23d..f60bc314ee0a 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -84,8 +84,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x0004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED0x0008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE0x0010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -100,11 +98,11 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. 
*/ -#define OCFS2_INODE_MAYBE_ORPHANED 0x0020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x0010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT0x0040 +#define OCFS2_INODE_OPEN_DIRECT0x0020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x0080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x0040 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3d..03ea9314fecd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi-ip_next_orphan; spin_lock(oi-ip_lock); - /* The remote delete code may have set these on the -* assumption that the other node would wipe them -* successfully. If they are still in the node's -* orphan dir, we need to reset that state. */ - oi-ip_flags = ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi-ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; -- 1.8.1.4 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] [PATCH 5/6] ocfs2: Avoid blocking in ocfs2_mark_lockres_freeing() in downconvert thread
If we are dropping last inode reference from downconvert thread, we will end up calling ocfs2_mark_lockres_freeing() which can block if the lock we are freeing is queued thus creating an A-A deadlock. Luckily, since we are the downconvert thread, we can immediately dequeue the lock and thus avoid waiting in this case. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/dlmglue.c | 33 +++-- fs/ocfs2/dlmglue.h | 3 ++- fs/ocfs2/inode.c | 7 --- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 19986959d149..b7580157ef01 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3150,7 +3150,8 @@ out: * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; @@ -3160,6 +3161,33 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) spin_lock_irqsave(lockres-l_lock, flags); lockres-l_flags |= OCFS2_LOCK_FREEING; + if (lockres-l_flags OCFS2_LOCK_QUEUED current == osb-dc_task) { + unsigned long flags; + + /* +* We know the downconvert is queued but not in progress +* because we are the downconvert thread and processing +* different lock. So we can just remove the lock from the +* queue. This is not only an optimization but also a way +* to avoid the following deadlock: +* ocfs2_dentry_post_unlock() +* ocfs2_dentry_lock_put() +* ocfs2_drop_dentry_lock() +* iput() +* ocfs2_evict_inode() +* ocfs2_clear_inode() +* ocfs2_mark_lockres_freeing() +* ... blocks waiting for OCFS2_LOCK_QUEUED +* since we are the downconvert thread which +* should clear the flag. 
+*/ + spin_lock_irqsave(osb-dc_task_lock, flags); + list_del_init(lockres-l_blocked_list); + osb-blocked_lock_count--; + spin_unlock_irqrestore(osb-dc_task_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + goto out_unlock; + } while (lockres-l_flags OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(lockres-l_lock, flags); @@ -3172,6 +3200,7 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) spin_lock_irqsave(lockres-l_lock, flags); } +out_unlock: spin_unlock_irqrestore(lockres-l_lock, flags); } @@ -3180,7 +3209,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4a..d293a22c32c5 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3b0d722de35e..9661f8db21dc 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1053,6 +1053,7 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode-i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi-ip_blkno, @@ -1069,9 +1070,9 @@ static void ocfs2_clear_inode(struct inode *inode) /* Do these before all the other work so that we don't bounce * the downconvert thread while waiting to destroy the locks. 
*/ - ocfs2_mark_lockres_freeing(oi-ip_rw_lockres); - ocfs2_mark_lockres_freeing(oi-ip_inode_lockres); - ocfs2_mark_lockres_freeing(oi-ip_open_lockres); + ocfs2_mark_lockres_freeing(osb, oi-ip_rw_lockres); + ocfs2_mark_lockres_freeing(osb, oi-ip_inode_lockres); + ocfs2_mark_lockres_freeing(osb, oi-ip_open_lockres);
[Ocfs2-devel] [PATCH] quota: Fix race between dqput() and dquot_scan_active()
Currently last dqput() can race with dquot_scan_active() causing it to call callback for an already deactivated dquot. The race is as follows: CPU1CPU2 dqput() spin_lock(dq_list_lock); if (atomic_read(dquot-dq_count) 1) { - not taken if (test_bit(DQ_ACTIVE_B, dquot-dq_flags)) { spin_unlock(dq_list_lock); -release_dquot(dquot); if (atomic_read(dquot-dq_count) 1) - not taken dquot_scan_active() spin_lock(dq_list_lock); if (!test_bit(DQ_ACTIVE_B, dquot-dq_flags)) - not taken atomic_inc(dquot-dq_count); spin_unlock(dq_list_lock); - proceeds to release dquot ret = fn(dquot, priv); - called for inactive dquot Fix the problem by making sure possible -release_dquot() is finished by the time we call the callback and new calls to it will notice reference dquot_scan_active() has taken and bail out. Signed-off-by: Jan Kara j...@suse.cz --- fs/quota/dquot.c | 14 +++--- 1 file changed, 11 insertions(+), 3 deletions(-) This is the last patch needed to make ocfs2 quotas rock solid in my testing. I will carry it in my tree and push it to Linus soon. diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 831d49a4111f..cfc8dcc16043 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb, dqstats_inc(DQST_LOOKUPS); dqput(old_dquot); old_dquot = dquot; - ret = fn(dquot, priv); - if (ret 0) - goto out; + /* +* -release_dquot() can be racing with us. Our reference +* protects us from new calls to it so just wait for any +* outstanding call and recheck the DQ_ACTIVE_B after that. +*/ + wait_on_dquot(dquot); + if (test_bit(DQ_ACTIVE_B, dquot-dq_flags)) { + ret = fn(dquot, priv); + if (ret 0) + goto out; + } spin_lock(dq_list_lock); /* We are safe to continue now because our dquot could not * be moved out of the inuse list while we hold the reference */ -- 1.8.1.4 ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
[Ocfs2-devel] Hi everyone, is it an issue? The host is blocked when the issue occurs. Thanks a lot.
Hi everyone, as we test the performance of the ocfs2 with fio. As the test case running, one of host of ocfs2 cluster will be blocked a small time and restart sooner. The test environment is that there are six host sharing one iSCSI LUN which capacity is about 1T and it is formatted with ocfs2, and mount point on every host is /vms/vStore. All of the host's OS is ubuntu 12.04, and we upgrade the kernel with 3.2.50, and ocfs2 as compiled according with kernel 3.2.50. We test the performance of the ocfs2 with fio on one every host. The fio test configure is as below, and the filename is different on every host. Such as file1...file5 is on host1, file6file10 are on host2, and so on. One example fio file is as below: root@cvknode4:~/fios_test4# cat 1024k_10r [global] ioengine=libaio rw=read bs=1024K time_based runtime=180 size=9g direct=1 iodepth=1 [file1] filename=/vms/vStor/file41 [file2] filename=/vms/vStor/file42 [file3] filename=/vms/vStor/file43 [file4] filename=/vms/vStor/file44 [file5] filename=/vms/vStor/file45 As we start fio tools on the hosts sequent, several minute later, one host will blocked and restart(fenced). Is it one issue of ocfs2? Or is there any fixed patch for it? 
The syslog is as below: Feb 19 17:50:01 cvknode9 CRON[16143]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16147]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16146]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16144]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16141]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:02 cvknode9 CRON[16134]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:03 cvknode9 crmadmin: [16194]: ERROR: admin_message_timeout: No messages received in 2 seconds Feb 19 17:50:03 cvknode9 CRON[16140]: (CRON) info (No MTA installed, discarding output) Feb 19 17:51:00 cvknode9 kernel: [ 803.464977] [ cut here ] Feb 19 17:51:00 cvknode9 kernel: [ 803.464991] WARNING: at kernel/watchdog.c:241 watchdog_overflow_callback+0x9a/0xc0() Feb 19 17:51:00 cvknode9 kernel: [ 803.464993] Hardware name: FlexServer B590 Feb 19 17:51:00 cvknode9 kernel: [ 803.464995] Watchdog detected hard LOCKUP on cpu 0 Feb 19 17:51:00 cvknode9 kernel: [ 803.464997] Modules linked in: ip6table_filter ip6_tables iptable_filter ip_tables ebtable_nat ebtables x_tables ocfs2(O) quota_tree drbd lru_cache 8021q garp stp vhost_net macvtap macvlan kvm_intel kvm ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp ocfs2_dlmfs(O) ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) configfs openvswitch_mod(O) nfsd nfs lockd fscache auth_rpcgss nfs_acl sunrpc psmouse ioatdma dm_multipath serio_raw sb_edac hpilo edac_core dca acpi_power_meter mac_hid lp parport hpsa be2iscsi iscsi_boot_sysfs libiscsi be2net scsi_transport_iscsi Feb 19 17:51:00 cvknode9 kernel: [ 803.465065] Pid: 6029, comm: ocfs2dc Tainted: G O 3.2.50 #1 Feb 19 17:51:00 cvknode9 kernel: [ 803.465067] Call Trace: Feb 19 17:51:00 cvknode9 kernel: [ 803.465069] NMI [81066daf] warn_slowpath_common+0x7f/0xc0 Feb 
19 17:51:00 cvknode9 kernel: [ 803.465084] [81066ea6] warn_slowpath_fmt+0x46/0x50 Feb 19 17:51:00 cvknode9 kernel: [ 803.465089] [8101b833] ? native_sched_clock+0x13/0x80 Feb 19 17:51:00 cvknode9 kernel: [ 803.465093] [810d6b1a] watchdog_overflow_callback+0x9a/0xc0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465099] [8110eb76] __perf_event_overflow+0x96/0x1f0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465103] [8110c491] ? perf_event_update_userpage+0x11/0xc0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465109] [8102468a] ? x86_perf_event_set_period+0xda/0x150 Feb 19 17:51:00 cvknode9 kernel: [ 803.465113] [8110f534] perf_event_overflow+0x14/0x20 Feb 19 17:51:00 cvknode9 kernel: [ 803.465118] [81028c93] intel_pmu_handle_irq+0x163/0x2e0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465130] [81644b01] perf_event_nmi_handler+0x21/0x30 Feb 19 17:51:00 cvknode9 kernel: [ 803.465134] [816443d1] do_nmi+0x101/0x350 Feb 19 17:51:00 cvknode9 kernel: [ 803.465138] [81643a30] nmi+0x20/0x30 Feb 19 17:51:00 cvknode9 kernel: [ 803.465147] [8103db15] ? __ticket_spin_lock+0x25/0x30 Feb 19 17:51:00 cvknode9 kernel: [ 803.465149] EOE IRQ [81642fee] _raw_spin_lock+0xe/0x20 Feb 19 17:51:00 cvknode9 kernel: [ 803.465216] [a03e5487] ocfs2_wake_downconvert_thread+0x27/0x60 [ocfs2] Feb 19 17:51:00 cvknode9 kernel: [ 803.465231] [a03e5554] __ocfs2_cluster_unlock.isra.32+0x94/0xf0 [ocfs2] Feb 19 17:51:00 cvknode9 kernel: [ 803.465245] [a03e5b2b] ocfs2_rw_unlock+0x6b/0xe0 [ocfs2] Feb 19 17:51:00 cvknode9 kernel: [ 803.465252] [811aa22f] ?
Re: [Ocfs2-devel] [Ocfs2-users] Hi everyone, is it an issue? The host is blocked as the issue created. Thanks a lot.
Hi, By the looks of the call-trace this looks like an issue with communicating with the iSCSI target. I run 6 nodes with OCFS2 over Fibre Channel on Ubuntu 12.04 Linux 3.5 and Linux 3.13 (VM Cluster and Samba Cluster). Before investigating this any further I would advise upgrading your kernel to at least 3.8 (which is officially supported by Ubuntu). There have been many improvements to OCFS2 in recent kernels which have increased the stability (at least in our environment) substantially. $ apt-get install linux-image-3.8.0-35-generic If the problem still persists, does your iSCSI target have monitoring statistics which you could look into? If the network link is becoming saturated this could be the issue (especially if heartbeat is running over the same interface). Could you also let us know which node is fencing and if all subsequent nodes receive this stack trace? Does the IO lock up? ( $ ls /vms/vStore ) Marty On Fri, Feb 21, 2014 at 3:28 AM, Guozhonghua guozhong...@h3c.com wrote: Hi everyone, we are testing the performance of ocfs2 with fio. As the test case runs, one host of the ocfs2 cluster will be blocked for a short time and then restart (fenced). The test environment is that there are six hosts sharing one iSCSI LUN whose capacity is about 1T and it is formatted with ocfs2, and the mount point on every host is /vms/vStore. All of the hosts' OS is Ubuntu 12.04, and we upgraded the kernel to 3.2.50, with ocfs2 compiled against kernel 3.2.50. We test the performance of ocfs2 with fio on every host. The fio test configuration is as below, and the filenames are different on every host. For example, file1…file5 are on host1, file6….file10 are on host2, and so on.
One example fio file is as below: root@cvknode4:~/fios_test4# cat 1024k_10r [global] ioengine=libaio rw=read bs=1024K time_based runtime=180 size=9g direct=1 iodepth=1 [file1] filename=/vms/vStor/file41 [file2] filename=/vms/vStor/file42 [file3] filename=/vms/vStor/file43 [file4] filename=/vms/vStor/file44 [file5] filename=/vms/vStor/file45 As we start fio tools on the hosts sequent, several minute later, one host will blocked and restart(fenced). Is it one issue of ocfs2? Or is there any fixed patch for it? The syslog is as below: Feb 19 17:50:01 cvknode9 CRON[16143]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16147]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16146]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16144]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:01 cvknode9 CRON[16141]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:02 cvknode9 CRON[16134]: (CRON) info (No MTA installed, discarding output) Feb 19 17:50:03 cvknode9 crmadmin: [16194]: ERROR: admin_message_timeout: No messages received in 2 seconds Feb 19 17:50:03 cvknode9 CRON[16140]: (CRON) info (No MTA installed, discarding output) Feb 19 17:51:00 cvknode9 kernel: [ 803.464977] [ cut here ] Feb 19 17:51:00 cvknode9 kernel: [ 803.464991] WARNING: at kernel/watchdog.c:241 watchdog_overflow_callback+0x9a/0xc0() Feb 19 17:51:00 cvknode9 kernel: [ 803.464993] Hardware name: FlexServer B590 Feb 19 17:51:00 cvknode9 kernel: [ 803.464995] Watchdog detected hard LOCKUP on cpu 0 Feb 19 17:51:00 cvknode9 kernel: [ 803.464997] Modules linked in: ip6table_filter ip6_tables iptable_filter ip_tables ebtable_nat ebtables x_tables ocfs2(O) quota_tree drbd lru_cache 8021q garp stp vhost_net macvtap macvlan kvm_intel kvm ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp ocfs2_dlmfs(O) ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) 
ocfs2_stackglue(O) configfs openvswitch_mod(O) nfsd nfs lockd fscache auth_rpcgss nfs_acl sunrpc psmouse ioatdma dm_multipath serio_raw sb_edac hpilo edac_core dca acpi_power_meter mac_hid lp parport hpsa be2iscsi iscsi_boot_sysfs libiscsi be2net scsi_transport_iscsi Feb 19 17:51:00 cvknode9 kernel: [ 803.465065] Pid: 6029, comm: ocfs2dc Tainted: G O 3.2.50 #1 Feb 19 17:51:00 cvknode9 kernel: [ 803.465067] Call Trace: Feb 19 17:51:00 cvknode9 kernel: [ 803.465069] NMI [81066daf] warn_slowpath_common+0x7f/0xc0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465084] [81066ea6] warn_slowpath_fmt+0x46/0x50 Feb 19 17:51:00 cvknode9 kernel: [ 803.465089] [8101b833] ? native_sched_clock+0x13/0x80 Feb 19 17:51:00 cvknode9 kernel: [ 803.465093] [810d6b1a] watchdog_overflow_callback+0x9a/0xc0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465099] [8110eb76] __perf_event_overflow+0x96/0x1f0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465103] [8110c491] ? perf_event_update_userpage+0x11/0xc0 Feb 19 17:51:00 cvknode9 kernel: [ 803.465109] [8102468a] ?
[Ocfs2-devel] 答复: [Ocfs2-users] Hi everyone, is it an issue? The host is blocked as the issue created. Thanks a lot.
Hi Marty, It is that the latest one host which run fio tools, and the host is frequently receive the stack trace and fenced, other hosts' IO lock up which is in the OCFS2 cluster at the same time. We will try the new kernel. Thanks a lot. Guozhonghua -邮件原件- 发件人: Marty Sweet [mailto:msweet@gmail.com] 发送时间: 2014年2月21日 11:47 收件人: guozhonghua 02084 抄送: ocfs2-devel@oss.oracle.com; ocfs2-us...@oss.oracle.com 主题: Re: [Ocfs2-users] Hi everyone, is it an issue? The host is blocked as the issue created. Thanks a lot. Hi, By the looks of the call-trace this looks like an issue with communicating with the iSCSI target. I run 6 nodes with OCFS2 over Fibre Channel on Ubuntu 12.04 Linux 3.5 and Linux 3.13 (VM Cluster and Samba Cluster). Before investigating this any further I would advise upgrading your kernel to at least 3.8 (which is officially support by Ubuntu). There have been many improvements to OCFS2 in recent kernels which have increased the stability in (at least our environment) substantially. $ apt-get install linux-image-3.8.0-35-generic If the problem still persists, does you iSCSI target have monitoring statistics which you could look into? If the network link is becoming saturated this could be the issue (especially if heartbeat is running over the same interface). Could you also let us know which node is fencing and if all subsequent nodes receive this stack trace? Does the IO lock up? ( $ ls /vms/vStore ) Marty - 本邮件及其附件含有杭州华三通信技术有限公司的保密信息,仅限于发送给上面地址中列出 的个人或群组。禁止任何其他人以任何形式使用(包括但不限于全部或部分地泄露、复制、 或散发)本邮件中的信息。如果您错收了本邮件,请您立即电话或邮件通知发件人并删除本 邮件! This e-mail and its attachments contain confidential information from H3C, which is intended only for the person or entity whose address is listed above. Any use of the information contained herein in any way (including, but not limited to, total or partial disclosure, reproduction, or dissemination) by persons other than the intended recipient(s) is prohibited. 
If you receive this e-mail in error, please notify the sender by phone or email immediately and delete it! ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [PATCH 0/6 v2] ocfs2: Avoid pending orphaned inodes
Hi Jan, thanks a lot for these patches. They all look good to me ... I just have one question on patch 5 Thanks, --Srini On 02/20/2014 07:18 AM, Jan Kara wrote: Hello, here is a second version of my patchset to solve deadlocks when we do not defer dropping of the inode reference from the downconvert worker. I have tested the patches (also with lockdep enabled) and they seem to work fine. Comments are welcome! Honza ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [PATCH 3/6] quota: Provide function to grab quota structure reference
looks good to me Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com On 02/20/2014 07:18 AM, Jan Kara wrote: Provide dqgrab() function to get quota structure reference when we are sure it already has at least one active reference. Make use of this function inside quota code. Signed-off-by: Jan Kara j...@suse.cz --- fs/quota/dquot.c | 4 ++-- include/linux/quotaops.h | 8 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 831d49a4111f..e3f09e34d0b2 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -528,7 +528,7 @@ restart: if (atomic_read(dquot-dq_count)) { DEFINE_WAIT(wait); - atomic_inc(dquot-dq_count); + dqgrab(dquot); prepare_to_wait(dquot-dq_wait_unused, wait, TASK_UNINTERRUPTIBLE); spin_unlock(dq_list_lock); @@ -624,7 +624,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type) /* Now we have active dquot from which someone is * holding reference so we can safely just increase * use count */ - atomic_inc(dquot-dq_count); + dqgrab(dquot); spin_unlock(dq_list_lock); dqstats_inc(DQST_LOOKUPS); err = sb-dq_op-write_dquot(dquot); diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 6965fe394c3b..1d3eee594cd6 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -46,6 +46,14 @@ void inode_reclaim_rsv_space(struct inode *inode, qsize_t number); void dquot_initialize(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); +static inline struct dquot *dqgrab(struct dquot *dquot) +{ + /* Make sure someone else has active reference to dquot */ + WARN_ON_ONCE(!atomic_read(dquot-dq_count)); + WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, dquot-dq_flags)); + atomic_inc(dquot-dq_count); + return dquot; +} void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com 
https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [PATCH 4/6] ocfs2: Implement delayed dropping of last dquot reference
looks good to me Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com On 02/20/2014 07:18 AM, Jan Kara wrote: We cannot drop last dquot reference from downconvert thread as that creates the following deadlock: NODE 1 NODE2 holds dentry lock for 'foo' holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE dquot_initialize(bar) ocfs2_dquot_acquire() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) ... downconvert thread (triggered from another node or a different process from NODE2) ocfs2_dentry_post_unlock() ... iput(foo) ocfs2_evict_inode(foo) ocfs2_clear_inode(foo) dquot_drop(inode) ... ocfs2_dquot_release() ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE) - blocks finds we need more space in quota file ... ocfs2_extend_no_holes() ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE) - deadlocks waiting for downconvert thread We solve the problem by postponing dropping of the last dquot reference to a workqueue if it happens from the downconvert thread. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/ocfs2.h| 5 + fs/ocfs2/quota.h| 2 ++ fs/ocfs2/quota_global.c | 35 +++ fs/ocfs2/super.c| 8 4 files changed, 50 insertions(+) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 553f53cc73ae..64c02239ba46 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -30,6 +30,7 @@ #include linux/sched.h #include linux/wait.h #include linux/list.h +#include linux/llist.h #include linux/rbtree.h #include linux/workqueue.h #include linux/kref.h @@ -419,6 +420,10 @@ struct ocfs2_super struct ocfs2_dentry_lock *dentry_lock_list; struct work_struct dentry_lock_work; + /* List of dquot structures to drop last reference to */ + struct llist_head dquot_drop_list; + struct work_struct dquot_drop_work; + wait_queue_head_t osb_mount_event; /* Truncate log info */ diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index d5ab56cbe5c5..f266d67df3c6 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -28,6 +28,7 @@ struct ocfs2_dquot { unsigned int dq_use_count; /* Number of nodes having reference to this entry in 
global quota file */ s64 dq_origspace; /* Last globally synced space usage */ s64 dq_originodes; /* Last globally synced inode usage */ + struct llist_node list; /* Member of list of dquots to drop */ }; /* Description of one chunk to recover in memory */ @@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block, int ocfs2_create_local_dquot(struct dquot *dquot); int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot); int ocfs2_local_write_dquot(struct dquot *dquot); +void ocfs2_drop_dquot_refs(struct work_struct *work); extern const struct dquot_operations ocfs2_quota_operations; extern struct quota_format_type ocfs2_quota_format; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index aaa50611ec66..7921e209c64b 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -10,6 +10,7 @@ #include linux/jiffies.h #include linux/writeback.h #include linux/workqueue.h +#include linux/llist.h #include cluster/masklog.h @@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) OCFS2_INODE_UPDATE_CREDITS; } +void ocfs2_drop_dquot_refs(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, +dquot_drop_work); + struct llist_node *list; + struct ocfs2_dquot *odquot, *next_odquot; + + list = llist_del_all(osb-dquot_drop_list); + llist_for_each_entry_safe(odquot, next_odquot, list, list) { + /* Drop the reference we acquired in ocfs2_dquot_release() */ + dqput(odquot-dq_dquot); + } +} + +/* + * Called when the last reference to dquot is dropped. If we are called from + * downconvert thread, we cannot do all the handling here because grabbing + * quota lock could deadlock (the node holding the quota lock could need some + * other cluster lock to proceed but with blocked downconvert thread we cannot + * release any lock). + */ static int
Re: [Ocfs2-devel] [PATCH 1/6] ocfs2: Remove OCFS2_INODE_SKIP_DELETE flag
Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com On 02/20/2014 07:18 AM, Jan Kara wrote: The flag was never set, delete it. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/inode.c | 6 -- fs/ocfs2/inode.h | 8 +++- fs/ocfs2/journal.c | 6 -- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index f29a90fde619..b4baaefe4dd4 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -822,12 +822,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail_unlock; } - /* If we have allowd wipe of this inode for another node, it - * will be marked here so we can safely skip it. Recovery will - * cleanup any inodes we might inadvertently skip here. */ - if (oi-ip_flags OCFS2_INODE_SKIP_DELETE) - goto bail_unlock; - ret = 1; bail_unlock: spin_unlock(oi-ip_lock); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 621fc73bf23d..f60bc314ee0a 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -84,8 +84,6 @@ struct ocfs2_inode_info #define OCFS2_INODE_BITMAP 0x0004 /* This inode has been wiped from disk */ #define OCFS2_INODE_DELETED 0x0008 -/* Another node is deleting, so our delete is a nop */ -#define OCFS2_INODE_SKIP_DELETE 0x0010 /* Has the inode been orphaned on another node? * * This hints to ocfs2_drop_inode that it should clear i_nlink before @@ -100,11 +98,11 @@ struct ocfs2_inode_info * rely on ocfs2_delete_inode to sort things out under the proper * cluster locks. 
*/ -#define OCFS2_INODE_MAYBE_ORPHANED 0x0020 +#define OCFS2_INODE_MAYBE_ORPHANED 0x0010 /* Does someone have the file open O_DIRECT */ -#define OCFS2_INODE_OPEN_DIRECT 0x0040 +#define OCFS2_INODE_OPEN_DIRECT 0x0020 /* Tell the inode wipe code it's not in orphan dir */ -#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x0080 +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x0040 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 44fc3e530c3d..03ea9314fecd 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi-ip_next_orphan; spin_lock(oi-ip_lock); - /* The remote delete code may have set these on the - * assumption that the other node would wipe them - * successfully. If they are still in the node's - * orphan dir, we need to reset that state. */ - oi-ip_flags = ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE); - /* Set the proper information to get us going into * ocfs2_delete_inode. */ oi-ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [PATCH 2/6] ocfs2: Move dquot_initialize() in ocfs2_delete_inode() somewhat later
looks good to me Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com On 02/20/2014 07:18 AM, Jan Kara wrote: Move dquot_initalize() call in ocfs2_delete_inode() after the moment we verify inode is actually a sane one to delete. We certainly don't want to initialize quota for system inodes etc. This also avoids calling into quota code from downconvert thread. Add more details into the comment why bailing out from ocfs2_delete_inode() when we are in downconvert thread is OK. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/inode.c | 16 +--- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index b4baaefe4dd4..3b0d722de35e 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -804,11 +804,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode) goto bail; } - /* If we're coming from downconvert_thread we can't go into our own - * voting [hello, deadlock city!], so unforuntately we just - * have to skip deleting this guy. That's OK though because - * the node who's doing the actual deleting should handle it - * anyway. */ + /* + * If we're coming from downconvert_thread we can't go into our own + * voting [hello, deadlock city!] so we cannot delete the inode. But + * since we dropped last inode ref when downconverting dentry lock, + * we cannot have the file open and thus the node doing unlink will + * take care of deleting the inode. + */ if (current == osb-dc_task) goto bail; @@ -954,8 +956,6 @@ static void ocfs2_delete_inode(struct inode *inode) if (is_bad_inode(inode) || !OCFS2_I(inode)-ip_blkno) goto bail; - dquot_initialize(inode); - if (!ocfs2_inode_is_valid_to_delete(inode)) { /* It's probably not necessary to truncate_inode_pages * here but we do it for safety anyway (it will most @@ -964,6 +964,8 @@ static void ocfs2_delete_inode(struct inode *inode) goto bail; } + dquot_initialize(inode); + /* We want to block signals in delete_inode as the lock and * messaging paths may return us -ERESTARTSYS. 
Which would * cause us to exit early, resulting in inodes being orphaned ___ Ocfs2-devel mailing list Ocfs2-devel@oss.oracle.com https://oss.oracle.com/mailman/listinfo/ocfs2-devel
Re: [Ocfs2-devel] [PATCH 5/6] ocfs2: Avoid blocking in ocfs2_mark_lockres_freeing() in downconvert thread
I like the idea of dc_task handling queued basts in ocfs2_mark_lockres_freeing. I am wondering if we should call lockres-l_ops-post_unlock(osb, lockres) ? Would there be another node waiting for a bast response ? On 02/20/2014 07:18 AM, Jan Kara wrote: If we are dropping last inode reference from downconvert thread, we will end up calling ocfs2_mark_lockres_freeing() which can block if the lock we are freeing is queued thus creating an A-A deadlock. Luckily, since we are the downconvert thread, we can immediately dequeue the lock and thus avoid waiting in this case. Signed-off-by: Jan Kara j...@suse.cz --- fs/ocfs2/dlmglue.c | 33 +++-- fs/ocfs2/dlmglue.h | 3 ++- fs/ocfs2/inode.c | 7 --- 3 files changed, 37 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 19986959d149..b7580157ef01 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3150,7 +3150,8 @@ out: * it safe to drop. * * You can *not* attempt to call cluster_lock on this lockres anymore. */ -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres) { int status; struct ocfs2_mask_waiter mw; @@ -3160,6 +3161,33 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) spin_lock_irqsave(lockres-l_lock, flags); lockres-l_flags |= OCFS2_LOCK_FREEING; + if (lockres-l_flags OCFS2_LOCK_QUEUED current == osb-dc_task) { + unsigned long flags; + + /* + * We know the downconvert is queued but not in progress + * because we are the downconvert thread and processing + * different lock. So we can just remove the lock from the + * queue. This is not only an optimization but also a way + * to avoid the following deadlock: + * ocfs2_dentry_post_unlock() + * ocfs2_dentry_lock_put() + * ocfs2_drop_dentry_lock() + * iput() + * ocfs2_evict_inode() + * ocfs2_clear_inode() + * ocfs2_mark_lockres_freeing() + * ... 
blocks waiting for OCFS2_LOCK_QUEUED + * since we are the downconvert thread which + * should clear the flag. + */ + spin_lock_irqsave(osb-dc_task_lock, flags); + list_del_init(lockres-l_blocked_list); + osb-blocked_lock_count--; + spin_unlock_irqrestore(osb-dc_task_lock, flags); + lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED); + goto out_unlock; + } while (lockres-l_flags OCFS2_LOCK_QUEUED) { lockres_add_mask_waiter(lockres, mw, OCFS2_LOCK_QUEUED, 0); spin_unlock_irqrestore(lockres-l_lock, flags); @@ -3172,6 +3200,7 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres) spin_lock_irqsave(lockres-l_lock, flags); } +out_unlock: spin_unlock_irqrestore(lockres-l_lock, flags); } @@ -3180,7 +3209,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, { int ret; - ocfs2_mark_lockres_freeing(lockres); + ocfs2_mark_lockres_freeing(osb, lockres); ret = ocfs2_drop_lock(osb, lockres); if (ret) mlog_errno(ret); diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 1d596d8c4a4a..d293a22c32c5 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex); void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex); -void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); +void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb, + struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 3b0d722de35e..9661f8db21dc 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -1053,6 +1053,7 @@ static void ocfs2_clear_inode(struct inode *inode) { int status; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_super *osb = OCFS2_SB(inode-i_sb); clear_inode(inode); trace_ocfs2_clear_inode((unsigned long long)oi-ip_blkno, @@ -1069,9 +1070,9 @@ static void ocfs2_clear_inode(struct inode *inode) /* Do these before all the other work so 
that we don't bounce * the downconvert thread while waiting to destroy the locks. */ - ocfs2_mark_lockres_freeing(oi-ip_rw_lockres); -