[Ocfs2-devel] [PATCH] ocfs2: Fix quota file corruption

2014-02-20 Thread Jan Kara
Global quota files are accessed from different nodes. Thus we cannot
cache the offset of a quota structure in the quota file after we drop
our node reference count to it, because after that moment the quota
structure may be freed and reallocated elsewhere by a different node,
resulting in corruption of the quota file.
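
Schematically (an illustrative timeline, not text from the patch itself):

NODE 1                                  NODE 2
ocfs2_release_dquot()
  drops the node's reference in the
  global quota file; the in-memory
  dquot goes to the freelist with
  dq_off still cached
                                        frees the structure at dq_off and
                                        reallocates that space in the
                                        quota file for a different id
ocfs2_acquire_dquot()
  reuses the stale dq_off
   - overwrites the other id's
     structure, corrupting the file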

Fix the problem by clearing dq_off when we are releasing the dquot
structure. We also remove the DQ_READ_B handling because it is useless -
DQ_ACTIVE_B is set iff DQ_READ_B is set.

CC: sta...@vger.kernel.org
CC: Goldwyn Rodrigues rgold...@suse.de
CC: Mark Fasheh mfas...@suse.de
Signed-off-by: Jan Kara j...@suse.cz
---
 fs/ocfs2/quota_global.c | 27 +--
 fs/ocfs2/quota_local.c  |  4 
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index aaa50611ec66..d7b5108789e2 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -717,6 +717,12 @@ static int ocfs2_release_dquot(struct dquot *dquot)
 */
	if (status < 0)
mlog_errno(status);
+   /*
+* Clear dq_off so that we search for the structure in quota file next
+* time we acquire it. The structure might be deleted and reallocated
+* elsewhere by another node while our dquot structure is on freelist.
+*/
+	dquot->dq_off = 0;
	clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
 out_trans:
ocfs2_commit_trans(osb, handle);
@@ -756,16 +762,17 @@ static int ocfs2_acquire_dquot(struct dquot *dquot)
status = ocfs2_lock_global_qf(info, 1);
	if (status < 0)
goto out;
-	if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
-	status = ocfs2_qinfo_lock(info, 0);
-	if (status < 0)
-	goto out_dq;
-	status = qtree_read_dquot(&info->dqi_gi, dquot);
-	ocfs2_qinfo_unlock(info, 0);
-	if (status < 0)
-	goto out_dq;
-	}
-	set_bit(DQ_READ_B, &dquot->dq_flags);
+	status = ocfs2_qinfo_lock(info, 0);
+	if (status < 0)
+	goto out_dq;
+	/*
+	 * We always want to read dquot structure from disk because we don't
+	 * know what happened with it while it was on freelist.
+	 */
+	status = qtree_read_dquot(&info->dqi_gi, dquot);
+	ocfs2_qinfo_unlock(info, 0);
+	if (status < 0)
+	goto out_dq;
 
	OCFS2_DQUOT(dquot)->dq_use_count++;
	OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace;
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 2e4344be3b96..2001862bf2b1 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -1303,10 +1303,6 @@ int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot)
	ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh);
 
 out:
-	/* Clear the read bit so that next time someone uses this
-	 * dquot he reads fresh info from disk and allocates local
-	 * dquot structure */
-	clear_bit(DQ_READ_B, &dquot->dq_flags);
return status;
 }
 
-- 
1.8.1.4


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH V2] ocfs2: dlm: fix recovery hung

2014-02-20 Thread Junxiao Bi
There is a race window in dlm_do_recovery() between dlm_remaster_locks()
and dlm_reset_recovery() when the recovery master has nearly finished the
recovery process for a dead node. After the master sends the FINALIZE_RECO
message in dlm_remaster_locks(), another node may become the recovery
master for another dead node and send the BEGIN_RECO message to all nodes,
including the old master. In the old master's handler of this message,
dlm_begin_reco_handler(), dlm->reco.dead_node and dlm->reco.new_master are
set to the second dead node and the new master; then, in
dlm_reset_recovery(), these two variables are reset to their default
values. As a result the new recovery master cannot finish the recovery
process and hangs, and eventually the whole cluster hangs waiting for
recovery.

old recovery master:                 new recovery master:
dlm_remaster_locks()
                                     become recovery master for
                                     another dead node.
                                     dlm_send_begin_reco_message()
dlm_begin_reco_handler()
{
 if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
  return -EAGAIN;
 }
 dlm_set_reco_master(dlm, br->node_idx);
 dlm_set_reco_dead_node(dlm, br->dead_node);
}
dlm_reset_recovery()
{
 dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
 dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
}
                                     will hang in dlm_remaster_locks()
                                     waiting for dlm locks info

Before sending the FINALIZE_RECO message, the recovery master should set
DLM_RECO_STATE_FINALIZE for itself and clear it after the recovery is
done. This closes the race window, since BEGIN_RECO messages will not be
handled before the DLM_RECO_STATE_FINALIZE flag is cleared.

A similar race may happen between the new recovery master and a normal
node that is in dlm_finalize_reco_handler(); fix that as well.

Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com
Reviewed-by: Wengang Wang wen.gang.w...@oracle.com
Cc: sta...@vger.kernel.org
Signed-off-by: Junxiao Bi junxiao...@oracle.com
---
 fs/ocfs2/dlm/dlmrecovery.c |   15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7035af0..8179bd9 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -537,7 +537,10 @@ master_here:
/* success!  see if any other nodes need recovery */
	mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
	     dlm->name, dlm->reco.dead_node, dlm->node_num);
-	dlm_reset_recovery(dlm);
+	spin_lock(&dlm->spinlock);
+	__dlm_reset_recovery(dlm);
+	dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+	spin_unlock(&dlm->spinlock);
}
dlm_end_recovery(dlm);
 
@@ -695,6 +698,14 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
if (all_nodes_done) {
int ret;
 
+	/* Set this flag on the recovery master to avoid
+	 * a new recovery for another dead node starting
+	 * before the current recovery is done. That may
+	 * cause the recovery to hang. */
+	spin_lock(&dlm->spinlock);
+	dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+	spin_unlock(&dlm->spinlock);
+
/* all nodes are now in DLM_RECO_NODE_DATA_DONE state
 * just send a finalize message to everyone and
 * clean up */
@@ -2882,8 +2893,8 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data,
	BUG();
	}
	dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+	__dlm_reset_recovery(dlm);
	spin_unlock(&dlm->spinlock);
-   dlm_reset_recovery(dlm);
dlm_kick_recovery_thread(dlm);
break;
default:
-- 
1.7.9.5


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH] fsck.ocfs2: Do not complain about oversized quota files

2014-02-20 Thread Jan Kara
Quota files can have blocks beyond i_size. This is correct because quota
code reserves these blocks to allow for extension of quota files without
the need for allocation. So make fsck.ocfs2 not complain about them.

Signed-off-by: Jan Kara j...@suse.cz
---
 fsck.ocfs2/pass1.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/fsck.ocfs2/pass1.c b/fsck.ocfs2/pass1.c
index 22dafdf50647..5118e6c630c4 100644
--- a/fsck.ocfs2/pass1.c
+++ b/fsck.ocfs2/pass1.c
@@ -832,6 +832,12 @@ out:
return ret;
 }
 
+static int is_quota_file(uint64_t ino)
+{
+   return ino == USER_QUOTA_SYSTEM_INODE ||
+  ino == GROUP_QUOTA_SYSTEM_INODE;
+}
+
 /*
  * this verifies i_size and i_clusters for inodes that use i_list to
  * reference extents of data.
@@ -1002,6 +1008,7 @@ size_cluster_check:
 *   i_clusters, even on a sparsed filesystem
 */
	if (!S_ISLNK(di->i_mode) && !S_ISDIR(di->i_mode) &&
+	    !is_quota_file(di->i_blkno) &&
	    di->i_size <= unexpected &&
	    prompt(ost, PY, PR_INODE_SPARSE_SIZE, "Inode "
		   "%"PRIu64" has a size of %"PRIu64" but has %"PRIu64
-- 
1.8.1.4


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH 6/6] ocfs2: Revert iput deferring code in ocfs2_drop_dentry_lock

2014-02-20 Thread Jan Kara
From: Goldwyn Rodrigues rgold...@suse.de

The following patches are reverted in this patch because they caused a
performance regression in remote unlink() calls.

ea455f8ab68338ba69f5d3362b342c115bea8e13 - ocfs2: Push out dropping of dentry lock to ocfs2_wq
f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a - ocfs2: Fix deadlock on umount
5fd131893793567c361ae64cbeb28a2a753bbe35 - ocfs2: Don't oops in ocfs2_kill_sb on a failed mount

Previous patches in this series removed the possible deadlocks from the
downconvert thread, so the above patches shouldn't be needed anymore.

The regression is caused because these patches delay the iput() for
dentry unlocks. This also delays the unlocking of the open lockres. The
open lock resource is required to test whether the inode can be wiped
from disk or not. When the deleting node does not get the open lock, it
marks the inode as orphaned (even though it is not in use by another
node/process) and causes a journal checkpoint. This delays operations
following the inode eviction. It also moves the inode to the orphan
directory, which causes further I/O and a lot of unnecessary orphans.

The following script can be used to generate the load causing issues:
declare -a create
declare -a remove
declare -a iterations=(1 2 4 8 16 32 64 128 256 512 1024 2048 4096 8192 16384)
unique=`mktemp -u XXXXX`
script=/tmp/idontknow-${unique}.sh
cat <<EOF > ${script}
for n in {1..8}; do mkdir -p test/dir\${n}
  eval touch test/dir\${n}/foo{1..\$1}
done
EOF
chmod 700 ${script}

function fcreate ()
{
  exec 2>&1 /usr/bin/time --format=%E ${script} $1
}

function fremove ()
{
  exec 2>&1 /usr/bin/time --format=%E ssh node2 "cd `pwd`; rm -Rf test*"
}

function fcp ()
{
  exec 2>&1 /usr/bin/time --format=%E ssh node3 "cd `pwd`; cp -R test test.new"
}

echo ---------------------------------------------
echo "| # files | create #s | copy #s | remove #s |"
echo ---------------------------------------------
for ((x=0; x < ${#iterations[*]} ; x++)) do
  create[$x]=`fcreate ${iterations[$x]}`
  copy[$x]=`fcp ${iterations[$x]}`
  remove[$x]=`fremove`
  printf "| %8d | %9s | %9s | %9s |\n" ${iterations[$x]} ${create[$x]} ${copy[$x]} ${remove[$x]}
done
rm ${script}
echo ---------------------------------------------

Signed-off-by: Srinivas Eeda srinivas.e...@oracle.com
Signed-off-by: Goldwyn Rodrigues rgold...@suse.com
Signed-off-by: Jan Kara j...@suse.cz
---
 fs/ocfs2/dcache.c | 61 +++
 fs/ocfs2/dcache.h | 12 +--
 fs/ocfs2/ocfs2.h  | 28 -
 fs/ocfs2/super.c  | 30 +--
 4 files changed, 9 insertions(+), 122 deletions(-)

diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c
index 0d3a97d2d5f6..e2e05a106beb 100644
--- a/fs/ocfs2/dcache.c
+++ b/fs/ocfs2/dcache.c
@@ -37,7 +37,6 @@
 #include "dlmglue.h"
 #include "file.h"
 #include "inode.h"
-#include "super.h"
 #include "ocfs2_trace.h"
 
 void ocfs2_dentry_attach_gen(struct dentry *dentry)
@@ -346,52 +345,6 @@ out_attach:
return ret;
 }
 
-DEFINE_SPINLOCK(dentry_list_lock);
-
-/* We limit the number of dentry locks to drop in one go. We have
- * this limit so that we don't starve other users of ocfs2_wq. */
-#define DL_INODE_DROP_COUNT 64
-
-/* Drop inode references from dentry locks */
-static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count)
-{
-   struct ocfs2_dentry_lock *dl;
-
-	spin_lock(&dentry_list_lock);
-	while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) {
-	dl = osb->dentry_lock_list;
-	osb->dentry_lock_list = dl->dl_next;
-	spin_unlock(&dentry_list_lock);
-	iput(dl->dl_inode);
-	kfree(dl);
-	spin_lock(&dentry_list_lock);
-	}
-	spin_unlock(&dentry_list_lock);
-}
-
-void ocfs2_drop_dl_inodes(struct work_struct *work)
-{
-   struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
-  dentry_lock_work);
-
-   __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT);
-   /*
-* Don't queue dropping if umount is in progress. We flush the
-* list in ocfs2_dismount_volume
-*/
-	spin_lock(&dentry_list_lock);
-	if (osb->dentry_lock_list &&
-	    !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED))
-	queue_work(ocfs2_wq, &osb->dentry_lock_work);
-	spin_unlock(&dentry_list_lock);
-}
-
-/* Flush the whole work queue */
-void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
-{
-   __ocfs2_drop_dl_inodes(osb, -1);
-}
-
 /*
  * ocfs2_dentry_iput() and friends.
  *
@@ -416,24 +369,16 @@ void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb)
 static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb,
   struct ocfs2_dentry_lock *dl)
 {
+	iput(dl->dl_inode);
	ocfs2_simple_drop_lockres(osb, &dl->dl_lockres);
	ocfs2_lock_res_free(&dl->dl_lockres);
-
-   /* We leave dropping of inode reference to 

[Ocfs2-devel] [PATCH 0/6 v2] ocfs2: Avoid pending orphaned inodes

2014-02-20 Thread Jan Kara
  Hello,

  here is a second version of my patchset to solve deadlocks when we do not
defer dropping of an inode reference from the downconvert worker. I have
tested the patches (also with lockdep enabled) and they seem to work fine.
Comments are welcome!

Honza

___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH 4/6] ocfs2: Implement delayed dropping of last dquot reference

2014-02-20 Thread Jan Kara
We cannot drop last dquot reference from downconvert thread as that
creates the following deadlock:

NODE 1                                  NODE 2
holds dentry lock for 'foo'
holds inode lock for GLOBAL_BITMAP_SYSTEM_INODE
                                        dquot_initialize(bar)
                                          ocfs2_dquot_acquire()
                                            ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE)
                                            ...
downconvert thread (triggered from another
node or a different process from NODE2)
  ocfs2_dentry_post_unlock()
    ...
    iput(foo)
      ocfs2_evict_inode(foo)
        ocfs2_clear_inode(foo)
          dquot_drop(inode)
            ...
            ocfs2_dquot_release()
              ocfs2_inode_lock(USER_QUOTA_SYSTEM_INODE)
               - blocks
                                            finds we need more space in
                                            quota file
                                            ...
                                            ocfs2_extend_no_holes()
                                              ocfs2_inode_lock(GLOBAL_BITMAP_SYSTEM_INODE)
                                                - deadlocks waiting for
                                                  downconvert thread

We solve the problem by postponing dropping of the last dquot reference
to a workqueue if it happens from the downconvert thread.
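
The release-side hunk is truncated below; schematically it amounts to the
following sketch (illustrative only, built from the helpers this patch and
patch 3/6 add, not a verbatim copy of the hunk):

	/* In ocfs2_release_dquot(), when running as the downconvert thread: */
	if (current == osb->dc_task) {
		/* keep the dquot alive for the worker (sketch) */
		dqgrab(dquot);
		/* queue it; ocfs2_drop_dquot_refs() does the final dqput() */
		if (llist_add(&OCFS2_DQUOT(dquot)->list, &osb->dquot_drop_list))
			queue_work(ocfs2_wq, &osb->dquot_drop_work);
		return 0;
	}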

Signed-off-by: Jan Kara j...@suse.cz
---
 fs/ocfs2/ocfs2.h|  5 +
 fs/ocfs2/quota.h|  2 ++
 fs/ocfs2/quota_global.c | 35 +++
 fs/ocfs2/super.c|  8 
 4 files changed, 50 insertions(+)

diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 553f53cc73ae..64c02239ba46 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -30,6 +30,7 @@
 #include <linux/sched.h>
 #include <linux/wait.h>
 #include <linux/list.h>
+#include <linux/llist.h>
 #include <linux/rbtree.h>
 #include <linux/workqueue.h>
 #include <linux/kref.h>
@@ -419,6 +420,10 @@ struct ocfs2_super
struct ocfs2_dentry_lock *dentry_lock_list;
struct work_struct dentry_lock_work;
 
+   /* List of dquot structures to drop last reference to */
+   struct llist_head dquot_drop_list;
+   struct work_struct dquot_drop_work;
+
wait_queue_head_t   osb_mount_event;
 
/* Truncate log info */
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index d5ab56cbe5c5..f266d67df3c6 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -28,6 +28,7 @@ struct ocfs2_dquot {
	unsigned int dq_use_count;	/* Number of nodes having reference to this entry in global quota file */
s64 dq_origspace;   /* Last globally synced space usage */
s64 dq_originodes;  /* Last globally synced inode usage */
+   struct llist_node list; /* Member of list of dquots to drop */
 };
 
 /* Description of one chunk to recover in memory */
@@ -110,6 +111,7 @@ int ocfs2_read_quota_phys_block(struct inode *inode, u64 p_block,
 int ocfs2_create_local_dquot(struct dquot *dquot);
 int ocfs2_local_release_dquot(handle_t *handle, struct dquot *dquot);
 int ocfs2_local_write_dquot(struct dquot *dquot);
+void ocfs2_drop_dquot_refs(struct work_struct *work);
 
 extern const struct dquot_operations ocfs2_quota_operations;
 extern struct quota_format_type ocfs2_quota_format;
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index aaa50611ec66..7921e209c64b 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -10,6 +10,7 @@
 #include <linux/jiffies.h>
 #include <linux/writeback.h>
 #include <linux/workqueue.h>
+#include <linux/llist.h>
 
 #include "cluster/masklog.h"
 
@@ -679,6 +680,27 @@ static int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
   OCFS2_INODE_UPDATE_CREDITS;
 }
 
+void ocfs2_drop_dquot_refs(struct work_struct *work)
+{
+   struct ocfs2_super *osb = container_of(work, struct ocfs2_super,
+  dquot_drop_work);
+   struct llist_node *list;
+   struct ocfs2_dquot *odquot, *next_odquot;
+
+	list = llist_del_all(&osb->dquot_drop_list);
+	llist_for_each_entry_safe(odquot, next_odquot, list, list) {
+	/* Drop the reference we acquired in ocfs2_dquot_release() */
+	dqput(&odquot->dq_dquot);
+   }
+}
+
+/*
+ * Called when the last reference to dquot is dropped. If we are called from
+ * downconvert thread, we cannot do all the handling here because grabbing
+ * quota lock could deadlock (the node holding the quota lock could need some
+ * other cluster lock to proceed but with blocked downconvert thread we cannot
+ * release any lock).
+ */
 static int ocfs2_release_dquot(struct dquot *dquot)
 {
handle_t *handle;
@@ -694,6 +716,19 @@ static int ocfs2_release_dquot(struct dquot *dquot)
/* Check whether we are not racing with some other dqget() */
	if (atomic_read(&dquot->dq_count) 

[Ocfs2-devel] [PATCH 3/6] quota: Provide function to grab quota structure reference

2014-02-20 Thread Jan Kara
Provide dqgrab() function to get quota structure reference when we are
sure it already has at least one active reference. Make use of this
function inside quota code.
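
As a minimal illustration of the intended contract (hypothetical caller,
not part of the patch): dqget() is the lookup path and may sleep, while
dqgrab() only bumps the use count of a dquot that is guaranteed to have
at least one active reference, so it is safe under dq_list_lock:

	/* under dq_list_lock; someone already holds a reference (sketch) */
	dqgrab(dquot);		/* atomic_inc() plus sanity WARN_ON_ONCEs */
	spin_unlock(&dq_list_lock);
	/* ... use the dquot ... */
	dqput(dquot);		/* drop the extra reference */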

Signed-off-by: Jan Kara j...@suse.cz
---
 fs/quota/dquot.c | 4 ++--
 include/linux/quotaops.h | 8 
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 831d49a4111f..e3f09e34d0b2 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -528,7 +528,7 @@ restart:
	if (atomic_read(&dquot->dq_count)) {
	DEFINE_WAIT(wait);
 
-	atomic_inc(&dquot->dq_count);
+	dqgrab(dquot);
	prepare_to_wait(&dquot->dq_wait_unused, &wait,
	TASK_UNINTERRUPTIBLE);
	spin_unlock(&dq_list_lock);
@@ -624,7 +624,7 @@ int dquot_writeback_dquots(struct super_block *sb, int type)
/* Now we have active dquot from which someone is
 * holding reference so we can safely just increase
 * use count */
-	atomic_inc(&dquot->dq_count);
+	dqgrab(dquot);
	spin_unlock(&dq_list_lock);
	dqstats_inc(DQST_LOOKUPS);
	err = sb->dq_op->write_dquot(dquot);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 6965fe394c3b..1d3eee594cd6 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -46,6 +46,14 @@ void inode_reclaim_rsv_space(struct inode *inode, qsize_t number);
 void dquot_initialize(struct inode *inode);
 void dquot_drop(struct inode *inode);
 struct dquot *dqget(struct super_block *sb, struct kqid qid);
+static inline struct dquot *dqgrab(struct dquot *dquot)
+{
+   /* Make sure someone else has active reference to dquot */
+	WARN_ON_ONCE(!atomic_read(&dquot->dq_count));
+	WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags));
+	atomic_inc(&dquot->dq_count);
+   return dquot;
+}
 void dqput(struct dquot *dquot);
 int dquot_scan_active(struct super_block *sb,
  int (*fn)(struct dquot *dquot, unsigned long priv),
-- 
1.8.1.4


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH 1/6] ocfs2: Remove OCFS2_INODE_SKIP_DELETE flag

2014-02-20 Thread Jan Kara
The flag was never set, delete it.

Signed-off-by: Jan Kara j...@suse.cz
---
 fs/ocfs2/inode.c   | 6 --
 fs/ocfs2/inode.h   | 8 +++-
 fs/ocfs2/journal.c | 6 --
 3 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f29a90fde619..b4baaefe4dd4 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -822,12 +822,6 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
goto bail_unlock;
}
 
-	/* If we have allowd wipe of this inode for another node, it
-	 * will be marked here so we can safely skip it. Recovery will
-	 * cleanup any inodes we might inadvertently skip here. */
-	if (oi->ip_flags & OCFS2_INODE_SKIP_DELETE)
-	goto bail_unlock;
-
ret = 1;
 bail_unlock:
	spin_unlock(&oi->ip_lock);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 621fc73bf23d..f60bc314ee0a 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -84,8 +84,6 @@ struct ocfs2_inode_info
 #define OCFS2_INODE_BITMAP 0x0004
 /* This inode has been wiped from disk */
 #define OCFS2_INODE_DELETED	0x0008
-/* Another node is deleting, so our delete is a nop */
-#define OCFS2_INODE_SKIP_DELETE	0x0010
 /* Has the inode been orphaned on another node?
  *
  * This hints to ocfs2_drop_inode that it should clear i_nlink before
@@ -100,11 +98,11 @@ struct ocfs2_inode_info
  * rely on ocfs2_delete_inode to sort things out under the proper
  * cluster locks.
  */
-#define OCFS2_INODE_MAYBE_ORPHANED 0x0020
+#define OCFS2_INODE_MAYBE_ORPHANED 0x0010
 /* Does someone have the file open O_DIRECT */
-#define OCFS2_INODE_OPEN_DIRECT	0x0040
+#define OCFS2_INODE_OPEN_DIRECT	0x0020
 /* Tell the inode wipe code it's not in orphan dir */
-#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x0080
+#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x0040
 
 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode)
 {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 44fc3e530c3d..03ea9314fecd 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -2132,12 +2132,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
	iter = oi->ip_next_orphan;
 
	spin_lock(&oi->ip_lock);
-	/* The remote delete code may have set these on the
-	 * assumption that the other node would wipe them
-	 * successfully.  If they are still in the node's
-	 * orphan dir, we need to reset that state. */
-	oi->ip_flags &= ~(OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE);
-
	/* Set the proper information to get us going into
	 * ocfs2_delete_inode. */
	oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-- 
1.8.1.4


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] [PATCH 5/6] ocfs2: Avoid blocking in ocfs2_mark_lockres_freeing() in downconvert thread

2014-02-20 Thread Jan Kara
If we are dropping the last inode reference from the downconvert thread,
we will end up calling ocfs2_mark_lockres_freeing(), which can block if
the lock we are freeing is queued, thus creating an A-A deadlock.
Luckily, since we are the downconvert thread, we can immediately dequeue
the lock and thus avoid waiting in this case.

Signed-off-by: Jan Kara j...@suse.cz
---
 fs/ocfs2/dlmglue.c | 33 +++--
 fs/ocfs2/dlmglue.h |  3 ++-
 fs/ocfs2/inode.c   |  7 ---
 3 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 19986959d149..b7580157ef01 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3150,7 +3150,8 @@ out:
  * it safe to drop.
  *
  * You can *not* attempt to call cluster_lock on this lockres anymore. */
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+   struct ocfs2_lock_res *lockres)
 {
int status;
struct ocfs2_mask_waiter mw;
@@ -3160,6 +3161,33 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
 
	spin_lock_irqsave(&lockres->l_lock, flags);
	lockres->l_flags |= OCFS2_LOCK_FREEING;
+	if (lockres->l_flags & OCFS2_LOCK_QUEUED && current == osb->dc_task) {
+   unsigned long flags;
+
+   /*
+* We know the downconvert is queued but not in progress
+* because we are the downconvert thread and processing
+* different lock. So we can just remove the lock from the
+* queue. This is not only an optimization but also a way
+* to avoid the following deadlock:
+*   ocfs2_dentry_post_unlock()
+* ocfs2_dentry_lock_put()
+*   ocfs2_drop_dentry_lock()
+* iput()
+*   ocfs2_evict_inode()
+* ocfs2_clear_inode()
+*   ocfs2_mark_lockres_freeing()
+* ... blocks waiting for OCFS2_LOCK_QUEUED
+* since we are the downconvert thread which
+* should clear the flag.
+*/
+	spin_lock_irqsave(&osb->dc_task_lock, flags);
+	list_del_init(&lockres->l_blocked_list);
+	osb->blocked_lock_count--;
+	spin_unlock_irqrestore(&osb->dc_task_lock, flags);
+   lockres_clear_flags(lockres, OCFS2_LOCK_QUEUED);
+   goto out_unlock;
+   }
	while (lockres->l_flags & OCFS2_LOCK_QUEUED) {
	lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_QUEUED, 0);
	spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -3172,6 +3200,7 @@ void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres)
 
	spin_lock_irqsave(&lockres->l_lock, flags);
}
+out_unlock:
	spin_unlock_irqrestore(&lockres->l_lock, flags);
 }
 
@@ -3180,7 +3209,7 @@ void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
 {
int ret;
 
-   ocfs2_mark_lockres_freeing(lockres);
+   ocfs2_mark_lockres_freeing(osb, lockres);
ret = ocfs2_drop_lock(osb, lockres);
if (ret)
mlog_errno(ret);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 1d596d8c4a4a..d293a22c32c5 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -157,7 +157,8 @@ int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
 void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
 
 
-void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
+void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
+   struct ocfs2_lock_res *lockres);
 void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
   struct ocfs2_lock_res *lockres);
 
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 3b0d722de35e..9661f8db21dc 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -1053,6 +1053,7 @@ static void ocfs2_clear_inode(struct inode *inode)
 {
int status;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
clear_inode(inode);
	trace_ocfs2_clear_inode((unsigned long long)oi->ip_blkno,
@@ -1069,9 +1070,9 @@ static void ocfs2_clear_inode(struct inode *inode)
 
/* Do these before all the other work so that we don't bounce
 * the downconvert thread while waiting to destroy the locks. */
-	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_inode_lockres);
-	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_rw_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_inode_lockres);
+	ocfs2_mark_lockres_freeing(osb, &oi->ip_open_lockres);
 

[Ocfs2-devel] [PATCH] quota: Fix race between dqput() and dquot_scan_active()

2014-02-20 Thread Jan Kara
Currently the last dqput() can race with dquot_scan_active(), causing it
to call the callback for an already deactivated dquot. The race is as
follows:

CPU1                                    CPU2
  dqput()
    spin_lock(&dq_list_lock);
    if (atomic_read(&dquot->dq_count) > 1) {
     - not taken
    if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
      spin_unlock(&dq_list_lock);
      ->release_dquot(dquot);
        if (atomic_read(&dquot->dq_count) > 1)
         - not taken
                                        dquot_scan_active()
                                          spin_lock(&dq_list_lock);
                                          if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
                                           - not taken
                                          atomic_inc(&dquot->dq_count);
                                          spin_unlock(&dq_list_lock);
        - proceeds to release dquot
                                          ret = fn(dquot, priv);
                                           - called for inactive dquot

Fix the problem by making sure a possible ->release_dquot() call is
finished by the time we call the callback, and that new calls to it will
notice the reference dquot_scan_active() has taken and bail out.

Signed-off-by: Jan Kara j...@suse.cz
---
 fs/quota/dquot.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

  This is the last patch needed to make ocfs2 quotas rock solid in my testing.
I will carry it in my tree and push it to Linus soon.

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 831d49a4111f..cfc8dcc16043 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -581,9 +581,17 @@ int dquot_scan_active(struct super_block *sb,
dqstats_inc(DQST_LOOKUPS);
dqput(old_dquot);
old_dquot = dquot;
-	ret = fn(dquot, priv);
-	if (ret < 0)
-	goto out;
+	/*
+	 * ->release_dquot() can be racing with us. Our reference
+	 * protects us from new calls to it so just wait for any
+	 * outstanding call and recheck the DQ_ACTIVE_B after that.
+	 */
+	wait_on_dquot(dquot);
+	if (test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) {
+	ret = fn(dquot, priv);
+	if (ret < 0)
+	goto out;
+	}
	spin_lock(&dq_list_lock);
/* We are safe to continue now because our dquot could not
 * be moved out of the inuse list while we hold the reference */
-- 
1.8.1.4


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


[Ocfs2-devel] Hi everyone, is it an issue? The host is blocked as the issue created. Thanks a lot.

2014-02-20 Thread Guozhonghua
Hi everyone, as we test the performance of ocfs2 with fio, one host of
the ocfs2 cluster gets blocked for a short time while the test case is
running and restarts soon after.
The test environment is six hosts sharing one iSCSI LUN of about 1T
capacity, formatted with ocfs2; the mount point on every host is
/vms/vStore.
All of the hosts' OS is Ubuntu 12.04; we upgraded the kernel to 3.2.50,
with ocfs2 compiled against kernel 3.2.50.
We test the performance of ocfs2 with fio on every host.

The fio test configuration is as below; the filename is different on
every host, such as file1...file5 on host1, file6...file10 on host2, and
so on.

One example fio file is as below:
root@cvknode4:~/fios_test4# cat 1024k_10r
[global]
ioengine=libaio
rw=read
bs=1024K
time_based
runtime=180
size=9g
direct=1
iodepth=1

[file1]
filename=/vms/vStor/file41

[file2]
filename=/vms/vStor/file42

[file3]
filename=/vms/vStor/file43

[file4]
filename=/vms/vStor/file44

[file5]
filename=/vms/vStor/file45

As we start the fio tools on the hosts in sequence, several minutes later
one host gets blocked and restarts (fenced).
Is this an issue of ocfs2? Or is there any fix patch for it?

The syslog is as below:
Feb 19 17:50:01 cvknode9 CRON[16143]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:50:01 cvknode9 CRON[16147]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:50:01 cvknode9 CRON[16146]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:50:01 cvknode9 CRON[16144]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:50:01 cvknode9 CRON[16141]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:50:02 cvknode9 CRON[16134]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:50:03 cvknode9 crmadmin: [16194]: ERROR: admin_message_timeout: No messages received in 2 seconds
Feb 19 17:50:03 cvknode9 CRON[16140]: (CRON) info (No MTA installed, discarding output)
Feb 19 17:51:00 cvknode9 kernel: [  803.464977] [ cut here ]
Feb 19 17:51:00 cvknode9 kernel: [  803.464991] WARNING: at kernel/watchdog.c:241 watchdog_overflow_callback+0x9a/0xc0()
Feb 19 17:51:00 cvknode9 kernel: [  803.464993] Hardware name: FlexServer B590
Feb 19 17:51:00 cvknode9 kernel: [  803.464995] Watchdog detected hard LOCKUP on cpu 0
Feb 19 17:51:00 cvknode9 kernel: [  803.464997] Modules linked in: ip6table_filter ip6_tables iptable_filter ip_tables ebtable_nat ebtables x_tables ocfs2(O) quota_tree drbd lru_cache 8021q garp stp vhost_net macvtap macvlan kvm_intel kvm ib_iser rdma_cm ib_cm iw_cm ib_sa ib_mad ib_core ib_addr iscsi_tcp libiscsi_tcp ocfs2_dlmfs(O) ocfs2_stack_o2cb(O) ocfs2_dlm(O) ocfs2_nodemanager(O) ocfs2_stackglue(O) configfs openvswitch_mod(O) nfsd nfs lockd fscache auth_rpcgss nfs_acl sunrpc psmouse ioatdma dm_multipath serio_raw sb_edac hpilo edac_core dca acpi_power_meter mac_hid lp parport hpsa be2iscsi iscsi_boot_sysfs libiscsi be2net scsi_transport_iscsi
Feb 19 17:51:00 cvknode9 kernel: [  803.465065] Pid: 6029, comm: ocfs2dc Tainted: G   O 3.2.50 #1
Feb 19 17:51:00 cvknode9 kernel: [  803.465067] Call Trace:
Feb 19 17:51:00 cvknode9 kernel: [  803.465069]  NMI  [81066daf] warn_slowpath_common+0x7f/0xc0
Feb 19 17:51:00 cvknode9 kernel: [  803.465084]  [81066ea6] warn_slowpath_fmt+0x46/0x50
Feb 19 17:51:00 cvknode9 kernel: [  803.465089]  [8101b833] ? native_sched_clock+0x13/0x80
Feb 19 17:51:00 cvknode9 kernel: [  803.465093]  [810d6b1a] watchdog_overflow_callback+0x9a/0xc0
Feb 19 17:51:00 cvknode9 kernel: [  803.465099]  [8110eb76] __perf_event_overflow+0x96/0x1f0
Feb 19 17:51:00 cvknode9 kernel: [  803.465103]  [8110c491] ? perf_event_update_userpage+0x11/0xc0
Feb 19 17:51:00 cvknode9 kernel: [  803.465109]  [8102468a] ? x86_perf_event_set_period+0xda/0x150
Feb 19 17:51:00 cvknode9 kernel: [  803.465113]  [8110f534] perf_event_overflow+0x14/0x20
Feb 19 17:51:00 cvknode9 kernel: [  803.465118]  [81028c93] intel_pmu_handle_irq+0x163/0x2e0
Feb 19 17:51:00 cvknode9 kernel: [  803.465130]  [81644b01] perf_event_nmi_handler+0x21/0x30
Feb 19 17:51:00 cvknode9 kernel: [  803.465134]  [816443d1] do_nmi+0x101/0x350
Feb 19 17:51:00 cvknode9 kernel: [  803.465138]  [81643a30] nmi+0x20/0x30
Feb 19 17:51:00 cvknode9 kernel: [  803.465147]  [8103db15] ? __ticket_spin_lock+0x25/0x30
Feb 19 17:51:00 cvknode9 kernel: [  803.465149]  EOE  IRQ  [81642fee] _raw_spin_lock+0xe/0x20
Feb 19 17:51:00 cvknode9 kernel: [  803.465216]  [a03e5487] ocfs2_wake_downconvert_thread+0x27/0x60 [ocfs2]
Feb 19 17:51:00 cvknode9 kernel: [  803.465231]  [a03e5554] __ocfs2_cluster_unlock.isra.32+0x94/0xf0 [ocfs2]
Feb 19 17:51:00 cvknode9 kernel: [  803.465245]  [a03e5b2b] ocfs2_rw_unlock+0x6b/0xe0 [ocfs2]
Feb 19 17:51:00 cvknode9 kernel: [  803.465252]  [811aa22f] ?

Re: [Ocfs2-devel] [Ocfs2-users] Hi everyone, is it an issue? The host is blocked as the issue created. Thanks a lot.

2014-02-20 Thread Marty Sweet
Hi,

By the looks of the call trace, this looks like an issue with
communicating with the iSCSI target.
I run 6 nodes with OCFS2 over Fibre Channel on Ubuntu 12.04 Linux 3.5
and Linux 3.13 (VM Cluster and Samba Cluster).

Before investigating this any further I would advise upgrading your
kernel to at least 3.8 (which is officially supported by Ubuntu). There
have been many improvements to OCFS2 in recent kernels which have
increased the stability (in at least our environment) substantially.

$ apt-get install linux-image-3.8.0-35-generic

If the problem still persists, does your iSCSI target have monitoring
statistics which you could look into? If the network link is becoming
saturated this could be the issue (especially if heartbeat is running
over the same interface).
Could you also let us know which node is fencing and if all subsequent
nodes receive this stack trace? Does the IO lock up? ( $ ls
/vms/vStore )

Marty

On Fri, Feb 21, 2014 at 3:28 AM, Guozhonghua guozhong...@h3c.com wrote:
 Hi everyone, as we test the performance of the ocfs2 with fio. As the test
 case running, one of host of ocfs2 cluster will be blocked a small time and
 restart sooner.


[Ocfs2-devel] Re: [Ocfs2-users] Hi everyone, is it an issue? The host is blocked as the issue created. Thanks a lot.

2014-02-20 Thread Guozhonghua
Hi Marty,

It is the latest host running the fio tools that frequently receives the
stack trace and gets fenced; the IO of the other hosts in the OCFS2
cluster locks up at the same time.

We will try the new kernel.

Thanks a lot.

Guozhonghua


-
This e-mail and its attachments contain confidential information from H3C, 
which is
intended only for the person or entity whose address is listed above. Any use 
of the
information contained herein in any way (including, but not limited to, total 
or partial
disclosure, reproduction, or dissemination) by persons other than the intended
recipient(s) is prohibited. If you receive this e-mail in error, please notify 
the sender
by phone or email immediately and delete it!
___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel

Re: [Ocfs2-devel] [PATCH 0/6 v2] ocfs2: Avoid pending orphaned inodes

2014-02-20 Thread Srinivas Eeda
Hi Jan,

thanks a lot for these patches. They all look good to me ... I just have 
one question on patch 5

Thanks,
--Srini

On 02/20/2014 07:18 AM, Jan Kara wrote:
Hello,

here is a second version of my patchset to solve a deadlocks when we do not
 defer dropping of inode reference from downconvert worker. I have tested the
 patches (also with lockdep enabled) and they seem to work fine. Comments are
 welcome!

   Honza


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


Re: [Ocfs2-devel] [PATCH 3/6] quota: Provide function to grab quota structure reference

2014-02-20 Thread Srinivas Eeda
looks good to me
Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com

On 02/20/2014 07:18 AM, Jan Kara wrote:
 Provide dqgrab() function to get quota structure reference when we are
 sure it already has at least one active reference. Make use of this
 function inside quota code.

 Signed-off-by: Jan Kara j...@suse.cz


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


Re: [Ocfs2-devel] [PATCH 4/6] ocfs2: Implement delayed dropping of last dquot reference

2014-02-20 Thread Srinivas Eeda
looks good to me
Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com

On 02/20/2014 07:18 AM, Jan Kara wrote:
 We cannot drop last dquot reference from downconvert thread as that
 creates the following deadlock:


Re: [Ocfs2-devel] [PATCH 1/6] ocfs2: Remove OCFS2_INODE_SKIP_DELETE flag

2014-02-20 Thread Srinivas Eeda
Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com

On 02/20/2014 07:18 AM, Jan Kara wrote:
 The flag was never set, delete it.

 Signed-off-by: Jan Kara j...@suse.cz


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


Re: [Ocfs2-devel] [PATCH 2/6] ocfs2: Move dquot_initialize() in ocfs2_delete_inode() somewhat later

2014-02-20 Thread Srinivas Eeda
looks good to me

Reviewed-by: Srinivas Eeda srinivas.e...@oracle.com

On 02/20/2014 07:18 AM, Jan Kara wrote:
 Move dquot_initialize() call in ocfs2_delete_inode() after the moment we
 verify inode is actually a sane one to delete. We certainly don't want
 to initialize quota for system inodes etc. This also avoids calling into
 quota code from downconvert thread.

 Add more details into the comment why bailing out from
 ocfs2_delete_inode() when we are in downconvert thread is OK.

 Signed-off-by: Jan Kara j...@suse.cz
 ---
   fs/ocfs2/inode.c | 16 +---
   1 file changed, 9 insertions(+), 7 deletions(-)

 diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
 index b4baaefe4dd4..3b0d722de35e 100644
 --- a/fs/ocfs2/inode.c
 +++ b/fs/ocfs2/inode.c
 @@ -804,11 +804,13 @@ static int ocfs2_inode_is_valid_to_delete(struct inode *inode)
   goto bail;
   }
   
 - /* If we're coming from downconvert_thread we can't go into our own
 -  * voting [hello, deadlock city!], so unforuntately we just
 -  * have to skip deleting this guy. That's OK though because
 -  * the node who's doing the actual deleting should handle it
 -  * anyway. */
 + /*
 +  * If we're coming from downconvert_thread we can't go into our own
 +  * voting [hello, deadlock city!] so we cannot delete the inode. But
 +  * since we dropped last inode ref when downconverting dentry lock,
 +  * we cannot have the file open and thus the node doing unlink will
 +  * take care of deleting the inode.
 +  */
	if (current == osb->dc_task)
   goto bail;
   
 @@ -954,8 +956,6 @@ static void ocfs2_delete_inode(struct inode *inode)
	if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno)
   goto bail;
   
 - dquot_initialize(inode);
 -
   if (!ocfs2_inode_is_valid_to_delete(inode)) {
   /* It's probably not necessary to truncate_inode_pages
* here but we do it for safety anyway (it will most
 @@ -964,6 +964,8 @@ static void ocfs2_delete_inode(struct inode *inode)
   goto bail;
   }
   
 + dquot_initialize(inode);
 +
   /* We want to block signals in delete_inode as the lock and
* messaging paths may return us -ERESTARTSYS. Which would
* cause us to exit early, resulting in inodes being orphaned


___
Ocfs2-devel mailing list
Ocfs2-devel@oss.oracle.com
https://oss.oracle.com/mailman/listinfo/ocfs2-devel


Re: [Ocfs2-devel] [PATCH 5/6] ocfs2: Avoid blocking in ocfs2_mark_lockres_freeing() in downconvert thread

2014-02-20 Thread Srinivas Eeda
I like the idea of dc_task handling queued basts in
ocfs2_mark_lockres_freeing.

I am wondering if we should call lockres->l_ops->post_unlock(osb,
lockres)? Would there be another node waiting for a bast response?

On 02/20/2014 07:18 AM, Jan Kara wrote:
 If we are dropping last inode reference from downconvert thread, we will
 end up calling ocfs2_mark_lockres_freeing() which can block if the lock
 we are freeing is queued thus creating an A-A deadlock. Luckily, since
 we are the downconvert thread, we can immediately dequeue the lock and
 thus avoid waiting in this case.

 Signed-off-by: Jan Kara j...@suse.cz