Re: [Ocfs2-devel] [PATCH] ocfs2/dlm: remove lockres from purge when a lock is added

2010-06-10 Thread Srinivas Eeda
Wengang, thanks for the patch. My comments are inline :)

On 6/8/2010 7:38 AM, Wengang Wang wrote:
 dlm_thread (when purging a lockres) races with another thread that is
 running dlmlock_remote(). dlmlock_remote() can add a lock to the blocked
 list of the lockres without taking dlm->spinlock. This can trigger a BUG()
 in dlm_thread because of the added lock.

 Fix:
 Take dlm->spinlock when adding a lock to the blocked list and make sure the
 lockres is not on the purge list. If we removed the lockres from the purge
 list, try to add it back.
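
 A simplified sketch of the interleaving this closes (illustration only,
 not the exact kernel code):

     /* thread A: dlmlock_remote(), before this patch */
     spin_lock(&res->spinlock);
     list_add_tail(&lock->list, &res->blocked); /* dlm->spinlock NOT held */
     spin_unlock(&res->spinlock);

     /* thread B: dlm_thread -> dlm_run_purge_list(), concurrently */
     spin_lock(&dlm->spinlock);
     /* res is still on the purge list, so the purge path runs and
      * hits the BUG() when it sees the lock thread A just queued */
     spin_unlock(&dlm->spinlock);

 With the patch, thread A takes dlm->spinlock first and calls
 __dlm_lockres_calc_usage(), which takes res off the purge list before the
 new lock can be seen there.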

 Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
 ---
  fs/ocfs2/dlm/dlmcommon.h |   15 +++
  fs/ocfs2/dlm/dlmlock.c   |   18 +++---
  fs/ocfs2/dlm/dlmthread.c |   28 
  3 files changed, 58 insertions(+), 3 deletions(-)

 diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
 index 4b6ae2c..134973a 100644
 --- a/fs/ocfs2/dlm/dlmcommon.h
 +++ b/fs/ocfs2/dlm/dlmcommon.h
 @@ -1002,6 +1002,9 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
  
  /* will exit holding res->spinlock, but may drop in function */
  void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags);
 +void __dlm_wait_on_lockres_flags_lock_dlm(struct dlm_ctxt *dlm,
 +   struct dlm_lock_resource *res,
 +   int flags);
 void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags);
  
  /* will exit holding res->spinlock, but may drop in function */
 @@ -1012,6 +1015,18 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
 DLM_LOCK_RES_MIGRATING));
  }
  
 +/*
 + * will exit holding dlm->spinlock and res->spinlock, but may drop in
 + * function
 + */
 +static inline void __dlm_wait_on_lockres_lock_dlm(struct dlm_ctxt *dlm,
 +   struct dlm_lock_resource *res)
 +{
 + __dlm_wait_on_lockres_flags_lock_dlm(dlm, res,
 +  (DLM_LOCK_RES_IN_PROGRESS|
 +   DLM_LOCK_RES_RECOVERING|
 +   DLM_LOCK_RES_MIGRATING));
 +}
 void __dlm_unlink_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
 void __dlm_insert_mle(struct dlm_ctxt *dlm, struct dlm_master_list_entry *mle);
  
 diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
 index 69cf369..fb3c178 100644
 --- a/fs/ocfs2/dlm/dlmlock.c
 +++ b/fs/ocfs2/dlm/dlmlock.c
 @@ -222,23 +222,35 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
 struct dlm_lock *lock, int flags)
  {
   enum dlm_status status = DLM_DENIED;
 - int lockres_changed = 1;
 + int lockres_changed = 1, recalc;
  
   mlog_entry("type=%d\n", lock->ml.type);
   mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
res->lockname.name, flags);
  
 + spin_lock(&dlm->spinlock);
   spin_lock(&res->spinlock);
  
   /* will exit this call with spinlock held */
 - __dlm_wait_on_lockres(res);
 + __dlm_wait_on_lockres_lock_dlm(dlm, res);
   res->state |= DLM_LOCK_RES_IN_PROGRESS;
  
   /* add lock to local (secondary) queue */
   dlm_lock_get(lock);
   list_add_tail(&lock->list, &res->blocked);
   lock->lock_pending = 1;
 + /*
 +  * make sure this lockres is not on the purge list. if we removed the
 +  * lockres from the purge list, we try to add it back if locking fails.
 +  *
 +  * otherwise there is a race with dlm_thread purging this lockres:
 +  * it can trigger a BUG() because we added a lock to the
 +  * blocked list.
 +  */
 + recalc = !list_empty(&res->purge);
 + __dlm_lockres_calc_usage(dlm, res);
   
Thanks for this patch, looks good. A few comments on another approach, 
let me know your thoughts.

The patch I sent for review marks the lockres as in use (before 
dlmlock_remote is called) in dlm_get_lockresource. I have to do that to 
protect the lockres from being unhashed once it is in use. But since the 
lockres is still on the purge list, the BUG you mentioned can still trigger 
in dlm_thread.

Once a lockres is on the purge list, there are quite a few cases (the case 
this patch is addressing, the case the patch I just sent for review is 
addressing, and the recovery case) that can trigger the lockres to be 
reused before or while it is getting purged. dlm_thread (dlm_run_purge_list) 
already expects that a lockres on the purge list could be just starting to 
be reused, but it doesn't handle the above cases. I think it's better to 
enhance the dlm_run_purge_list/dlm_purge_lockres code to handle these 
cases. I'm working on this change.
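
A rough sketch of what that enhancement could look like (hypothetical code,
not the change being described here; it simply re-checks the lockres under
res->spinlock right before purging and skips it if reuse has started):

    /* inside dlm_run_purge_list(), with dlm->spinlock held */
    spin_lock(&lockres->spinlock);
    if (!__dlm_lockres_unused(lockres)) {
            /* queued for purge, but reuse has started: take it
             * back off the purge list and leave it alone */
            list_del_init(&lockres->purge);
            dlm->purge_count--;
            spin_unlock(&lockres->spinlock);
            dlm_lockres_put(lockres); /* drop the purge list's ref */
            continue;
    }
    spin_unlock(&lockres->spinlock);
    dlm_purge_lockres(dlm, lockres);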

There also seems to be a case that's not handled. Assume dlm_thread sent 
the deref message and is waiting for the response; now dlmlock_remote 
removes the lockres from the purge list. dlm_thread gets the response from 
the master and continues and 

[Ocfs2-devel] [PATCH] ocfs2/dlm: correct the refmap on recovery master

2010-06-10 Thread Wengang Wang
If dlm recovery runs on a non-master node while purge work is going on, 
there can be unexpected references left on some lockres' on the recovery 
master. That is because we migrated the lockres' to the recovery master but 
didn't send deref requests to it accordingly (we were sending them to the 
dead original master or to the UNKNOWN owner).

Fix:
For a lockres that is in progress of dropping its reference, we don't 
migrate it to the recovery master, and we unhash the lockres in the purge 
work.
For those not in progress of the dropping, delay the purge work until 
recovery has finished, so that it can send the deref request to the correct 
master (the recovery master) later.
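
To see why the refmap ends up wrong, consider the deref path in simplified
form (a sketch, not the exact code):

    /* dlm_purge_lockres(): dropping our ref on a lockres */
    master = (res->owner == dlm->node_num);
    ...
    if (!master)
            /* the DEREF message goes to res->owner, which at this
             * point may be the dead node, or
             * DLM_LOCK_RES_OWNER_UNKNOWN while recovery is still
             * picking a new master */
            ret = dlm_drop_lockres_ref(dlm, res);

If recovery migrates the lockres to the recovery master while this is in
flight, the recovery master inherits a refmap bit for our node that no
deref will ever clear. Delaying the purge until recovery finishes means
res->owner points at the recovery master by the time the deref is sent.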

Signed-off-by: Wengang Wang <wen.gang.w...@oracle.com>
---
 fs/ocfs2/dlm/dlmrecovery.c |   17 +++--
 fs/ocfs2/dlm/dlmthread.c   |   36 ++--
 2 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index f8b75ce..43530ce 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1997,6 +1997,8 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct list_head *queue;
struct dlm_lock *lock, *next;
 
+   assert_spin_locked(&dlm->spinlock);
+   assert_spin_locked(&res->spinlock);
res->state |= DLM_LOCK_RES_RECOVERING;
if (!list_empty(&res->recovering)) {
mlog(0,
@@ -2336,9 +2338,20 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 
/* the wake_up for this will happen when the
 * RECOVERING flag is dropped later */
-   res->state &= ~DLM_LOCK_RES_DROPPING_REF;
+   if (res->state & DLM_LOCK_RES_DROPPING_REF) {
+   /*
+* don't migrate a lockres which is in
+* progress of dropping ref
+*/
+   mlog(ML_NOTICE, "%.*s ignored for "
+"migration\n", res->lockname.len,
+res->lockname.name);
+   res->state &=
+   ~DLM_LOCK_RES_DROPPING_REF;
+   } else
+   dlm_move_lockres_to_recovery_list(dlm,
+ res);
 
-   dlm_move_lockres_to_recovery_list(dlm, res);
} else if (res->owner == dlm->node_num) {
dlm_free_dead_locks(dlm, res, dead_node);
__dlm_lockres_calc_usage(dlm, res);
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index d4f73ca..0771420 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -92,17 +92,23 @@ int __dlm_lockres_has_locks(struct dlm_lock_resource *res)
  * truly ready to be freed. */
 int __dlm_lockres_unused(struct dlm_lock_resource *res)
 {
-   if (!__dlm_lockres_has_locks(res) &&
-   (list_empty(&res->dirty) && !(res->state & DLM_LOCK_RES_DIRTY))) {
-   /* try not to scan the bitmap unless the first two
-* conditions are already true */
-   int bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
-   if (bit >= O2NM_MAX_NODES) {
-   /* since the bit for dlm->node_num is not
-* set, inflight_locks better be zero */
-   BUG_ON(res->inflight_locks != 0);
-   return 1;
-   }
+   int bit;
+
+   if (__dlm_lockres_has_locks(res))
+   return 0;
+
+   if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
+   return 0;
+
+   if (res->state & DLM_LOCK_RES_RECOVERING)
+   return 0;
+
+   bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
+   if (bit >= O2NM_MAX_NODES) {
+   /* since the bit for dlm->node_num is not
+* set, inflight_locks better be zero */
+   BUG_ON(res->inflight_locks != 0);
+   return 1;
}
return 0;
 }
@@ -158,6 +164,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
int master;
int ret = 0;
 
+   assert_spin_locked(&dlm->spinlock);
+
spin_lock(&res->spinlock);
if (!__dlm_lockres_unused(res)) {
mlog(0, "%s:%.*s: tried to purge but not unused\n",
@@ -216,13 +224,13 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
 "master = %d\n", res->lockname.len, res->lockname.name,
 res, master);
list_del_init(&res->purge);
-   spin_unlock(&res->spinlock);
+   /* not the last ref */
dlm_lockres_put(res);
 

[Ocfs2-devel] Reorganize data elements to reduce struct sizes

2010-06-10 Thread Goldwyn Rodrigues
Thanks for the comments. I have incorporated them all.

CONFIG_OCFS2_FS_STATS is enabled and CONFIG_DEBUG_LOCK_ALLOC is disabled.
Statistics (struct sizes in bytes, before - after = bytes saved) now look like:
ocfs2_write_ctxt: 2144 - 2136 = 8
ocfs2_inode_info: 1960 - 1848 = 112
ocfs2_journal: 168 - 160 = 8
ocfs2_lock_res: 336 - 304 = 32
ocfs2_refcount_tree: 512 - 472 = 40
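
The savings come from alignment padding: the compiler inserts holes to keep
each field naturally aligned, and reordering or narrowing fields removes
them. A self-contained illustration of the idea (generic example, not ocfs2
code):

    #include <stdio.h>

    /* on a typical LP64 target: a 4-byte hole after 'a' (to align the
     * pointer) plus 6 bytes of tail padding after 'c' */
    struct before {
            int   a;   /* 4 bytes + 4 padding */
            void *b;   /* 8 bytes */
            short c;   /* 2 bytes + 6 padding */
    };                 /* sizeof == 24 */

    /* same fields, widest first: no holes */
    struct after {
            void *b;   /* 8 bytes */
            int   a;   /* 4 bytes */
            short c;   /* 2 bytes + 2 tail padding */
    };                 /* sizeof == 16 */

    int main(void)
    {
            printf("before=%zu after=%zu\n",
                   sizeof(struct before), sizeof(struct after));
            return 0;
    }

The same idea drives the diff below: grouping fields by size and narrowing
the enums to unsigned char is what shrinks ocfs2_lock_res by 32 bytes.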

Signed-off-by: Goldwyn Rodrigues <rgold...@suse.de>

---
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 3623ca2..1b5e284 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -910,8 +910,8 @@ struct ocfs2_write_ctxt {
 * out in so that future reads from that region will get
 * zero's.
 */
-   struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
unsigned int    w_num_pages;
+   struct page *w_pages[OCFS2_MAX_CTXT_PAGES];
struct page *w_target_page;

/*
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9f5f5fc..a1d70bf 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,27 +46,24 @@ struct ocfs2_inode_info
/* These fields are protected by ip_lock */
spinlock_t  ip_lock;
u32 ip_open_count;
-   u32 ip_clusters;
struct list_head    ip_io_markers;
+   u32 ip_clusters;

+   u16 ip_dyn_features;
struct mutex    ip_io_mutex;
-
u32 ip_flags; /* see below */
u32 ip_attr; /* inode attributes */
-   u16 ip_dyn_features;

/* protected by recovery_lock. */
struct inode    *ip_next_orphan;

-   u32 ip_dir_start_lookup;
-
struct ocfs2_caching_info   ip_metadata_cache;
-
struct ocfs2_extent_map ip_extent_map;
-
struct inode    vfs_inode;
struct jbd2_inode   ip_jinode;

+   u32 ip_dir_start_lookup;
+
/* Only valid if the inode is the dir. */
u32 ip_last_used_slot;
u64 ip_last_used_group;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index b5baaa8..43e56b9 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -67,11 +67,12 @@ struct ocfs2_journal {
struct buffer_head    *j_bh;  /* Journal disk inode block */
atomic_t  j_num_trans; /* Number of transactions
* currently in the system. */
+   spinlock_t    j_lock;
unsigned long j_trans_id;
struct rw_semaphore   j_trans_barrier;
wait_queue_head_t j_checkpointed;

-   spinlock_t    j_lock;
+   /* both fields protected by j_lock */
struct list_head  j_la_cleanups;
struct work_structj_recovery_work;
 };
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index c67003b..65739b3 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -150,26 +150,33 @@ typedef void (*ocfs2_lock_callback)(int status,
unsigned long data);
 struct ocfs2_lock_res {
void    *l_priv;
struct ocfs2_lock_res_ops *l_ops;
-   spinlock_t   l_lock;
+

struct list_head l_blocked_list;
struct list_head l_mask_waiters;

-   enum ocfs2_lock_type l_type;
unsigned long    l_flags;
char l_name[OCFS2_LOCK_ID_MAX_LEN];
-   int  l_level;
unsigned int l_ro_holders;
unsigned int l_ex_holders;
-   struct ocfs2_dlm_lksb    l_lksb;
+   unsigned char    l_level;
+
+   /* Data packed - type enum ocfs2_lock_type */
+   unsigned char    l_type;

/* used from AST/BAST funcs. */
-   enum ocfs2_ast_action    l_action;
-   enum ocfs2_unlock_action l_unlock_action;
-   int  l_requested;
-   int  l_blocking;
+   /* Data packed - enum type ocfs2_ast_action */
+   unsigned char    l_action;
+   /* Data packed - enum type ocfs2_unlock_action */
+   unsigned char    l_unlock_action;
+   unsigned char    l_requested;
+   unsigned char    l_blocking;
unsigned int l_pending_gen;

+   spinlock_t   l_lock;
+
+   struct ocfs2_dlm_lksb    l_lksb;
+
wait_queue_head_t    l_event;

struct list_head l_debug_list;
diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h
index 9983ba1..f04892d 100644
--- a/fs/ocfs2/refcounttree.h
+++ b/fs/ocfs2/refcounttree.h
@@ -21,14 +21,14 @@ struct ocfs2_refcount_tree {
struct