Re: [Ocfs2-devel] [PATCH] ocfs2: Add missing dlm_put() in dlm_begin_reco_handler

2013-05-31 Thread Jeff Liu
Hi Jiufei,

On 05/31/2013 02:44 PM, Xue jiufei wrote:

 Function dlm_begin_reco_handler returns without putting the dlm reference 
 when the dlm recovery state is DLM_RECO_STATE_FINALIZE.
 
 Signed-off-by: joyce <xuejiu...@huawei.com>

The patch looks good to me; you can consider adding:
Reviewed-by: Jie Liu <jeff@oracle.com>

Thanks,
-Jeff

 ---
  fs/ocfs2/dlm/dlmrecovery.c | 1 +
  1 file changed, 1 insertion(+)
 
 diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
 index eeac97b..a145cf8 100644
 --- a/fs/ocfs2/dlm/dlmrecovery.c
 +++ b/fs/ocfs2/dlm/dlmrecovery.c
 @@ -2697,6 +2697,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data,
 			dlm->name, br->node_idx, br->dead_node,
 			dlm->reco.dead_node, dlm->reco.new_master);
 		spin_unlock(&dlm->spinlock);
 +		dlm_put(dlm);
 		return -EAGAIN;
 	}
 	spin_unlock(&dlm->spinlock);
 --
 1.7.9.7
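
For readers outside the o2dlm code: the bug class here is a reference taken
at handler entry (via dlm_grab()) that is not dropped on an early error
return. A minimal userspace sketch of that pattern follows; struct ctx,
ctx_get() and ctx_put() are invented stand-ins for illustration, not the
o2dlm API:

	#include <errno.h>
	#include <stdio.h>

	struct ctx { int refs; };

	static void ctx_get(struct ctx *c) { c->refs++; }
	static void ctx_put(struct ctx *c) { c->refs--; }

	/* buggy: the early return leaks the reference taken at entry */
	static int handler_buggy(struct ctx *c, int finalize)
	{
		ctx_get(c);
		if (finalize)
			return -EAGAIN;	/* forgot ctx_put(c) on this path */
		ctx_put(c);
		return 0;
	}

	/* fixed: the error path drops the reference too, as the patch does */
	static int handler_fixed(struct ctx *c, int finalize)
	{
		ctx_get(c);
		if (finalize) {
			ctx_put(c);	/* the missing put */
			return -EAGAIN;
		}
		ctx_put(c);
		return 0;
	}

	int main(void)
	{
		struct ctx c = { 0 };

		handler_buggy(&c, 1);
		printf("buggy handler: refs=%d (leaked)\n", c.refs);
		c.refs = 0;
		handler_fixed(&c, 1);
		printf("fixed handler: refs=%d\n", c.refs);
		return 0;
	}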
 
 


Re: [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone

2013-05-31 Thread Xue jiufei
Hi, Xiaowei
It's OK to simplify the patch just as you did. But we don't want to resend
the master request to all other nodes, in consideration of network traffic,
so we record the possibly-down nodes in down_nodemap.
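
Schematically, the bookkeeping looks like this (a self-contained userspace
sketch of the idea, not the o2dlm code; bm_set()/bm_test() are hand-rolled
stand-ins for the kernel's set_bit()/test_bit()):

	#include <stdio.h>
	#include <string.h>

	#define MAX_NODES 64	/* stands in for O2NM_MAX_NODES */
	#define BITS_PER_LONG (8 * sizeof(unsigned long))
	#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

	static void bm_set(unsigned long *map, unsigned int bit)
	{
		map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
	}

	static int bm_test(const unsigned long *map, unsigned int bit)
	{
		return (map[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
	}

	int main(void)
	{
		unsigned long down_nodemap[BITS_TO_LONGS(MAX_NODES)];

		memset(down_nodemap, 0, sizeof(down_nodemap));

		/* first pass: suppose the sends to nodes 3 and 7 failed
		 * with a host-is-down error */
		bm_set(down_nodemap, 3);
		bm_set(down_nodemap, 7);

		/* retry pass: revisit only the recorded nodes instead of
		 * re-sending to the whole domain */
		for (unsigned int n = 0; n < MAX_NODES; n++)
			if (bm_test(down_nodemap, n))
				printf("resend master request to node %u\n", n);
		return 0;
	}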

On 2013/5/28 14:12, xiaowei.hu wrote:
 Hi,
 
 I reviewed this patch; it can indeed fix a temporary lost-connection problem,
 but I have a few questions:
 
 1. Since we don't need to know the node numbers of the down nodes, could we
 simply replace down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)] with an int named,
 for example, mreq_msg_send_fail?
 
 2. Since the final action is to return -EAGAIN and then resend all the master
 requests, how about we simply do this:
 
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		ret = dlm_do_master_request(res, mle, nodenum);
 -		if (ret < 0)
 +		if (ret < 0) {
 			mlog_errno(ret);
 +			wait_on_recovery = 1;
 +			msleep(DLM_NODE_DEATH_WAIT_MAX);
 +			goto redo_request;
 +		}
 
 Am I missing something?
 
 Thanks,
 Xiaowei
 
 On 12/22/2012 03:00 PM, Xue jiufei wrote:
   Function dlm_get_lock_resource() sends a master request to all nodes in
 domain_map and waits for their responses when the node (say nodeA) doesn't
 know who the master is.
   When nodeA sends the master request, it may happen that the network of
 nodeB goes down for a while and then recovers. The master request from
 nodeA never reaches nodeB, so nodeA may wait again and again in
 dlm_wait_for_lock_mastery() and never return.
   This patch resends the master request when a node loses the connection
 with some other nodes.

 Signed-off-by: xuejiufei <xuejiu...@huawei.com>
 ---
  fs/ocfs2/dlm/dlmmaster.c | 41 +++++++++++++++++++++++++++++++++++------
  1 files changed, 35 insertions(+), 6 deletions(-)
 
 diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
 index c491f97..2a99a95 100644
 --- a/fs/ocfs2/dlm/dlmmaster.c
 +++ b/fs/ocfs2/dlm/dlmmaster.c
 @@ -106,7 +106,7 @@ static int dlm_do_master_request(struct dlm_lock_resource *res,
  static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 				     struct dlm_lock_resource *res,
 				     struct dlm_master_list_entry *mle,
 -				     int *blocked);
 +				     int *blocked, int *retry, int host_down);
  static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 				    struct dlm_lock_resource *res,
 				    struct dlm_master_list_entry *mle,
 @@ -712,6 +712,8 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
 	unsigned int hash;
 	int tries = 0;
 	int bit, wait_on_recovery = 0;
 +	int retry = 0;
 +	unsigned long down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)];
 
 	BUG_ON(!lockid);
 @@ -910,11 +912,25 @@ redo_request:
 		goto wait;
 
 	ret = -EINVAL;
 -	dlm_node_iter_init(mle->vote_map, &iter);
 +	if (!retry)
 +		dlm_node_iter_init(mle->vote_map, &iter);
 +	else {
 +		mlog(0, "%s:%.*s: retrying, send master request to maybe down node\n",
 +		     dlm->name, res->lockname.len, res->lockname.name);
 +		dlm_node_iter_init(down_nodemap, &iter);
 +	}
 +	memset(down_nodemap, 0, sizeof(down_nodemap));
 +
 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		ret = dlm_do_master_request(res, mle, nodenum);
 -		if (ret < 0)
 +		if (ret < 0) {
 			mlog_errno(ret);
 +			if (dlm_is_host_down(ret)) {
 +				mlog(0, "%s:%.*s: node %u maybe dead, set down_nodemap\n",
 +				     dlm->name, res->lockname.len, res->lockname.name,
 +				     nodenum);
 +				set_bit(nodenum, down_nodemap);
 +			}
 +		}
 		if (mle->master != O2NM_MAX_NODES) {
 			/* found a master ! */
 			if (mle->master <= nodenum)
 @@ -931,9 +947,11 @@ redo_request:
 
 wait:
 	/* keep going until the response map includes all nodes */
 -	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
 +	ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked, &retry,
 +			find_next_bit(down_nodemap, O2NM_MAX_NODES, 0) <
 +			O2NM_MAX_NODES);
 	if (ret < 0) {
 -		wait_on_recovery = 1;
 +		if (!retry)
 +			wait_on_recovery = 1;
 		mlog(0, "%s: res %.*s, Node map changed, redo the master "
 		     "request now, blocked=%d\n", dlm->name, res->lockname.len,
 		     res->lockname.name, blocked);
 @@ -980,7 +998,7 @@ leave:
  static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 				     struct dlm_lock_resource *res,
 				     struct dlm_master_list_entry *mle,
 -				     int *blocked)
 +				     int *blocked, int *retry, int host_down)
  {
 	u8 m;
 	int ret, bit;
 @@ -990,6 +1008,7 @@ static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm,
 recheck:
 	ret = 0;
 	assert = 0;
 +	*retry = 0;
 
 	/* check if another node has already become the owner */

Re: [Ocfs2-devel] [PATCH] ocfs2: resend master request when lost connection with someone

2013-05-31 Thread Srinivas Eeda
On 05/31/2013 03:38 AM, Xue jiufei wrote:
 Hi, Xiaowei
 It's OK to simplify the patch just as you did. But we don't want to resend
 the master request to all other nodes, in consideration of network traffic,
 so we record the possibly-down nodes in down_nodemap.

 On 2013/5/28 14:12, xiaowei.hu wrote:
 Hi,

 I reviewed this patch; it can indeed fix a temporary lost-connection problem,
 but I have a few questions:
 
 1. Since we don't need to know the node numbers of the down nodes, could we
 simply replace down_nodemap[BITS_TO_LONGS(O2NM_MAX_NODES)] with an int named,
 for example, mreq_msg_send_fail?
 
 2. Since the final action is to return -EAGAIN and then resend all the master
 requests, how about we simply do this:

 	while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
 		ret = dlm_do_master_request(res, mle, nodenum);
 -		if (ret < 0)
 +		if (ret < 0) {
 			mlog_errno(ret);
 +			wait_on_recovery = 1;
 +			msleep(DLM_NODE_DEATH_WAIT_MAX);
 +			goto redo_request;
 +		}

 Am I missing something?

 Thanks,
 Xiaowei

 On 12/22/2012 03:00 PM, Xue jiufei wrote:
 Function dlm_get_lock_resource() sends a master request to all nodes in
 domain_map and waits for their responses when the node (say nodeA) doesn't
 know who the master is.
 When nodeA sends the master request, it may happen that the network of
 nodeB goes down for a while and then recovers. The master request from
 nodeA never reaches nodeB, so nodeA may wait again and again in
 dlm_wait_for_lock_mastery() and never return.
 This patch resends the master request when a node loses the connection
 with some other nodes.
Yes, with the current reconnect code there is a possibility of message 
loss, and it can happen to all kinds of messages and responses. You are 
assuming the message was never received by node B, but the message might 
have been received and the response lost instead; currently there is no 
way to tell the difference. The DLM layer shouldn't have to worry about 
nodes losing their connection temporarily. The right way to fix this is 
to fix the o2net reconnect code.


Re: [Ocfs2-devel] [PATCH v2] ocfs2: should not use le32_add_cpu to set ocfs2_dinode i_flags

2013-05-31 Thread Andrew Morton
On Thu, 30 May 2013 12:49:46 +0800 Joseph Qi <joseph...@huawei.com> wrote:

 If we use le32_add_cpu to set ocfs2_dinode i_flags, it may corrupt the
 corresponding flag. So we should change it to a bitwise and/or operation.
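
The failure mode is easy to demonstrate in userspace: for a flag word,
addition is not idempotent, so "adding" a flag bit that is already set
carries into the neighboring bit and corrupts the field, while a bitwise
OR leaves it unchanged. SOME_FLAG below is a made-up bit for the example,
not an actual ocfs2 i_flags value:

	#include <stdio.h>
	#include <stdint.h>

	#define SOME_FLAG 0x00000004u	/* hypothetical flag bit */

	int main(void)
	{
		uint32_t flags = SOME_FLAG;	/* flag already set */

		/* what an le32_add_cpu-style addition does: 0x04 + 0x04 = 0x08,
		 * the flag bit is cleared and an unrelated bit is set */
		uint32_t by_add = flags + SOME_FLAG;

		/* what the fix does: OR is idempotent, the field stays 0x04 */
		uint32_t by_or = flags | SOME_FLAG;

		printf("addition: 0x%08x  bitwise OR: 0x%08x\n", by_add, by_or);
		return 0;
	}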
 

The usual question: what is the end-user impact of the bug which was
just fixed?

For the usual reason: so I and others can decide which kernel versions
need the fix.

Thanks.
