[Cluster-devel] [PATCH] qdiskd: (RHEL56) Don't write evictions if allow_kill is off

2010-11-10 Thread Lon Hohberger
Previously, qdisk master would write an eviction notice to disk
for a hung qdisk node even if allow_kill was off, causing the
other node to reboot.

This patch causes the qdisk master to write S_NONE as the state
of hung nodes on-disk when allow_kill is off instead of S_EVICT.

So, when the node wakes up, it will read the S_NONE state and
take action based on that state instead of reading S_EVICT and
rebooting.

Because there is so much internal qdiskd state which would need
to be fixed on a node which is in this state (including rejoining
the qdisk membership), the only clean method to continue
operations is to restart qdiskd.

Resolves: rhbz#602731

Signed-off-by: Lon Hohberger l...@redhat.com
---
 cman/qdisk/main.c |   80 +
 1 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 153b190..1eb10a6 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -109,17 +109,36 @@ node_info_init(node_info_t *ni, int max)
 }
 
 
+static void
+reincarnate(void)
+{
+   char buf[PATH_MAX];
+   char cmd[PATH_MAX];
+
+   clulog(LOG_CRIT, Attempting to restart\n);
+
+   snprintf(buf, sizeof(buf), /proc/%d/exe, getpid());
+   if (readlink(buf, cmd, sizeof(cmd))  0)
+   goto out_die;
+
+   execlp(cmd, cmd, NULL);
+out_die:
+   clulog(LOG_CRIT, Unable to restart; dying.\n);
+   exit(-1);
+}
+
+
 /**
   Check to see if someone tried to evict us but we were out to lunch.
   Rare case; usually other nodes would put up the 'Undead' message and
   re-evict us.
  */
-void
+static int
 check_self(qd_ctx *ctx, status_block_t *sb)
 {
if (!sb-ps_updatenode ||
(sb-ps_updatenode == ctx-qc_my_id)) {
-   return;
+   return 0;
}
 
/* I did not update this??! */
@@ -127,10 +146,16 @@ check_self(qd_ctx *ctx, status_block_t *sb)
case S_EVICT:
/* Someone told us to die. */
reboot(RB_AUTOBOOT);
+   case S_NONE:
+   return -1;
default:
-   clulog(LOG_EMERG, Unhandled state: %d\n, sb-ps_state);
-   raise(SIGSTOP);
+   break;
}
+
+   clulog(LOG_EMERG, Unhandled state: %d\n, sb-ps_state);
+   raise(SIGSTOP);
+
+   return -1;
 }
 
 
@@ -160,9 +185,11 @@ read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
swab_status_block_t(sb);
 
if (sb-ps_nodeid == ctx-qc_my_id) {
-   check_self(ctx, sb);
+   if (check_self(ctx, sb)  0)
+   reincarnate();
continue;
} 
+
/* message. */
memcpy((ni[x].ni_last_msg), (ni[x].ni_msg),
   sizeof(ni[x].ni_last_msg));
@@ -278,17 +305,26 @@ check_transitions(qd_ctx *ctx, node_info_t *ni, int max, 
memb_mask_t mask)
   Write eviction notice if we're the master.
 */
if (ctx-qc_status == S_MASTER) {
-   clulog(LOG_NOTICE,
-  Writing eviction notice for node %d\n,
-  ni[x].ni_status.ps_nodeid);
-   qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
-   S_EVICT, NULL, NULL, NULL);
+
if (ctx-qc_flags  RF_ALLOW_KILL) {
+   clulog(LOG_NOTICE,
+  Writing eviction notice for 
node %d\n,
+  ni[x].ni_status.ps_nodeid);
+   qd_write_status(ctx, 
ni[x].ni_status.ps_nodeid,
+   S_EVICT, NULL, NULL, 
NULL);
clulog(LOG_DEBUG, Telling CMAN to 
kill the node\n);
cman_kill_node(ctx-qc_ch,
ni[x].ni_status.ps_nodeid);
+   } else {
+   clulog(LOG_NOTICE,
+  Node %d should be evicted, but 
+  allow_kill is off\n,
+  ni[x].ni_status.ps_nodeid);
+   qd_write_status(ctx, 
ni[x].ni_status.ps_nodeid,
+   S_NONE, NULL, NULL, 
NULL);
}
+
}
 
/* Clear our master mask for the node after eviction */
@@ -313,20 +349,28 @@ check_transitions(qd_ctx *ctx, node_info_t *ni, int max, 
memb_mask_t mask)

[Cluster-devel] [PATCH] qdiskd: (STABLE31) Don't write evictions if allow_kill is off

2010-11-10 Thread Lon Hohberger
Previously, qdisk master would write an eviction notice to disk
for a hung qdisk node even if allow_kill was off, causing the
other node to reboot.

This patch causes the qdisk master to write S_NONE as the state
of hung nodes on-disk when allow_kill is off instead of S_EVICT.

So, when the node wakes up, it will read the S_NONE state and
take action based on that state instead of reading S_EVICT and
rebooting.

Because there is so much internal qdiskd state which would need
to be fixed on a node which is in this state (including rejoining
the qdisk membership), the only clean method to continue
operations is to restart qdiskd.

Resolves: rhbz#602731

Signed-off-by: Lon Hohberger l...@redhat.com
---
 cman/qdisk/main.c |   80 +
 1 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 6a9b821..f0b7a5f 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -128,17 +128,36 @@ node_info_init(node_info_t *ni, int max)
 }
 
 
+static void
+reincarnate(void)
+{
+   char buf[PATH_MAX];
+   char cmd[PATH_MAX];
+
+   logt_print(LOG_CRIT, Attempting to restart\n);
+
+   snprintf(buf, sizeof(buf), /proc/%d/exe, getpid());
+   if (readlink(buf, cmd, sizeof(cmd))  0)
+   goto out_die;
+
+   execlp(cmd, cmd, NULL);
+out_die:
+   logt_print(LOG_CRIT, Unable to restart; dying.\n);
+   exit(-1);
+}
+
+
 /**
   Check to see if someone tried to evict us but we were out to lunch.
   Rare case; usually other nodes would put up the 'Undead' message and
   re-evict us.
  */
-static void
+static int
 check_self(qd_ctx *ctx, status_block_t *sb)
 {
if (!sb-ps_updatenode ||
(sb-ps_updatenode == ctx-qc_my_id)) {
-   return;
+   return 0;
}
 
/* I did not update this??! */
@@ -146,10 +165,16 @@ check_self(qd_ctx *ctx, status_block_t *sb)
case S_EVICT:
/* Someone told us to die. */
reboot(RB_AUTOBOOT);
+   case S_NONE:
+   return -1;
default:
-   logt_print(LOG_EMERG, Unhandled state: %d\n, sb-ps_state);
-   raise(SIGSTOP);
+   break;
}
+
+   logt_print(LOG_EMERG, Unhandled state: %d\n, sb-ps_state);
+   raise(SIGSTOP);
+
+   return -1;
 }
 
 
@@ -179,9 +204,11 @@ read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
swab_status_block_t(sb);
 
if (sb-ps_nodeid == ctx-qc_my_id) {
-   check_self(ctx, sb);
+   if (check_self(ctx, sb)  0)
+   reincarnate();
continue;
} 
+
/* message. */
memcpy((ni[x].ni_last_msg), (ni[x].ni_msg),
   sizeof(ni[x].ni_last_msg));
@@ -297,17 +324,26 @@ check_transitions(qd_ctx *ctx, node_info_t *ni, int max, 
memb_mask_t mask)
   Write eviction notice if we're the master.
 */
if (ctx-qc_status == S_MASTER) {
-   logt_print(LOG_NOTICE,
-  Writing eviction notice for node %d\n,
-  ni[x].ni_status.ps_nodeid);
-   qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
-   S_EVICT, NULL, NULL, NULL);
+
if (ctx-qc_flags  RF_ALLOW_KILL) {
+   logt_print(LOG_NOTICE,
+  Writing eviction notice for 
node %d\n,
+  ni[x].ni_status.ps_nodeid);
+   qd_write_status(ctx, 
ni[x].ni_status.ps_nodeid,
+   S_EVICT, NULL, NULL, 
NULL);
logt_print(LOG_DEBUG, Telling CMAN to 
kill the node\n);
cman_kill_node(ctx-qc_cman_admin,
ni[x].ni_status.ps_nodeid);
+   } else {
+   logt_print(LOG_NOTICE,
+  Node %d should be evicted, but 
+  allow_kill is off\n,
+  ni[x].ni_status.ps_nodeid);
+   qd_write_status(ctx, 
ni[x].ni_status.ps_nodeid,
+   S_NONE, NULL, NULL, 
NULL);
}
+
}
 
/* Clear our master mask for the node after eviction */
@@ -332,20 +368,28 @@ check_transitions(qd_ctx *ctx, node_info_t *ni, int max, 

Re: [Cluster-devel] [PATCH 6/9] fs/gfs2/glock.c: Use printf extension %pV

2010-11-10 Thread Steven Whitehouse
Hi,

Now in my -nmw GFS2 git tree along with the previous patch. Thanks,

Steve.

On Tue, 2010-11-09 at 16:35 -0800, Joe Perches wrote:
 Using %pV reduces the number of printk calls and
 eliminates any possible message interleaving from
 other printk calls.
 
 Signed-off-by: Joe Perches j...@perches.com
 ---
  fs/gfs2/glock.c |9 +++--
  1 files changed, 7 insertions(+), 2 deletions(-)
 
 diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
 index 8777885..d30b39c 100644
 --- a/fs/gfs2/glock.c
 +++ b/fs/gfs2/glock.c
 @@ -952,17 +952,22 @@ int gfs2_glock_wait(struct gfs2_holder *gh)
  
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...)
  {
 + struct va_format vaf;
   va_list args;
  
   va_start(args, fmt);
 +
   if (seq) {
   struct gfs2_glock_iter *gi = seq-private;
   vsprintf(gi-string, fmt, args);
   seq_printf(seq, gi-string);
   } else {
 - printk(KERN_ERR  );
 - vprintk(fmt, args);
 + vaf.fmt = fmt;
 + vaf.va = args;
 +
 + printk(KERN_ERR  %pV, vaf);
   }
 +
   va_end(args);
  }
  




[Cluster-devel] [PATCH] fs/gfs2/glock.h: Add __attribute__((format(printf, 2, 3))) to gfs2_print_dbg

2010-11-10 Thread Joe Perches
Functions that use printf formatting, especially
those that use %pV, should have their uses of
printf format and arguments checked by the compiler.

Signed-off-by: Joe Perches j...@perches.com
---
No current uses report any error in an allyesconfig build.
 
 fs/gfs2/glock.h |2 ++
 1 files changed, 2 insertions(+), 0 deletions(-)

diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index db1c26d..a12d117 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -212,6 +212,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
 int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
+
+__attribute__ ((format(printf, 2, 3)))
 void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
 
 /**




Re: [Cluster-devel] [PATCH] fs/gfs2/glock.h: Add __attribute__((format(printf, 2, 3))) to gfs2_print_dbg

2010-11-10 Thread Steven Whitehouse
Hi,

Now in the GFS2 -nmw git tree. Thanks,

Steve.

On Wed, 2010-11-10 at 13:19 -0800, Joe Perches wrote:
 Functions that use printf formatting, especially
 those that use %pV, should have their uses of
 printf format and arguments checked by the compiler.
 
 Signed-off-by: Joe Perches j...@perches.com
 ---
 No current uses report any error in an allyesconfig build.
  
  fs/gfs2/glock.h |2 ++
  1 files changed, 2 insertions(+), 0 deletions(-)
 
 diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
 index db1c26d..a12d117 100644
 --- a/fs/gfs2/glock.h
 +++ b/fs/gfs2/glock.h
 @@ -212,6 +212,8 @@ int gfs2_glock_nq_num(struct gfs2_sbd *sdp,
  int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
  void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
  void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs);
 +
 +__attribute__ ((format(printf, 2, 3)))
  void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
  
  /**
 
 




Re: [Cluster-devel] [PATCH 0/9] treewide: convert vprintk uses to %pV

2010-11-10 Thread Joe Perches
On Wed, 2010-11-10 at 14:48 -0800, Luis R. Rodriguez wrote:
> When was this added upstream BTW? I ask for backport considerations.

commit 7db6f5fb65a82af03229eef104dc9899c5eecf33
Author: Joe Perches j...@perches.com
Date:   Sun Jun 27 01:02:33 2010 +

vsprintf: Recursive vsnprintf: Add %pV, struct va_format

Add the ability to print a format and va_list from a structure pointer

Allows __dev_printk to be implemented as a single printk while
minimizing string space duplication.

%pV should not be used without some mechanism to verify the
format and argument use, a la __attribute__((format(printf, ...))).

Signed-off-by: Joe Perches j...@perches.com
Acked-by: Greg Kroah-Hartman gre...@suse.de
Signed-off-by: David S. Miller da...@davemloft.net




[Cluster-devel] [PATCH] dlm: Handle application limited situations properly.

2010-11-10 Thread David Miller

In the normal regime where an application uses non-blocking I/O
writes on a socket, they will handle -EAGAIN and use poll() to
wait for send space.

They don't actually sleep on the socket I/O write.

But kernel level RPC layers that do socket I/O operations directly
and key off of -EAGAIN on the write() to try again later don't
use poll(); they instead have their own sleeping mechanism and
rely upon ->sk_write_space() to trigger the wakeup.

So they do effectively sleep on the write(), but this mechanism
alone does not let the socket layers know what's going on.

Therefore they must emulate what would have happened, otherwise
TCP cannot possibly see that the connection is application window
size limited.

Handle this, therefore, like SUNRPC by setting SOCK_NOSPACE and
bumping ->sk_write_pending as needed when we hit the send buffer
limits.

This should make TCP send buffer size auto-tuning and the
->sk_write_space() callback invocations actually happen.

Signed-off-by: David S. Miller da...@davemloft.net

diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 37a34c2..77720f8 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -108,6 +108,7 @@ struct connection {
 #define CF_INIT_PENDING 4
 #define CF_IS_OTHERCON 5
 #define CF_CLOSE 6
+#define CF_APP_LIMITED 7
struct list_head writequeue;  /* List of outgoing writequeue_entries */
spinlock_t writequeue_lock;
int (*rx_action) (struct connection *); /* What to do when active */
@@ -295,7 +296,17 @@ static void lowcomms_write_space(struct sock *sk)
 {
struct connection *con = sock2con(sk);
 
-   if (con  !test_and_set_bit(CF_WRITE_PENDING, con-flags))
+   if (!con)
+   return;
+
+   clear_bit(SOCK_NOSPACE, con-sock-flags);
+
+   if (test_and_clear_bit(CF_APP_LIMITED, con-flags)) {
+   con-sock-sk-sk_write_pending--;
+   clear_bit(SOCK_ASYNC_NOSPACE, con-sock-flags);
+   }
+
+   if (!test_and_set_bit(CF_WRITE_PENDING, con-flags))
queue_work(send_workqueue, con-swork);
 }
 
@@ -1319,6 +1330,15 @@ static void send_to_sock(struct connection *con)
ret = kernel_sendpage(con-sock, e-page, offset, len,
  msg_flags);
if (ret == -EAGAIN || ret == 0) {
+   if (ret == -EAGAIN 
+   test_bit(SOCK_ASYNC_NOSPACE, 
con-sock-flags) 
+   !test_and_set_bit(CF_APP_LIMITED, 
con-flags)) {
+   /* Notify TCP that we're limited by the
+* application window size.
+*/
+   set_bit(SOCK_NOSPACE, 
con-sock-flags);
+   con-sock-sk-sk_write_pending++;
+   }
cond_resched();
goto out;
}