This patch fixes the problem in my environment.
Would you test it on top of whichever sources you're using?
thanks
-steve
On Mon, 2009-04-20 at 11:18 +0800, jay_chen wrote:
> I do not quite understand how throw_away operates.
> And I have a question. (maybe stupid)
>
> Since different nodes use different assemblies,
> why isn't there a throw_away_mode for each node?
>
> Jay...
>
> -----Original Message-----
> From: Steven Dake [mailto:[email protected]]
> Sent: Tuesday, April 14, 2009 10:42 AM
> To: jay_chen
> Cc: [email protected]
> Subject: Re: RE:Re: [Openais] sync_in_process always return 1
>
> On Tue, 2009-04-14 at 09:18 +0800, jay_chen wrote:
> > (send this mail again to remove "Spam Mail" on title)
> >
> > Dear steve:
> > I added a printf in the following code,
> > and I got many printf outputs when the problem happened.
> > I don't understand what is throw away.
> > Is there any doc or explanation I could reference?
> > Thanks.
> >
>
> The throw away logic was recently added to fix a defect, and apparently it
> is buggy. Its purpose is to throw away a fragmented message when some of
> its fragments have been lost. Right after a config change, it should throw
> away all message fragments if there is a gap in the message until it finds
> the start of a new message.
>
> Do you have logs of the configuration changes that happen via clm.c?
> I'd like to see the last two or three configuration changes.
>
>
> > Jay...
> >
> > ====== code
> > if (throw_away_mode == THROW_AWAY_ACTIVE) {
> >
> > printf("THROW_AWAY_ACTIVE mcast->fragmented %d\n",
> > mcast->fragmented);
> >
> > /* Throw away the first msg block */
> > if (mcast->fragmented == 0 || mcast->fragmented == 1) {
> > throw_away_mode = THROW_AWAY_INACTIVE;
> >
> > assembly->index += msg_lens[0];
> > iov_delv.iov_base =
> > &assembly->data[assembly->index];
> >
> > ====== debug output
> > sync_start_process
> > sync_start_process call sync_barrier_send with res 0 THROW_AWAY_ACTIVE
> > mcast->fragmented 150 THROW_AWAY_ACTIVE mcast->fragmented 151
> > THROW_AWAY_ACTIVE mcast->fragmented 152 THROW_AWAY_ACTIVE
> > mcast->fragmented 153 THROW_AWAY_ACTIVE mcast->fragmented 154
> > THROW_AWAY_ACTIVE mcast->fragmented 155 THROW_AWAY_ACTIVE
> > mcast->fragmented 156 THROW_AWAY_ACTIVE mcast->fragmented 157
> > THROW_AWAY_ACTIVE mcast->fragmented 158 THROW_AWAY_ACTIVE
> > mcast->fragmented 159 THROW_AWAY_ACTIVE mcast->fragmented 160
> > THROW_AWAY_ACTIVE mcast->fragmented 161 THROW_AWAY_ACTIVE
> > mcast->fragmented 162 THROW_AWAY_ACTIVE mcast->fragmented 163
> > THROW_AWAY_ACTIVE mcast->fragmented 164 THROW_AWAY_ACTIVE
> > mcast->fragmented 165 THROW_AWAY_ACTIVE mcast->fragmented 1
> > THROW_AWAY_ACTIVE mcast->fragmented 3 THROW_AWAY_ACTIVE
> > mcast->fragmented 4 THROW_AWAY_ACTIVE mcast->fragmented 5
> > THROW_AWAY_ACTIVE mcast->fragmented 6 THROW_AWAY_ACTIVE
> > mcast->fragmented 7 THROW_AWAY_ACTIVE mcast->fragmented 8
> > THROW_AWAY_ACTIVE mcast->fragmented 9 THROW_AWAY_ACTIVE
> > mcast->fragmented 10 THROW_AWAY_ACTIVE mcast->fragmented 11
> > THROW_AWAY_ACTIVE mcast->fragmented 12 THROW_AWAY_ACTIVE
> > mcast->fragmented 13 THROW_AWAY_ACTIVE mcast->fragmented 14
> > THROW_AWAY_ACTIVE mcast->fragmented 15 THROW_AWAY_ACTIVE
> > mcast->fragmented 16 THROW_AWAY_ACTIVE mcast->fragmented 17
> > THROW_AWAY_ACTIVE mcast->fragmented 166 THROW_AWAY_ACTIVE
> > mcast->fragmented 167 THROW_AWAY_ACTIVE mcast->fragmented 168
> > THROW_AWAY_ACTIVE mcast->fragmented 169 THROW_AWAY_ACTIVE
> > mcast->fragmented 170 THROW_AWAY_ACTIVE mcast->fragmented 171
> > THROW_AWAY_ACTIVE mcast->fragmented 172 THROW_AWAY_ACTIVE
> > mcast->fragmented 173 THROW_AWAY_ACTIVE mcast->fragmented 174
> > THROW_AWAY_ACTIVE mcast->fragmented 175 THROW_AWAY_ACTIVE
> > mcast->fragmented 176 THROW_AWAY_ACTIVE mcast->fragmented 177
> > THROW_AWAY_ACTIVE mcast->fragmented 178 THROW_AWAY_ACTIVE
> > mcast->fragmented 179 THROW_AWAY_ACTIVE mcast->fragmented 180
> > THROW_AWAY_ACTIVE mcast->fragmented 181 THROW_AWAY_ACTIVE
> > mcast->fragmented 18 THROW_AWAY_ACTIVE mcast->fragmented 19
> > THROW_AWAY_ACTIVE mcast->fragmented 20 THROW_AWAY_ACTIVE
> > mcast->fragmented 21 THROW_AWAY_ACTIVE mcast->fragmented 22
> > THROW_AWAY_ACTIVE mcast->fragmented 23 THROW_AWAY_ACTIVE
> > mcast->fragmented 24 THROW_AWAY_ACTIVE mcast->fragmented 25
> > THROW_AWAY_ACTIVE mcast->fragmented 26 THROW_AWAY_ACTIVE
> > mcast->fragmented 27 THROW_AWAY_ACTIVE mcast->fragmented 182
> > THROW_AWAY_ACTIVE mcast->fragmented 183 THROW_AWAY_ACTIVE
> > mcast->fragmented 184 THROW_AWAY_ACTIVE mcast->fragmented 185
> > THROW_AWAY_ACTIVE mcast->fragmented 186 THROW_AWAY_ACTIVE
> > mcast->fragmented 187 THROW_AWAY_ACTIVE mcast->fragmented 188
> > THROW_AWAY_ACTIVE mcast->fragmented 189 THROW_AWAY_ACTIVE
> > mcast->fragmented 190 THROW_AWAY_ACTIVE mcast->fragmented 191
> > THROW_AWAY_ACTIVE mcast->fragmented 192 THROW_AWAY_ACTIVE
> > mcast->fragmented 193 THROW_AWAY_ACTIVE mcast->fragmented 194
> > THROW_AWAY_ACTIVE mcast->fragmented 195 THROW_AWAY_ACTIVE
> > mcast->fragmented 196 THROW_AWAY_ACTIVE mcast->fragmented 197
> > THROW_AWAY_ACTIVE mcast->fragmented 198 THROW_AWAY_ACTIVE
> > mcast->fragmented 28 THROW_AWAY_ACTIVE mcast->fragmented 29
> > THROW_AWAY_ACTIVE mcast->fragmented 30 THROW_AWAY_ACTIVE
> > mcast->fragmented 31 THROW_AWAY_ACTIVE mcast->fragmented 32
> > THROW_AWAY_ACTIVE mcast->fragmented 33 THROW_AWAY_ACTIVE
> > mcast->fragmented 34 THROW_AWAY_ACTIVE mcast->fragmented 35
> > THROW_AWAY_ACTIVE mcast->fragmented 36 THROW_AWAY_ACTIVE
> > mcast->fragmented 199 THROW_AWAY_ACTIVE mcast->fragmented 37
> > THROW_AWAY_ACTIVE mcast->fragmented 38 THROW_AWAY_ACTIVE
> > mcast->fragmented 39 THROW_AWAY_ACTIVE mcast->fragmented 40
> > THROW_AWAY_ACTIVE mcast->fragmented 200 THROW_AWAY_ACTIVE
> > mcast->fragmented 201 THROW_AWAY_ACTIVE mcast->fragmented 202
> > THROW_AWAY_ACTIVE mcast->fragmented 203 THROW_AWAY_ACTIVE
> > mcast->fragmented 204 THROW_AWAY_ACTIVE mcast->fragmented 205
> > THROW_AWAY_ACTIVE mcast->fragmented 206 THROW_AWAY_ACTIVE
> > mcast->fragmented 207 THROW_AWAY_ACTIVE mcast->fragmented 208
> > THROW_AWAY_ACTIVE mcast->fragmented 209 THROW_AWAY_ACTIVE
> > mcast->fragmented 210 THROW_AWAY_ACTIVE mcast->fragmented 211
> > THROW_AWAY_ACTIVE mcast->fragmented 212 THROW_AWAY_ACTIVE
> > mcast->fragmented 213 THROW_AWAY_ACTIVE mcast->fragmented 214
> > THROW_AWAY_ACTIVE mcast->fragmented 215 THROW_AWAY_ACTIVE
> > mcast->fragmented 216 THROW_AWAY_ACTIVE mcast->fragmented 1
> > THROW_AWAY_ACTIVE mcast->fragmented 3 THROW_AWAY_ACTIVE
> > mcast->fragmented 4 THROW_AWAY_ACTIVE mcast->fragmented 5
> > THROW_AWAY_ACTIVE mcast->fragmented 6 THROW_AWAY_ACTIVE
> > mcast->fragmented 217 THROW_AWAY_ACTIVE mcast->fragmented 218
> > THROW_AWAY_ACTIVE mcast->fragmented 219 THROW_AWAY_ACTIVE
> > mcast->fragmented 220 THROW_AWAY_ACTIVE mcast->fragmented 221
> > THROW_AWAY_ACTIVE mcast->fragmented 222 THROW_AWAY_ACTIVE
> > mcast->fragmented 223 THROW_AWAY_ACTIVE mcast->fragmented 224
> > THROW_AWAY_ACTIVE mcast->fragmented 225 THROW_AWAY_ACTIVE
> > mcast->fragmented 226 THROW_AWAY_ACTIVE mcast->fragmented 227
> > THROW_AWAY_ACTIVE mcast->fragmented 228 THROW_AWAY_ACTIVE
> > mcast->fragmented 229 THROW_AWAY_ACTIVE mcast->fragmented 230
> > THROW_AWAY_ACTIVE mcast->fragmented 231 THROW_AWAY_ACTIVE
> > mcast->fragmented 232 THROW_AWAY_ACTIVE mcast->fragmented 233
> > THROW_AWAY_ACTIVE mcast->fragmented 7 THROW_AWAY_ACTIVE
> > mcast->fragmented 8 THROW_AWAY_ACTIVE mcast->fragmented 9
> > THROW_AWAY_ACTIVE mcast->fragmented 234 THROW_AWAY_ACTIVE
> > mcast->fragmented 235 THROW_AWAY_ACTIVE mcast->fragmented 236
> > THROW_AWAY_ACTIVE mcast->fragmented 237 THROW_AWAY_ACTIVE
> > mcast->fragmented 238 THROW_AWAY_ACTIVE mcast->fragmented 239
> > THROW_AWAY_ACTIVE mcast->fragmented 240 THROW_AWAY_ACTIVE
> > mcast->fragmented 241 THROW_AWAY_ACTIVE mcast->fragmented 242
> > THROW_AWAY_ACTIVE mcast->fragmented 243 THROW_AWAY_ACTIVE
> > mcast->fragmented 244 THROW_AWAY_ACTIVE mcast->fragmented 245
> > THROW_AWAY_ACTIVE mcast->fragmented 246 THROW_AWAY_ACTIVE
> > mcast->fragmented 247 THROW_AWAY_ACTIVE mcast->fragmented 248
> > THROW_AWAY_ACTIVE mcast->fragmented 249 THROW_AWAY_ACTIVE
> > mcast->fragmented 250 THROW_AWAY_ACTIVE mcast->fragmented 10
> > THROW_AWAY_ACTIVE mcast->fragmented 11 THROW_AWAY_ACTIVE
> > mcast->fragmented 12 THROW_AWAY_ACTIVE mcast->fragmented 13
> > THROW_AWAY_ACTIVE mcast->fragmented 251 THROW_AWAY_ACTIVE
> > mcast->fragmented 252 THROW_AWAY_ACTIVE mcast->fragmented 253
> > THROW_AWAY_ACTIVE mcast->fragmented 254 THROW_AWAY_ACTIVE
> > mcast->fragmented 255 THROW_AWAY_ACTIVE mcast->fragmented 1
> > THROW_AWAY_ACTIVE mcast->fragmented 3 THROW_AWAY_ACTIVE
> > mcast->fragmented 4 THROW_AWAY_ACTIVE mcast->fragmented 5
> > THROW_AWAY_ACTIVE mcast->fragmented 6 THROW_AWAY_ACTIVE
> > mcast->fragmented 7 THROW_AWAY_ACTIVE mcast->fragmented 8
> > THROW_AWAY_ACTIVE mcast->fragmented 9 THROW_AWAY_ACTIVE
> > mcast->fragmented 10 THROW_AWAY_ACTIVE mcast->fragmented 11
> > THROW_AWAY_ACTIVE mcast->fragmented 12 THROW_AWAY_ACTIVE
> > mcast->fragmented 14 THROW_AWAY_ACTIVE mcast->fragmented 13
> > THROW_AWAY_ACTIVE mcast->fragmented 14 THROW_AWAY_ACTIVE
> > mcast->fragmented 15 THROW_AWAY_ACTIVE mcast->fragmented 16
> > THROW_AWAY_ACTIVE mcast->fragmented 17 THROW_AWAY_ACTIVE
> > mcast->fragmented 18 THROW_AWAY_ACTIVE mcast->fragmented 19
> > THROW_AWAY_ACTIVE mcast->fragmented 20 THROW_AWAY_ACTIVE
> > mcast->fragmented 21 THROW_AWAY_ACTIVE mcast->fragmented 22
> > THROW_AWAY_ACTIVE mcast->fragmented 23 THROW_AWAY_ACTIVE
> > mcast->fragmented 24 THROW_AWAY_ACTIVE mcast->fragmented 25
> > THROW_AWAY_ACTIVE mcast->fragmented 26 THROW_AWAY_ACTIVE
> > mcast->fragmented 27 THROW_AWAY_ACTIVE mcast->fragmented 28
> > THROW_AWAY_ACTIVE mcast->fragmented 29 THROW_AWAY_ACTIVE
> > mcast->fragmented 15 THROW_AWAY_ACTIVE mcast->fragmented 16
> > THROW_AWAY_ACTIVE mcast->fragmented 17 THROW_AWAY_ACTIVE
> > mcast->fragmented 30 THROW_AWAY_ACTIVE mcast->fragmented 31
> > THROW_AWAY_ACTIVE mcast->fragmented 32 THROW_AWAY_ACTIVE
> > mcast->fragmented 33 THROW_AWAY_ACTIVE mcast->fragmented 34
> > THROW_AWAY_ACTIVE mcast->fragmented 35 THROW_AWAY_ACTIVE
> > mcast->fragmented 36 THROW_AWAY_ACTIVE mcast->fragmented 37
> > THROW_AWAY_ACTIVE mcast->fragmented 38 THROW_AWAY_ACTIVE
> > mcast->fragmented 39 THROW_AWAY_ACTIVE mcast->fragmented 40
> > THROW_AWAY_ACTIVE mcast->fragmented 41 THROW_AWAY_ACTIVE
> > mcast->fragmented 42 THROW_AWAY_ACTIVE mcast->fragmented 43
> > THROW_AWAY_ACTIVE mcast->fragmented 44 THROW_AWAY_ACTIVE
> > mcast->fragmented 45 THROW_AWAY_ACTIVE mcast->fragmented 46
> > THROW_AWAY_ACTIVE mcast->fragmented 18 THROW_AWAY_ACTIVE
> > mcast->fragmented 19 THROW_AWAY_ACTIVE mcast->fragmented 0
> > THROW_AWAY_ACTIVE mcast->fragmented 48 THROW_AWAY_ACTIVE
> > mcast->fragmented 49 THROW_AWAY_ACTIVE mcast->fragmented 50
> > THROW_AWAY_ACTIVE mcast->fragmented 51 THROW_AWAY_ACTIVE
> > mcast->fragmented 52 THROW_AWAY_ACTIVE mcast->fragmented 53
> > THROW_AWAY_ACTIVE mcast->fragmented 54 THROW_AWAY_ACTIVE
> > mcast->fragmented 55 THROW_AWAY_ACTIVE mcast->fragmented 56
> > THROW_AWAY_ACTIVE mcast->fragmented 57 THROW_AWAY_ACTIVE
> > mcast->fragmented 58 THROW_AWAY_ACTIVE mcast->fragmented 59
> > THROW_AWAY_ACTIVE mcast->fragmented 60 THROW_AWAY_ACTIVE
> > mcast->fragmented 61 THROW_AWAY_ACTIVE mcast->fragmented 62
> > THROW_AWAY_ACTIVE mcast->fragmented 63 THROW_AWAY_ACTIVE
> > mcast->fragmented 64 THROW_AWAY_ACTIVE mcast->fragmented 65
> > THROW_AWAY_ACTIVE mcast->fragmented 66 THROW_AWAY_ACTIVE
> > mcast->fragmented 67 THROW_AWAY_ACTIVE mcast->fragmented 68
> > THROW_AWAY_ACTIVE mcast->fragmented 69 THROW_AWAY_ACTIVE
> > mcast->fragmented 70 THROW_AWAY_ACTIVE mcast->fragmented 71
> > THROW_AWAY_ACTIVE mcast->fragmented 72 THROW_AWAY_ACTIVE
> > mcast->fragmented 73 THROW_AWAY_ACTIVE mcast->fragmented 74
> > THROW_AWAY_ACTIVE mcast->fragmented 75 THROW_AWAY_ACTIVE
> > mcast->fragmented 76 THROW_AWAY_ACTIVE mcast->fragmented 77
> > THROW_AWAY_ACTIVE mcast->fragmented 78 THROW_AWAY_ACTIVE
> > mcast->fragmented 79 THROW_AWAY_ACTIVE mcast->fragmented 80
> > THROW_AWAY_ACTIVE mcast->fragmented 0 THROW_AWAY_ACTIVE
> > mcast->fragmented 2 THROW_AWAY_ACTIVE mcast->fragmented 3
> > THROW_AWAY_ACTIVE mcast->fragmented 4 THROW_AWAY_ACTIVE
> > mcast->fragmented 5 THROW_AWAY_ACTIVE mcast->fragmented 6
> > THROW_AWAY_ACTIVE mcast->fragmented 7 THROW_AWAY_ACTIVE
> > mcast->fragmented 8 THROW_AWAY_ACTIVE mcast->fragmented 9
> > THROW_AWAY_ACTIVE mcast->fragmented 10 THROW_AWAY_ACTIVE
> > mcast->fragmented 11 THROW_AWAY_ACTIVE mcast->fragmented 12
> > THROW_AWAY_ACTIVE mcast->fragmented 13 THROW_AWAY_ACTIVE
> > mcast->fragmented 14 THROW_AWAY_ACTIVE mcast->fragmented 15
> > THROW_AWAY_ACTIVE mcast->fragmented 1 THROW_AWAY_ACTIVE
> > mcast->fragmented 3 THROW_AWAY_ACTIVE mcast->fragmented 4
> > THROW_AWAY_ACTIVE mcast->fragmented 5 THROW_AWAY_ACTIVE
> > mcast->fragmented 6 THROW_AWAY_ACTIVE mcast->fragmented 7
> > THROW_AWAY_ACTIVE mcast->fragmented 8 THROW_AWAY_ACTIVE
> > mcast->fragmented 9 THROW_AWAY_ACTIVE mcast->fragmented 10
> > THROW_AWAY_ACTIVE mcast->fragmented 11 THROW_AWAY_ACTIVE
> > mcast->fragmented 12 THROW_AWAY_ACTIVE mcast->fragmented 13
> > THROW_AWAY_ACTIVE mcast->fragmented 14 THROW_AWAY_ACTIVE
> > mcast->fragmented 15 THROW_AWAY_ACTIVE mcast->fragmented 16
> > THROW_AWAY_ACTIVE mcast->fragmented 17 THROW_AWAY_ACTIVE
> > mcast->fragmented 18 THROW_AWAY_ACTIVE mcast->fragmented 19
> > THROW_AWAY_ACTIVE mcast->fragmented 20 THROW_AWAY_ACTIVE
> > mcast->fragmented 0
> > -----Original Message-----
> > From: Steven Dake [mailto:[email protected]]
> > Sent: Monday, April 13, 2009 2:56 PM
> > To: jay_chen
> > Cc: [email protected]
> > Subject: [Spam Mail] Re: [Openais] sync_in_process always return 1
> >
> > On Mon, 2009-04-13 at 10:38 +0800, jay_chen wrote:
> > > Dear All:
> > >
> > > I am using 0.80.4 and I encounter a problem.
> > > I have two nodes on different devices and I checkpoint data from
> > > master to slave node.
> > > After they run for a long time, the master (which is doing checkpoint
> > > write operations periodically) always gets SA_AIS_ERR_TRY_AGAIN
> > > returned.
> > >
> > > Some observations:
> > > 1. sync_in_process( ) is always 1.
> > > 2. master & slave call sync_barrier_send( ) and return with res = 0
> > >    (which means ok)
> > > 3. master calls sync_deliver_fn( ) with slave nodeid only
> > > 4. slave calls sync_deliver_fn( ) with slave nodeid only
> > > 5. the state machine of sync remains in processing state forever
> > >
> > > Could anybody give me some hints to further investigate?
> > >
> > logs around the time this happens would be helpful.
> >
> > Is the THROW_AWAY code in totempg throwing away the first message in
> > the new configuration?
> >
> > what service is syncing when this barrier occurs?
> >
> > what io load is taking place (transactions per second).
> >
> > regards
> > -steve
> >
> > > Thanks.
> > >
> > > Jay Chen.
> > > _______________________________________________
> > > Openais mailing list
> > > [email protected]
> > > https://lists.linux-foundation.org/mailman/listinfo/openais
> >
>
Index: exec/totempg.c
===================================================================
--- exec/totempg.c (revision 1829)
+++ exec/totempg.c (working copy)
@@ -156,21 +156,20 @@
struct totem_config *totempg_totem_config;
+enum throw_away_mode_t {
+ THROW_AWAY_INACTIVE,
+ THROW_AWAY_ACTIVE
+};
+
struct assembly {
unsigned int nodeid;
unsigned char data[MESSAGE_SIZE_MAX];
int index;
unsigned char last_frag_num;
+ enum throw_away_mode_t throw_away_mode;
struct list_head list;
};
-enum throw_away_mode_t {
- THROW_AWAY_INACTIVE,
- THROW_AWAY_ACTIVE
-};
-
-static enum throw_away_mode_t throw_away_mode = THROW_AWAY_INACTIVE;
-
DECLARE_LIST_INIT(assembly_list_inuse);
DECLARE_LIST_INIT(assembly_list_free);
@@ -261,6 +260,10 @@
list_del (&assembly->list);
list_add (&assembly->list, &assembly_list_inuse);
assembly->nodeid = nodeid;
+ assembly->index = 0;
+ assembly->last_frag_num = 0;
+ assembly->throw_away_mode = THROW_AWAY_INACTIVE;
+
return (assembly);
}
@@ -565,10 +568,10 @@
*/
start = 0;
- if (throw_away_mode == THROW_AWAY_ACTIVE) {
+ if (assembly->throw_away_mode == THROW_AWAY_ACTIVE) {
/* Throw away the first msg block */
if (mcast->fragmented == 0 || mcast->fragmented == 1) {
- throw_away_mode = THROW_AWAY_INACTIVE;
+ assembly->throw_away_mode = THROW_AWAY_INACTIVE;
assembly->index += msg_lens[0];
iov_delv.iov_base = &assembly->data[assembly->index];
@@ -576,7 +579,7 @@
start = 1;
}
} else
- if (throw_away_mode == THROW_AWAY_INACTIVE) {
+ if (assembly->throw_away_mode == THROW_AWAY_INACTIVE) {
if (continuation == assembly->last_frag_num) {
assembly->last_frag_num = mcast->fragmented;
for (i = start; i < msg_count; i++) {
@@ -589,7 +592,7 @@
}
}
} else {
- throw_away_mode = THROW_AWAY_ACTIVE;
+ assembly->throw_away_mode = THROW_AWAY_ACTIVE;
}
}
_______________________________________________
Openais mailing list
[email protected]
https://lists.linux-foundation.org/mailman/listinfo/openais