Hi Vu
If possible remove version 3 checking entirely, see my previous mail
Thanks
Lennart
-----Original Message-----
From: Vu Minh Nguyen [mailto:[email protected]]
Sent: den 8 september 2015 09:48
To: [email protected]; Lennart Lund; Giang Do T
Cc: [email protected]
Subject: [PATCH 1 of 1] log: Log server crash if SC nodes is not of same chkpt
version [#1459]
osaf/services/saf/logsv/lgs/lgs_mbcsv.c | 20 ++++++++++----------
tests/logsv/README | 32 ++++++++++++++++++++++++++++++++
2 files changed, 42 insertions(+), 10 deletions(-)
Fix incorrect handling of version 3/4 checkpoint handling
diff --git a/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
b/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
--- a/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
+++ b/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
@@ -782,7 +782,7 @@ static uint32_t ckpt_encode_async_update
data_v3 = (lgsv_ckpt_msg_v3_t
*)(long)cbk_arg->info.encode.io_reo_hdl;
vdata = data_v3;
edp_function = edp_ed_ckpt_msg_v3;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the same */
data_v2 = (lgsv_ckpt_msg_v2_t
*)(long)cbk_arg->info.encode.io_reo_hdl;
vdata = data_v2;
edp_function = edp_ed_ckpt_msg_v2;
@@ -1098,7 +1098,7 @@ static uint32_t ckpt_decode_log_cfg(lgs_
ckpt_msg_v3 = ckpt_msg;
lgs_cfg = &ckpt_msg_v3->ckpt_rec.lgs_cfg;
edp_function = edp_ed_lgs_cfg_rec_v3;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the same */
ckpt_msg_v2 = ckpt_msg;
lgs_cfg = &ckpt_msg_v2->ckpt_rec.lgs_cfg;
edp_function = edp_ed_lgs_cfg_rec_v2; @@ -1151,10 +1151,10 @@
static uint32_t ckpt_decode_async_update
if (lgs_is_peer_v5()) {
ckpt_msg_v5->header = hdr;
ckpt_msg = ckpt_msg_v5;
- } else if (lgs_is_peer_v4() && (hdr_ptr->ckpt_rec_type ==
LGS_CKPT_LGS_CFG_V3)) {
+ } else if (lgs_is_peer_v4()) {
ckpt_msg_v3->header = hdr;
ckpt_msg = ckpt_msg_v3;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the same
+*/
ckpt_msg_v2->header = hdr;
ckpt_msg = ckpt_msg_v2;
} else {
@@ -1168,9 +1168,9 @@ static uint32_t ckpt_decode_async_update
TRACE_2("\tINITIALIZE REC: UPDATE");
if (lgs_is_peer_v5()) {
reg_rec = &ckpt_msg_v5->ckpt_rec.initialize_client;
- } else if (lgs_is_peer_v3()) {
+ } else if (lgs_is_peer_v4()) {
reg_rec = &ckpt_msg_v3->ckpt_rec.initialize_client;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the
same
+*/
reg_rec = &ckpt_msg_v2->ckpt_rec.initialize_client;
} else {
reg_rec = &ckpt_msg_v1->ckpt_rec.initialize_client;
@@ -1193,9 +1193,9 @@ static uint32_t ckpt_decode_async_update
TRACE_2("\tSTREAM OPEN: UPDATE");
if (lgs_is_peer_v5()) {
stream_open = &ckpt_msg_v5->ckpt_rec.stream_open;
- } else if (lgs_is_peer_v3()) {
+ } else if (lgs_is_peer_v4()) {
stream_open = &ckpt_msg_v3->ckpt_rec.stream_open;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the
same
+*/
stream_open = &ckpt_msg_v2->ckpt_rec.stream_open;
} else {
stream_open = &ckpt_msg_v1->ckpt_rec.stream_open;
@@ -1449,7 +1449,7 @@ static uint32_t process_ckpt_data(lgs_cb
} else if (lgs_is_peer_v4()) {
data_v3 = data;
lgsv_ckpt_msg_type = data_v3->header.ckpt_rec_type;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the same
+*/
data_v2 = data;
lgsv_ckpt_msg_type = data_v2->header.ckpt_rec_type;
} else {
@@ -2182,7 +2182,7 @@ uint32_t lgs_ckpt_send_async(lgs_cb_t *c
} else if (lgs_is_peer_v4()) {
lgsv_ckpt_msg_v3_t *ckpt_rec_v3 = ckpt_rec;
ckpt_rec_type = ckpt_rec_v3->header.ckpt_rec_type;
- } else if (lgs_is_peer_v2()) {
+ } else if (lgs_is_peer_v2()) { /* Checkpoint version 2/3 is the same
+*/
lgsv_ckpt_msg_v2_t *ckpt_rec_v2 = ckpt_rec;
ckpt_rec_type = ckpt_rec_v2->header.ckpt_rec_type;
} else {
diff --git a/tests/logsv/README b/tests/logsv/README
--- a/tests/logsv/README
+++ b/tests/logsv/README
@@ -10,3 +10,35 @@ The second TC tries to set the data grou In order to help
the TC passed, "log-data" group must be existing and be added to supplementary
group list of the user as which LOGSV is running.
Otherwise that TC will be skipped.
+
+
+SET UP A CLUSTER 02 NODES WITH DIFFERENT CKPT VERSIONS
+------------------------------------------------------
+There was an issue when two nodes run with different ckpt versions.
+Ticket #1459 is an example. How to setup this on UML?
+
+Here is guideline for environment preparing for the case active logsv
+with version #5, standby logsv with version #4.
+
+1. Create 02 separate folders, one refers to branch 5.0.x (folder A),
+ other one refers to 4.6.x branch (folder B).
+
+2. Build OpenSAF and UML for them (refer to Wiki)
+
+3. Open 2 terminals. On each one, change directory to
+../tools/cluster_sim_uml
+
+4. Start active node SC-1 with logsv version #5.
+ cluster_sim_uml> ./opensaf nodestart 1
+
+5. Start standby node SC-2 with logsv version #4
+ cluster_sim_uml> ./opensaf nodestart 2
+
+6. Wait for seconds to make sure all OpenSAF services come up.
+
+7. Run test by `logtest` app with no option.
+
+8. Observe both nodes if there is any issue (e.g: node is rebooted)
+
+If you want to test the case active logsv version #4, standby logsv version #5,
reboot the above SC-1 (switchover).
+
+
--- Begin Message ---
Hi Vu
Version 2 is the first version where checkpointing is used for the log server
configuration. This version only checkpoints the log root directory and close
timestamp.
Version 3 adds groupname to the configuration checkpoint structure.
So I don't understand what you mean by "checkpoint version #2 is same
as version #3"; it's not.
What is really confusing here is the lgs_is_peer_v3() and lgs_is_peer_v4()! I
will try to give an explanation but is not easy to explain or understand!
Prepare to think hard!
V3 originally:
lgs_is_peer_v3() was introduced not because there were new parameters added to
the checkpoint structure but to handle the rule that says that mailbox limits
were not allowed to be changed unless they were allowed to be changed on the
standby as well which was not the case in earlier versions (see ticket #921).
However the limits were not checkpointed so there is no change of the
checkpoint structure. This means that the version 2 structure and functions are
still used.
V3 after introducing group name parameter:
Now the log server configuration structure was changed, the group name
parameter was added and updated encode, decode and handle functions were added.
The new files for these changes are named lgs_mbcsv_v3.c and .h, but this is
actually version 4, and lgs_is_peer_v4() was created!
V5:
Introduced to bring some order into this but unfortunately it does not help
with the backwards compatibility. Hopefully we never need to change checkpoint
version again at least not because of added or changed configuration parameters!
Now what to do about this...
During the work with #1387 "service configuration changes that are rejected on
one node systems" it was found that the rule that was originally the reason for
V3 is not needed. The fix for #1387 removes all checking/usage of this rule.
Fix for #1387 shall be pushed before #1459.
This means that V2 and V3 is the same if we talk about the lgs_is_peer_...()
functions so the lgs_is_peer_v3() could actually be removed. If standby says it
is version 2 or 3, version 2 structure and functions shall be used
(lgs_mbcsv_v2.c/.h). If standby says version 4, version 3 structure and
functions (lgs_mbcsv_v3.c/.h) shall be used.
However it is still confusing to use version 2 handling with chkpt version 2
and 3 and version 3 handling with chkpt version 4! I think some documentation
to explain this is needed. I suggest that this is done in the beginning of the
lgs_mbcsv.c file and maybe also in the README file.
Regards
Lennart
-----Original Message-----
From: Vu Minh Nguyen [mailto:[email protected]]
Sent: den 8 september 2015 07:45
To: Giang Do T; Lennart Lund
Cc: Beatriz Brandao
Subject: RE: [LOG] Standby node reboot frequently #1459
Good finding. I explored the 4.4 code and found that checkpoint version #2 is
same as version #3.
Therefore, I need to change back such following checking in some places.
>- } else if (lgs_is_peer_v2()) {
>+ } else if (lgs_is_peer_v3()) {
Please have a look.
Regards,
Vu
>-----Original Message-----
>From: giang do [mailto:[email protected]]
>Sent: Tuesday, September 08, 2015 9:39 AM
>To: Vu Minh Nguyen; 'Lennart Lund'
>Cc: 'Beatriz Brandao'
>Subject: Re: [LOG] Standby node reboot frequently #1459
>
>Hi Vu,
>
>Please see my comments in lines:
>
>diff --git a/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
>b/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
>--- a/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
>+++ b/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
>@@ -782,7 +782,7 @@ static uint32_t ckpt_encode_async_update
> data_v3 = (lgsv_ckpt_msg_v3_t
>*)(long)cbk_arg->info.encode.io_reo_hdl;
> vdata = data_v3;
> edp_function = edp_ed_ckpt_msg_v3;
>- } else if (lgs_is_peer_v2()) {
>+ } else if (lgs_is_peer_v3()) {
>[Giang] I am concerned about the way you ignore v2 here.
> In my opinion, Version 2 and version 3 use the same data structure:
>lgsv_ckpt_msg_v2_t.
>
>ku:~/o$ hg grep --all "LGS_MBCSV_VERSION "
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:6316:-:#define
>LGS_MBCSV_VERSION 3
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:6316:+:#define
>LGS_MBCSV_VERSION 4
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:5537:-:#define
>LGS_MBCSV_VERSION 2
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:5537:+:#define
>LGS_MBCSV_VERSION 3
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:4905:-:#define
>LGS_MBCSV_VERSION 1
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:4905:+:#define
>LGS_MBCSV_VERSION 2
>osaf/services/saf/logsv/lgs/lgs_mbcsv.h:619:+:#define LGS_MBCSV_VERSION
>1
>
>=> through hg grep command, we can see that LGS_MBCSV_VERSION_3 is
>added in changeset 5537.
> And in this version you can see that, there are no change in
>checkpoint message structures.
>
>You also can see in header of lgs_mbcsv.h file:
>"* Version 3: . For the moment check-pointing is not changed from
>version
2.
>Instead the configuration object is always read when changing from
>standby
to
>active."
>
>Best Regards,
>Giang Do
>
> data_v2 = (lgsv_ckpt_msg_v2_t
>*)(long)cbk_arg->info.encode.io_reo_hdl;
> vdata = data_v2;
> edp_function = edp_ed_ckpt_msg_v2; On 04/09/2015 13:25, Vu Minh
>Nguyen wrote:
>> Hi Lennart,
>>
>> Thanks for your explanation. The table shows a clear picture. :-)
>>
>> After having lots of emails, Here I would summarize some notes.
>>
>> 1.Change #1459 ticket to enhancement, so that the fix is only applied
>> on
>> 4.7 branch
>>
>> 2.The 4.7 patch is attached (no recent change). If no more comment, I
>> will send it out for official review.
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Lennart Lund [mailto:[email protected]]
>> *Sent:* Thursday, September 03, 2015 5:59 PM
>> *To:* Vu Nguyen M; Giang Do T
>> *Cc:* Beatriz Brandao
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Vu
>>
>> "With upgrade case, the combination always is: active node is 4.7 (or
>> later), standby node is 4.6 (or earlier).
>>
>> Therefore, the changes (if any) should be in `encode` place."
>>
>> Assume an upgrade from 4.5 to 4.7. It will most likely follow the
>> sequence in the table:
>>
>> Active
>>
>>
>>
>> Standby
>>
>>
>>
>> Comment
>>
>> 4.5
>>
>>
>>
>> 4.5
>>
>>
>>
>> Before upgrade
>>
>> 4.5
>>
>>
>>
>> X
>>
>>
>>
>> Standby is being upgraded. Meantime we have no standby
>>
>> 4.5
>>
>>
>>
>> 4.7
>>
>>
>>
>> Standby is restarted after upgrade
>>
>> 4.7
>>
>>
>>
>> 4.5
>>
>>
>>
>> A switchover is done so that the previous active node can be upgraded.
>> The newly upgraded node is now active
>>
>> 4.7
>>
>>
>>
>> X
>>
>>
>>
>> Standby is being upgraded. Meantime we have no standby
>>
>> 4.7
>>
>>
>>
>> 4.7
>>
>>
>>
>> Standby is restarted after upgrade.
>>
>> Regards
>>
>> Lennart
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* den 3 september 2015 12:00
>> *To:* Lennart Lund; Giang Do T
>> *Cc:* Beatriz Brandao
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> I am wrong to state that during upgrade there is no case standby node
>> is
>> 4.7 (logsv with checkpoint version #5).
>>
>> > When the new version is installed the standby node is restarted.
>> Now we are running the old not fixed version on active node and new
>> version on standby
>>
>> As you said in previous email, after upgrading standby node to new
>> version, standby node is rebooted so that standby runs with new version.
>>
>> So, to avoid following failed cases, I think we should fix both in
>> `decode` and `encode` places as the patch I sent before
>> (lgs_1459_def_4.7c.patch).
>>
>> Standby
>>
>> Active
>>
>>
>>
>> *5*
>>
>> *3*
>>
>>
>>
>> Stb
>>
>> *4*
>>
>>
>>
>> Stb
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* Thursday, September 03, 2015 11:18 AM
>> *To:* 'Lennart Lund'; 'Giang Do T'
>> *Cc:* 'Beatriz Brandao'
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> > Are there any changes that could be made in the 4.7 fix to meet
>> this scenario better than the fix is already doing?
>>
>> Most of changed points(lgs_1459_def_4.7c.patch) are in `decode`
>> places,
>>
>> means that they are fixed for the case standby node is version #5.
>>
>> With upgrade case, the combination always is: active node is 4.7 (or
>> later), standby node is 4.6 (or earlier).
>>
>> Therefore, the changes (if any) should be in `encode` place.
>>
>> So, I think, there is only one point need to fix in
>> /ckpt_encode_async_update @ lgs_mbcsv.c/
>>
>> [Before]
>>
>> / } else if (//lgs_is_peer_v2//()) {/
>>
>> / data_v2 = (lgsv_ckpt_msg_v2_t
>> *)(long)cbk_arg->info.encode.io_reo_hdl;/
>>
>> / vdata = data_v2;/
>>
>> / edp_function = //edp_ed_ckpt_msg_v2//;/
>>
>> [After]
>>
>> / } else if (lgs_is_peer_v3()) {/
>>
>> / data_v2 = (lgsv_ckpt_msg_v2_t
>> *)(long)cbk_arg->info.encode.io_reo_hdl;/
>>
>> / vdata = data_v2;/
>>
>> / edp_function = edp_ed_ckpt_msg_v2;/
>>
>> It is for the case: active node is 4.7, standby node is 4.4 (which
>> has checkpoint version #2).
>>
>> As `/edp_ed_ckpt_msg_v2/` actually is implemented for encoding
>> checkpoint version #3.
>>
>> Since the change is minor and rarely happen (e.g: after completing
>> upgrade on standby, reboot standby node (now active node is 4.7 or
>> later),
>>
>> then the operator does change `logRootDirectory`, logsv on active
>> node will do encode for checkpoint message. Otherwise, above code
>> line never run)
>>
>> , could consider to set this ticket to `won't fix` ?
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Lennart Lund [mailto:[email protected]]
>> *Sent:* Tuesday, September 01, 2015 5:27 PM
>> *To:* Vu Nguyen M; Giang Do T
>> *Cc:* Beatriz Brandao; Lennart Lund
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Vu,
>>
>> This fix should maybe be changed from defect to enhancement which
>> means that the fix will not be implemented on 4.6. The reason is that
>> the most likely situation when this problem may occur is when
>> upgrading from an older version to the latest which is 4.7. It must
>> be very rare (most likely never) that someone wants to first update
>> their
>> 4.6 version so they get the 4.6 fix and later update to 4.7 or
>> upgrade to 4.6 after 4.7 is released . Fix an older version should be
>> avoided as much as possible because of the risk of adding new
>> problems to an
>already released version.
>>
>> This means that the most likely upgrade scenario is to upgrade from a
>> 4.6 (or earlier) without this fix (also if we push a fix for 4.6) to
>> 4.7 (or later) so this is what has to work as good as possible. Are
>> there any changes that could be made in the 4.7 fix to meet this
>> scenario better than the fix is already doing?
>>
>> I will look at this myself as part of the internal review.
>>
>> Regards
>>
>> Lennart
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* den 1 september 2015 11:10
>> *To:* Lennart Lund; Giang Do T
>> *Cc:* Beatriz Brandao
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi,
>>
>> Have you taken a look at this patch?
>>
>> I saw #593 & #1288 pushed. If you have no more comment, I would like
>> to send it out for official review.
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* Thursday, August 27, 2015 1:33 PM
>> *To:* 'Lennart Lund'
>> *Cc:* 'Beatriz Brandao'; 'Giang Do T'
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> This morning, I tried to test full combinations among version #3, #4, #5.
>>
>> I found that there is one more code fault, needed to fix in both patches.
>>
>> Here are the updated ones. I also attached the excel file for test
>> cases/results.
>>
>> Please have a look and give your comments. Thanks.
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Lennart Lund [mailto:[email protected]]
>> *Sent:* Wednesday, August 26, 2015 5:53 PM
>> *To:* Vu Nguyen M
>> *Cc:* Beatriz Brandao; Giang Do T; Lennart Lund
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Also, this fix shall be handled in OpenSAF after we have pushed #593
>> and
>> #1288
>>
>> /Lennart
>>
>> *From:*Lennart Lund
>> *Sent:* den 26 augusti 2015 12:51
>> *To:* Vu Nguyen M
>> *Cc:* Beatriz Brandao; Giang Do T; Lennart Lund
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Vu
>>
>> You can update the patches. I think the patches should be made your
>> patches since you have done the most work around them the only thing
>> I have done is a quick hack based on your original findings. To
>> change the name in the patches use:
>>
>>> hg qrefresh -u <your user name>
>>
>> Is the ticket status "accepted" by you? If not please update the ticket.
>>
>> It is a bit problematic to test this fix at least it cannot be done
>> as a part of logtest. The best you can do is to write down a number
>> of test cases/test instructions that can be used for manual testing.
>> By doing that it is also possible to see to that no important variant
>> of versions, switch over, fail over etc. is forgotten; the
>> information can be reviewed and testing can be repeated.
>>
>> The test information could be included in the README file in the test
>> directory.
>>
>> Note:
>>
>> During an upgrade:
>>
>> 1.The standby node is "taken down" to be upgraded with the new version.
>> Meantime the active node is running the old version e.g. 4.6. The old
>> version does not have any fixes for this problem even if we have
>> released such fixes.
>>
>> 2.When the new version is installed the standby node is restarted.
>> Now we are running the old not fixed version on active node and new
>> version on standby
>>
>> 3.A switch over is done meaning that new version is active and old is
>> standby.
>>
>> 4.The standby node is "taken down" to be upgraded with the new version.
>> The new version is installed and the node is restarted. Now new
>> version is running on both nodes.
>>
>> 5.Normally it's now that model updates, configuration changes etc. is
done.
>>
>> During this upgrade sequence it is possible that requests for
>> initializing clients, opening streams, writing log records etc. are
>> received.
>>
>> It is also possible that the sequence will vary for different reasons
>> e.g. a failover may happen after 3. And it will take a while before
>> the situation is fixed
>>
>> Regards
>>
>> Lennart
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* den 26 augusti 2015 12:06
>> *To:* Lennart Lund
>> *Cc:* Beatriz Brandao; Giang Do T
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> I would like to update your patches for fixing remaining issues, my
opinion.
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* Wednesday, August 26, 2015 4:41 PM
>> *To:* 'Lennart Lund'
>> *Cc:* 'Beatriz Brandao'; 'Giang Do T'
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> There is one more issue with following case.
>>
>> Scenario (no switch side):
>>
>> -SC-1: version #5
>>
>> -SC-2: version #4
>>
>> -Send any log record
>>
>> -SC-2 is rebooted
>>
>> I guess it comes due to lack of following code lines on 4.6.x branch:
>>
>> I tested it, it works.
>>
>> --- a/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
>>
>> +++ b/osaf/services/saf/logsv/lgs/lgs_mbcsv.c
>>
>> @@ -1116,7 +1116,9 @@ static uint32_t ckpt_decode_async_update
>>
>> switch (hdr_ptr->ckpt_rec_type) {
>>
>> case LGS_CKPT_CLIENT_INITIALIZE:
>>
>> TRACE_2("\tINITIALIZE REC: UPDATE");
>>
>> - if (lgs_is_peer_v2()) {
>>
>> + if (lgs_is_peer_v3()) {
>>
>> + reg_rec = &ckpt_msg_v3->ckpt_rec.initialize_client;
>>
>> + }else if (lgs_is_peer_v2()) {
>>
>> reg_rec = &ckpt_msg_v2->ckpt_rec.initialize_client;
>>
>> } else {
>>
>> reg_rec = &ckpt_msg_v1->ckpt_rec.initialize_client;
>>
>> @@ -1136,7 +1138,9 @@ static uint32_t ckpt_decode_async_update
>>
>> case LGS_CKPT_OPEN_STREAM: /* 4 */
>>
>> TRACE_2("\tSTREAM OPEN: UPDATE");
>>
>> - if (lgs_is_peer_v2()) {
>>
>> + if (lgs_is_peer_v3()) {
>>
>> + stream_open = &ckpt_msg_v3->ckpt_rec.stream_open;
>>
>> + }else if (lgs_is_peer_v2()) {
>>
>> stream_open = &ckpt_msg_v2->ckpt_rec.stream_open;
>>
>> } else {
>>
>> stream_open = &ckpt_msg_v1->ckpt_rec.stream_open;
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* Wednesday, August 26, 2015 3:13 PM
>> *To:* 'Lennart Lund'
>> *Cc:* 'Beatriz Brandao'; 'Giang Do T'
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> The trouble comes due to the confused numbering checkpoint version.
>>
>> (ckpt_decode_log_cfg @ lgs_mbcsv.c)
>>
>> if (lgs_is_peer_v5()) {
>>
>> ckpt_msg_v5 = ckpt_msg;
>>
>> lgs_cfg = &ckpt_msg_v5->ckpt_rec.lgs_cfg;
>>
>> edp_function = edp_ed_lgs_cfg_rec_v5;
>>
>> } else if (lgs_is_peer_v3()) {
>>
>> ckpt_msg_v3 = ckpt_msg;
>>
>> lgs_cfg = &ckpt_msg_v3->ckpt_rec.lgs_cfg;
>>
>> edp_function = edp_ed_lgs_cfg_rec_v3;
>>
>> } else if (lgs_is_peer_v2()) {
>>
>> ckpt_msg_v2 = ckpt_msg;
>>
>> lgs_cfg = &ckpt_msg_v2->ckpt_rec.lgs_cfg;
>>
>> edp_function = edp_ed_lgs_cfg_rec_v2;
>>
>> } else {
>>
>> The correct map should be:
>>
>> 1. edp_ed_lgs_cfg_rec_v2
>>
>> -> for version #3
>>
>> 2. edp_ed_lgs_cfg_rec_v3
>>
>> -> for version #4
>>
>> 3. edp_ed_lgs_cfg_rec_v5
>>
>> -> for version #5
>>
>> The problem comes due to this wrong map. In version #3, the
>> checkpoint does not contain `logDataGroupname`.
>>
>> Therefore, it causes trouble when standby referring to
>> `logDataGroupname` in `edp_ed_lgs_cfg_rec_v3`.
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* Wednesday, August 26, 2015 2:43 PM
>> *To:* 'Lennart Lund'
>> *Cc:* 'Beatriz Brandao'; 'Giang Do T'
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Lennart,
>>
>> Still remains issue.
>>
>> Scenario:
>>
>> -SC-1: version #5
>>
>> -SC-2: version #3
>>
>> -Reboot SC-1
>>
>> -change saLogRootDirectory. (immcfg -a
>> logRootDirectory="/repl_opensaf/saflog"
>> logConfig=1,safApp=safLogService)
>>
>> -SC-1 reboot
>>
>> I am investigating the root cause, just send it for your info.
>>
>> Relate to MBCSV version, I am confused its numbering (1-5).
>>
>> For example, on branch 4.5.x, it is version #3. But 'lgs_mbcsv_v3`
>> does not exist.
>>
>> Does `version #1` refer to `lgs_mbcsv.c`, version #1 refer to
>> `lgs_mbcsv_1`, and so on?
>>
>> Regards,
>>
>> Vu
>>
>> *From:*Lennart Lund [mailto:[email protected]]
>> *Sent:* Tuesday, August 25, 2015 7:57 PM
>> *To:* Vu Nguyen M
>> *Cc:* Beatriz Brandao; Giang Do T; Lennart Lund
>> *Subject:* RE: [LOG] Standby node reboot frequently #1459
>>
>> Hi Vu,
>>
>> This is a serious problem that should be fixed before releasing 4.7.
>> I have written a ticket #1459. For the moment owner is set to me and
>> status is accepted.
>>
>> I have looked at the problem and found that your analysis is correct.
>> However the same incorrect handling exist in several more places and
>> in both OpenSAF 4.6 and 4.7. I have created two patches one for
>> branch
>> 4.6 and one for 4.7 (attached).
>>
>> I have not done much testing. Can you test this with different
>> combinations of branches on the nodes and with different combinations
>> of standby/active etc. If you find that my patches does not solve all
>> combinations please analyze and fix.
>>
>> I will continue with #1288 and send it out for official OpenSAF review.
>> I will also continue with the PR document.
>>
>> Thanks
>>
>> Lennart
>>
>> *From:*Vu Minh Nguyen [mailto:[email protected]]
>> *Sent:* den 25 augusti 2015 11:42
>> *To:* Lennart Lund
>> *Cc:* Beatriz Brandao; Giang Do T
>> *Subject:* [LOG] Standby node reboot frequently
>>
>> Hi Lennart,
>>
>> I set up the cluster 02 SCs with different checkpoint versions.
>>
>> -SC-1 is version #5 (active)
>>
>> -SC-2 is version #3 (standby)
>>
>> Then, I reboot SC-1 - active node and problem comes.
>>
>> -SC-1 is rebooted once
>>
>> -If sending any log record (e.g: saflogger -l), SC-1 is rebooted
>>
>> Following message appears before rebooting:
>>
>> Aug 25 9:41:36.986036 osaflogd 40028d80 [477:lgs_mbcsv.c:1139] >>
>> ckpt_decode_async_update
>>
>> Aug 25 9:41:36.986151 osaflogd 40028d80 [477:lgs_mbcsv.c:1149] T2
>> ckpt_rec_type: 4
>>
>> Aug 25 9:41:36.986789 osaflogd 40028d80 [477:lgs_mbcsv.c:1193] T2
>> STREAM OPEN: UPDATE
>>
>> Aug 25 9:41:36.986977 osaflogd 40028d80 [477:lgs_mbcsv.c:1855] >>
>> ckpt_proc_open_stream
>>
>> Aug 25 9:41:36.987095 osaflogd 40028d80 [477:lgs_evt.c:0082] TR
>> client_id: 0 lookup failed
>>
>> Aug 25 9:41:36.987626 osaflogd 40028d80 [477:lgs_mbcsv.c:1867] WA
>> Client 0 does not exist, failed to create stream ''
>>
>> Here is my analysis:
>>
>> -------------------------
>>
>> Look at top of function ckpt_decode_async_update() @ lgs_mbcsv.c. I
>> highlight the suspected code line.
>>
>> if (lgs_is_peer_v5()) {
>>
>> ckpt_msg_v5->header = hdr;
>>
>> ckpt_msg = ckpt_msg_v5;
>>
>> } else if (lgs_is_peer_v4() && (hdr_ptr->ckpt_rec_type ==
>> LGS_CKPT_LGS_CFG_V3)) {
>>
>> ckpt_msg_v3->header = hdr;
>>
>> ckpt_msg = ckpt_msg_v3;
>>
>> } else if (lgs_is_peer_v2()) {
>>
>> ckpt_msg_v2->header = hdr;
>>
>> ckpt_msg = ckpt_msg_v2;
>>
>> } else {
>>
>> ckpt_msg_v1->header = hdr;
>>
>> ckpt_msg = ckpt_msg_v1;
>>
>> }
>>
>> When getting checkpoint from active node, our case is version #3.
>> There are incorrect points here, I think.
>>
>> 1.With above code, it initializes chkp message header which points to
>> version #2. Checkpoint version #3 header is **not** initialized.
>>
>> 2.ckpt_rec_type == LGS_CKPT_LGS_CFG_V3 should only be valid for log
>> config checkpoint which is done by ckpt_decode_log_cfg() & version #4.
>>
>> The function ckpt_decode_async_update() is common process for many
>> kinds of checkpoint. Wonder putting at that place is correct (?).
>>
>> 3.For sure, I think we should add `lgs_is_peer_v3` as
>> `initialize`/`open` checkpoint do check for this version.
>>
>> It is just my assumption.
>>
>> I try to modify code as attached patch and the problem is gone.
>>
>> Could you reproduce it from your side and give your opinion? Thanks.
>>
>> Regards,
>>
>> Vu
>>
--- End Message ---
------------------------------------------------------------------------------
_______________________________________________
Opensaf-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/opensaf-devel