Hi a.Vu, Looking at patch in ticket #2670, we can use ref_counter to decide to remove client in case recovery fail. That way is simple.
Regards Canh -----Original Message----- From: Vu Minh Nguyen <vu.m.ngu...@dektech.com.au> Sent: Tuesday, June 5, 2018 5:54 PM To: lennart.l...@ericsson.com; canh.v.tru...@dektech.com.au Cc: opensaf-devel@lists.sourceforge.net; Vu Minh Nguyen <vu.m.ngu...@dektech.com.au> Subject: [PATCH 1/1] log: fix restore ref counter for deleted stream [#2870] In the methods LogAgent::saLogStreamClose() and LogAgent::saLogWriteLogAsync(), the client is deleted if failed to recover; however, the pointer to the log stream of this client has not been reset. Therefore, when the destrustor of ScopeData runs, the reference counter could be restored on deleted log stream. Besides, there are possibilities of race condition b/w deleting the failed-to-recovery client and using that deleted client in other thread. This patch introduces a new attribute to LogClient. When the client fails to recover, that client is not removed at that time but is done at next call of log handle initialize or finalize if the attribute is true. --- src/log/agent/lga_agent.cc | 87 ++++++++++++++++++++++++++++++++++++++------- src/log/agent/lga_agent.h | 5 +++ src/log/agent/lga_client.cc | 2 +- src/log/agent/lga_client.h | 7 ++++ 4 files changed, 87 insertions(+), 14 deletions(-) diff --git a/src/log/agent/lga_agent.cc b/src/log/agent/lga_agent.cc index f33b5dc..831edfa 100644 --- a/src/log/agent/lga_agent.cc +++ b/src/log/agent/lga_agent.cc @@ -255,6 +255,21 @@ void LogAgent::RemoveAllLogClients() { } } +void LogAgent::RemoveAllDeadLogClients() { + TRACE_ENTER(); + ScopeLock scopeLock(mutex_); + auto it = client_list_.begin(); + while (it != client_list_.end()) { + if ((*it)->is_died() == true) { + client_list_.erase(it); + delete *it; + it = client_list_.begin(); + continue; + } + it++; + } +} + // Add one client @client to the list @client_list_ // This method should be called in @saLogInitialize() context void LogAgent::AddLogClient(LogClient* client) { @@ -314,6 +329,14 @@ SaAisErrorT LogAgent::saLogInitialize(SaLogHandleT* logHandle, TRACE_ENTER(); + if (true) { + ScopeLock critical_section(get_delete_obj_sync_mutex_); + if (has_dead_clients == true) { + RemoveAllDeadLogClients(); + has_dead_clients = false; + } + } + // Verify parameters (log handle and that version is given) if ((logHandle == nullptr) || (version == nullptr)) { TRACE("version or handle FAILED"); @@ -453,6 +476,11 @@ SaAisErrorT LogAgent::saLogSelectionObjectGet( return ais_rc; } + if (client->is_died() == true) { + ais_rc = SA_AIS_ERR_BAD_HANDLE; + return ais_rc; + } + if (client->FetchAndIncreaseRefCounter(__func__, &updated) == -1) { // @client is being deleted. Don't touch @this client ais_rc = SA_AIS_ERR_TRY_AGAIN; @@ -504,6 +532,11 @@ SaAisErrorT LogAgent::saLogDispatch(SaLogHandleT logHandle, return ais_rc; } + if (client->is_died() == true) { + ais_rc = SA_AIS_ERR_BAD_HANDLE; + return ais_rc; + } + if (client->FetchAndIncreaseRefCounter(__func__, &updated) == -1) { // @client is being deleted. DO NOT touch this @client ais_rc = SA_AIS_ERR_TRY_AGAIN; @@ -584,6 +617,11 @@ SaAisErrorT LogAgent::saLogFinalize(SaLogHandleT logHandle) { if (true) { ScopeLock critical_section(get_delete_obj_sync_mutex_); + if (has_dead_clients == true) { + RemoveAllDeadLogClients(); + has_dead_clients = false; + } + // Get log client from the global list client = SearchClientByHandle(logHandle); if (client == nullptr) { @@ -852,6 +890,11 @@ SaAisErrorT LogAgent::saLogStreamOpen_2( return ais_rc; } + if (client->is_died() == true) { + ais_rc = SA_AIS_ERR_BAD_HANDLE; + return ais_rc; + } + if (client->FetchAndIncreaseRefCounter(__func__, &updated) == -1) { // @client is being deleted. DO NOT touch this @client ais_rc = SA_AIS_ERR_TRY_AGAIN; @@ -892,10 +935,12 @@ SaAisErrorT LogAgent::saLogStreamOpen_2( } // This client is failed to do recover in Recovery thread. - // Remove this client from the database. if (client->is_recovery_failed() == true) { ScopeLock critical_section(get_delete_obj_sync_mutex_); - RemoveLogClient(&client); + // Mark this client died, and it will be removed from client db later. + client->died(); + client = nullptr; + has_dead_clients = true; ais_rc = SA_AIS_ERR_BAD_HANDLE; return ais_rc; } @@ -1160,6 +1205,11 @@ SaAisErrorT LogAgent::saLogWriteLogAsync(SaLogStreamHandleT logStreamHandle, return ais_rc; } + if (client->is_died() == true) { + ais_rc = SA_AIS_ERR_BAD_HANDLE; + return ais_rc; + } + if (client->FetchAndIncreaseRefCounter(__func__, &cUpdated) == -1) { // @client is being deleted. DO NOT touch this @client ais_rc = SA_AIS_ERR_TRY_AGAIN; @@ -1263,10 +1313,13 @@ SaAisErrorT LogAgent::saLogWriteLogAsync(SaLogStreamHandleT logStreamHandle, } // This client is failed to do recover in Recovery thread. - // Remove this client from the database. if (client->is_recovery_failed() == true) { ScopeLock critical_section(get_delete_obj_sync_mutex_); - RemoveLogClient(&client); + // Mark this client died, and it will be removed from client db later. + client->died(); + client = nullptr; + stream = nullptr; + has_dead_clients = true; ais_rc = SA_AIS_ERR_BAD_HANDLE; return ais_rc; } @@ -1321,13 +1374,6 @@ SaAisErrorT LogAgent::saLogStreamClose(SaLogStreamHandleT logStreamHandle) { return ais_rc; } - if (stream->FetchAndDecreaseRefCounter(__func__, &sUpdated) != 0) { - // @stream is being used somewhere (>0), DO NOT delete this @stream. - // or @stream is being deleted on other thread (=-1) - ais_rc = SA_AIS_ERR_TRY_AGAIN; - return ais_rc; - } - // Retrieve @LogClient obj based on @client_handle client = SearchClientByHandle(stream->GetMyClientHandle()); if (client == nullptr) { @@ -1336,6 +1382,18 @@ SaAisErrorT LogAgent::saLogStreamClose(SaLogStreamHandleT logStreamHandle) { return ais_rc; } + if (client->is_died() == true) { + ais_rc = SA_AIS_ERR_BAD_HANDLE; + return ais_rc; + } + + if (stream->FetchAndDecreaseRefCounter(__func__, &sUpdated) != 0) { + // @stream is being used somewhere (>0), DO NOT delete this @stream. + // or @stream is being deleted on other thread (=-1) + ais_rc = SA_AIS_ERR_TRY_AGAIN; + return ais_rc; + } + if (client->FetchAndIncreaseRefCounter(__func__, &cUpdated) == -1) { ais_rc = SA_AIS_ERR_TRY_AGAIN; return ais_rc; @@ -1384,10 +1442,13 @@ SaAisErrorT LogAgent::saLogStreamClose(SaLogStreamHandleT logStreamHandle) { } // This client is failed to do recover in Recovery thread. - // Remove this client from the database. if (client->is_recovery_failed() == true) { ScopeLock critical_section(get_delete_obj_sync_mutex_); - RemoveLogClient(&client); + // Mark this client died, and it will be removed from client db later. + client->died(); + client = nullptr; + stream = nullptr; + has_dead_clients = true; ais_rc = SA_AIS_ERR_BAD_HANDLE; return ais_rc; } diff --git a/src/log/agent/lga_agent.h b/src/log/agent/lga_agent.h index 9f1c370..817a357 100644 --- a/src/log/agent/lga_agent.h +++ b/src/log/agent/lga_agent.h @@ -159,6 +159,8 @@ class LogAgent { void EnterCriticalSection(); void LeaveCriticalSection(); + void RemoveAllDeadLogClients(); + private: // Not allow to create @LogAgent object, except the singleton object @me_. LogAgent(); @@ -272,6 +274,9 @@ class LogAgent { // Hold list of current log clients std::vector<LogClient*> client_list_; + // true if there is any failed-to-recover client + bool has_dead_clients = false; + // LGS LGA sync params NCS_SEL_OBJ lgs_sync_sel_; diff --git a/src/log/agent/lga_client.cc b/src/log/agent/lga_client.cc index 386c849..3c3f893 100644 --- a/src/log/agent/lga_client.cc +++ b/src/log/agent/lga_client.cc @@ -32,7 +32,7 @@ // LogClient //-------------------------------------------------------------------------- ---- LogClient::LogClient(const SaLogCallbacksT* cb, uint32_t id, SaVersionT ver) - : client_id_{id}, ref_counter_object_{} { + : client_id_{id}, ref_counter_object_{}, died_{false} { TRACE_ENTER(); // Reset registered callback info memset(&callbacks_, 0, sizeof(callbacks_)); diff --git a/src/log/agent/lga_client.h b/src/log/agent/lga_client.h index 05349be..08137b7 100644 --- a/src/log/agent/lga_client.h +++ b/src/log/agent/lga_client.h @@ -173,6 +173,10 @@ class LogClient { // while the recovery thread is going at RecoveryState::kRecovery2. bool is_recovery_done() const; + // Invoke when the client is not recovered successfully. + void died() { died_ = true; } + bool is_died() const { return died_; } + private: // Search for @LogStreamInfo object from @stream_list_ based on @stream_id LogStreamInfo* SearchLogStreamInfoById(uint32_t); @@ -257,6 +261,9 @@ class LogClient { // LOG handle (derived from hdl-mngr) SaLogHandleT handle_; + // true if this client is not successfully recovered. + bool died_; + DELETE_COPY_AND_MOVE_OPERATORS(LogClient); }; -- 1.9.1
--- Begin Message ---Hi Canh + // [Canh1] Following the ticket #1396, As I remember that you, Ander, Vu + // and me had discussed a lot of about ticket 1396 and we had final agreement + // for the solution of ticket 1396: the mds connection never shutdown after first client + // is initialized although this client(last client) is finalized after that. + // This mean that when mds connection is initialized successfully, + // it will never be shutdown. So ncs_agents_shutdown() should not be called here + // if we finalize the last client, to keep the mds connection is alive Yes, you are right. So Ack. But please look at the minor comments I had as well Thanks Lennart > -----Original Message----- > From: Canh Van Truong [mailto:canh.v.tru...@dektech.com.au] > Sent: den 6 mars 2018 05:19 > To: Lennart Lund <lennart.l...@ericsson.com>; Vu Minh Nguyen > <vu.m.ngu...@dektech.com.au> > Cc: opensaf-devel@lists.sourceforge.net > Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash > after recovery fails [#2670] > > Hi Lennart, > > Please see my reply comment [Canh1] in the attachment. > > Thanks > Canh > > -----Original Message----- > From: Lennart Lund [mailto:lennart.l...@ericsson.com] > Sent: Monday, March 5, 2018 7:40 PM > To: Canh Van Truong <canh.v.tru...@dektech.com.au>; Vu Minh Nguyen > <vu.m.ngu...@dektech.com.au>; Lennart Lund > <lennart.l...@ericsson.com> > Cc: opensaf-devel@lists.sourceforge.net > Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash > after recovery fails [#2670] > > Hi Canh, > > I have added some new comments. See [Lennart1] in the attached .diff > Note that this patch contains all comments so I shall be applied alone, not > on top of the other comment patches > > Thanks > Lennart > > > -----Original Message----- > > From: Canh Van Truong [mailto:canh.v.tru...@dektech.com.au] > > Sent: den 2 mars 2018 07:29 > > To: Lennart Lund <lennart.l...@ericsson.com>; Vu Minh Nguyen > > <vu.m.ngu...@dektech.com.au> > > Cc: opensaf-devel@lists.sourceforge.net > > Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash > > after recovery fails [#2670] > > > > Hi Lennart, > > > > Please see my replied comment in attachment. > > > > Thanks > > Canh > > > > -----Original Message----- > > From: Lennart Lund [mailto:lennart.l...@ericsson.com] > > Sent: Thursday, February 22, 2018 5:00 PM > > To: Canh Van Truong <canh.v.tru...@dektech.com.au>; Vu Minh Nguyen > > <vu.m.ngu...@dektech.com.au> > > Cc: opensaf-devel@lists.sourceforge.net; Canh Van Truong > > <canh.v.tru...@dektech.com.au>; Lennart Lund > > <lennart.l...@ericsson.com> > > Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash > > after recovery fails [#2670] > > > > Hi Canh > > > > I have done a review and have some comments. See attached diff > > > > Thanks > > Lennart > > > > > -----Original Message----- > > > From: Canh Van Truong [mailto:canh.v.tru...@dektech.com.au] > > > Sent: den 9 november 2017 04:25 > > > To: Lennart Lund <lennart.l...@ericsson.com>; Vu Minh Nguyen > > > <vu.m.ngu...@dektech.com.au> > > > Cc: opensaf-devel@lists.sourceforge.net; Canh Van Truong > > > <canh.v.tru...@dektech.com.au> > > > Subject: [PATCH 0/1] Review Request for log: fix log agent may crash > after > > > recovery fails [#2670] > > > > > > Summary: log: fix log agent may crash after recovery fails [#2670] > > > Review request for Ticket(s): 2670 > > > Peer Reviewer(s): Lennart, Vu > > > Pull request to: Vu > > > Affected branch(es): develop, release > > > Development branch: ticket-2670 > > > Base revision: ce78275348c06f5d69577744f0dab525e79443e7 > > > Personal repository: git://git.code.sf.net/u/canht32/review > > > > > > -------------------------------- > > > Impacted area Impact y/n > > > -------------------------------- > > > Docs n > > > Build system n > > > RPM/packaging n > > > Configuration files n > > > Startup scripts n > > > SAF services y > > > OpenSAF services n > > > Core libraries n > > > Samples n > > > Tests n > > > Other n > > > > > > > > > Comments (indicate scope for each "y" above): > > > --------------------------------------------- > > > *** EXPLAIN/COMMENT THE PATCH SERIES HERE *** > > > > > > revision 980226974b1ab266b068eded487e6ce26b9645c0 > > > Author: Canh Van Truong <canh.v.tru...@dektech.com.au> > > > Date: Thu, 9 Nov 2017 10:14:53 +0700 > > > > > > log: fix log agent may crash after recovery fails [#2670] > > > > > > In log api, the client is deleted from the list in the agent after > > recovery fails. > > > there is no check if this client is used by other user. > > > > > > The patch fix to make sure that the deletion is just processed when it > is > > not > > > being used. > > > > > > > > > > > > Complete diffstat: > > > ------------------ > > > src/log/agent/lga_agent.cc | 62 ++++++++++++++++++++++++++++----- > -- > > -- > > > --------- > > > src/log/agent/lga_agent.h | 2 +- > > > src/log/agent/lga_client.h | 19 +++++--------- > > > src/log/agent/lga_util.cc | 25 +++---------------- > > > src/log/agent/lga_util.h | 3 +-- > > > 5 files changed, 50 insertions(+), 61 deletions(-) > > > > > > > > > Testing Commands: > > > ----------------- > > > *** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES *** > > > > > > > > > Testing, Expected Results: > > > -------------------------- > > > *** PASTE COMMAND OUTPUTS / TEST RESULTS *** > > > > > > > > > Conditions of Submission: > > > ------------------------- > > > Ack from reviewers > > > > > > > > > Arch Built Started Linux distro > > > ------------------------------------------- > > > mips n n > > > mips64 n n > > > x86 n n > > > x86_64 n n > > > powerpc n n > > > powerpc64 n n > > > > > > > > > Reviewer Checklist: > > > ------------------- > > > [Submitters: make sure that your review doesn't trigger any > checkmarks!] > > > > > > > > > Your checkin has not passed review because (see checked entries): > > > > > > ___ Your RR template is generally incomplete; it has too many blank > > entries > > > that need proper data filled in. > > > > > > ___ You have failed to nominate the proper persons for review and push. > > > > > > ___ Your patches do not have proper short+long header > > > > > > ___ You have grammar/spelling in your header that is unacceptable. > > > > > > ___ You have exceeded a sensible line length in your > > > headers/comments/text. > > > > > > ___ You have failed to put in a proper Trac Ticket # into your commits. > > > > > > ___ You have incorrectly put/left internal data in your comments/files > > > (i.e. internal bug tracking tool IDs, product names etc) > > > > > > ___ You have not given any evidence of testing beyond basic build tests. > > > Demonstrate some level of runtime or other sanity testing. > > > > > > ___ You have ^M present in some of your files. These have to be > removed. > > > > > > ___ You have needlessly changed whitespace or added whitespace > crimes > > > like trailing spaces, or spaces before tabs. > > > > > > ___ You have mixed real technical changes with whitespace and other > > > cosmetic code cleanup changes. These have to be separate commits. > > > > > > ___ You need to refactor your submission into logical chunks; there is > > > too much content into a single commit. > > > > > > ___ You have extraneous garbage in your review (merge commits etc) > > > > > > ___ You have giant attachments which should never have been sent; > > > Instead you should place your content in a public tree to be pulled. > > > > > > ___ You have too many commits attached to an e-mail; resend as > threaded > > > commits, or place in a public tree for a pull. > > > > > > ___ You have resent this content multiple times without a clear > indication > > > of what has changed between each re-send. > > > > > > ___ You have failed to adequately and individually address all of the > > > comments and change requests that were proposed in the initial > review. > > > > > > ___ You have a misconfigured ~/.gitconfig file (i.e. user.name, > user.email > > > etc) > > > > > > ___ Your computer have a badly configured date and time; confusing the > > > the threaded patch review. > > > > > > ___ Your changes affect IPC mechanism, and you don't present any > results > > > for in-service upgradability test. > > > > > > ___ Your changes affect user manual and documentation, your patch > series > > > do not contain the patch that updates the Doxygen manual.
--- End Message ---
------------------------------------------------------------------------------ Check out the vibrant tech community on one of the world's most engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________ Opensaf-devel mailing list Opensaf-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/opensaf-devel