Hi a.Vu,

Looking at patch in ticket #2670, we can use ref_counter to decide to remove
client in case recovery fail. That way is simple.

Regards
Canh

-----Original Message-----
From: Vu Minh Nguyen <vu.m.ngu...@dektech.com.au> 
Sent: Tuesday, June 5, 2018 5:54 PM
To: lennart.l...@ericsson.com; canh.v.tru...@dektech.com.au
Cc: opensaf-devel@lists.sourceforge.net; Vu Minh Nguyen
<vu.m.ngu...@dektech.com.au>
Subject: [PATCH 1/1] log: fix restore ref counter for deleted stream [#2870]

In the methods LogAgent::saLogStreamClose() and
LogAgent::saLogWriteLogAsync(),
the client is deleted if failed to recover; however, the pointer to the log
stream of this client has not been reset. Therefore, when the destrustor of
ScopeData runs, the reference counter could be restored on deleted log
stream.

Besides, there are possibilities of race condition b/w deleting the
failed-to-recovery client and using that deleted client in other thread.

This patch introduces a new attribute to LogClient. When the client fails
to recover, that client is not removed at that time but is done at next call
of log handle initialize or finalize if the attribute is true.
---
 src/log/agent/lga_agent.cc  | 87
++++++++++++++++++++++++++++++++++++++-------
 src/log/agent/lga_agent.h   |  5 +++
 src/log/agent/lga_client.cc |  2 +-
 src/log/agent/lga_client.h  |  7 ++++
 4 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/src/log/agent/lga_agent.cc b/src/log/agent/lga_agent.cc
index f33b5dc..831edfa 100644
--- a/src/log/agent/lga_agent.cc
+++ b/src/log/agent/lga_agent.cc
@@ -255,6 +255,21 @@ void LogAgent::RemoveAllLogClients() {
   }
 }
 
+void LogAgent::RemoveAllDeadLogClients() {
+  TRACE_ENTER();
+  ScopeLock scopeLock(mutex_);
+  auto it = client_list_.begin();
+  while (it != client_list_.end()) {
+    if ((*it)->is_died() == true) {
+      client_list_.erase(it);
+      delete *it;
+      it = client_list_.begin();
+      continue;
+    }
+    it++;
+  }
+}
+
 // Add one client @client to the list @client_list_
 // This method should be called in @saLogInitialize() context
 void LogAgent::AddLogClient(LogClient* client) {
@@ -314,6 +329,14 @@ SaAisErrorT LogAgent::saLogInitialize(SaLogHandleT*
logHandle,
 
   TRACE_ENTER();
 
+  if (true) {
+    ScopeLock critical_section(get_delete_obj_sync_mutex_);
+    if (has_dead_clients == true) {
+      RemoveAllDeadLogClients();
+      has_dead_clients = false;
+    }
+  }
+
   // Verify parameters (log handle and that version is given)
   if ((logHandle == nullptr) || (version == nullptr)) {
     TRACE("version or handle FAILED");
@@ -453,6 +476,11 @@ SaAisErrorT LogAgent::saLogSelectionObjectGet(
       return ais_rc;
     }
 
+    if (client->is_died() == true) {
+      ais_rc = SA_AIS_ERR_BAD_HANDLE;
+      return ais_rc;
+    }
+
     if (client->FetchAndIncreaseRefCounter(__func__, &updated) == -1) {
       // @client is being deleted. Don't touch @this client
       ais_rc = SA_AIS_ERR_TRY_AGAIN;
@@ -504,6 +532,11 @@ SaAisErrorT LogAgent::saLogDispatch(SaLogHandleT
logHandle,
       return ais_rc;
     }
 
+    if (client->is_died() == true) {
+      ais_rc = SA_AIS_ERR_BAD_HANDLE;
+      return ais_rc;
+    }
+
     if (client->FetchAndIncreaseRefCounter(__func__, &updated) == -1) {
       // @client is being deleted. DO NOT touch this @client
       ais_rc = SA_AIS_ERR_TRY_AGAIN;
@@ -584,6 +617,11 @@ SaAisErrorT LogAgent::saLogFinalize(SaLogHandleT
logHandle) {
   if (true) {
     ScopeLock critical_section(get_delete_obj_sync_mutex_);
 
+    if (has_dead_clients == true) {
+      RemoveAllDeadLogClients();
+      has_dead_clients = false;
+    }
+
     // Get log client from the global list
     client = SearchClientByHandle(logHandle);
     if (client == nullptr) {
@@ -852,6 +890,11 @@ SaAisErrorT LogAgent::saLogStreamOpen_2(
       return ais_rc;
     }
 
+    if (client->is_died() == true) {
+      ais_rc = SA_AIS_ERR_BAD_HANDLE;
+      return ais_rc;
+    }
+
     if (client->FetchAndIncreaseRefCounter(__func__, &updated) == -1) {
       // @client is being deleted. DO NOT touch this @client
       ais_rc = SA_AIS_ERR_TRY_AGAIN;
@@ -892,10 +935,12 @@ SaAisErrorT LogAgent::saLogStreamOpen_2(
     }
 
     // This client is failed to do recover in Recovery thread.
-    // Remove this client from the database.
     if (client->is_recovery_failed() == true) {
       ScopeLock critical_section(get_delete_obj_sync_mutex_);
-      RemoveLogClient(&client);
+      // Mark this client died, and it will be removed from client db
later.
+      client->died();
+      client = nullptr;
+      has_dead_clients = true;
       ais_rc = SA_AIS_ERR_BAD_HANDLE;
       return ais_rc;
     }
@@ -1160,6 +1205,11 @@ SaAisErrorT
LogAgent::saLogWriteLogAsync(SaLogStreamHandleT logStreamHandle,
       return ais_rc;
     }
 
+    if (client->is_died() == true) {
+      ais_rc = SA_AIS_ERR_BAD_HANDLE;
+      return ais_rc;
+    }
+
     if (client->FetchAndIncreaseRefCounter(__func__, &cUpdated) == -1) {
       // @client is being deleted. DO NOT touch this @client
       ais_rc = SA_AIS_ERR_TRY_AGAIN;
@@ -1263,10 +1313,13 @@ SaAisErrorT
LogAgent::saLogWriteLogAsync(SaLogStreamHandleT logStreamHandle,
     }
 
     // This client is failed to do recover in Recovery thread.
-    // Remove this client from the database.
     if (client->is_recovery_failed() == true) {
       ScopeLock critical_section(get_delete_obj_sync_mutex_);
-      RemoveLogClient(&client);
+      // Mark this client died, and it will be removed from client db
later.
+      client->died();
+      client = nullptr;
+      stream = nullptr;
+      has_dead_clients = true;
       ais_rc = SA_AIS_ERR_BAD_HANDLE;
       return ais_rc;
     }
@@ -1321,13 +1374,6 @@ SaAisErrorT
LogAgent::saLogStreamClose(SaLogStreamHandleT logStreamHandle) {
       return ais_rc;
     }
 
-    if (stream->FetchAndDecreaseRefCounter(__func__, &sUpdated) != 0) {
-      // @stream is being used somewhere (>0), DO NOT delete this @stream.
-      // or @stream is being deleted on other thread (=-1)
-      ais_rc = SA_AIS_ERR_TRY_AGAIN;
-      return ais_rc;
-    }
-
     // Retrieve @LogClient obj based on @client_handle
     client = SearchClientByHandle(stream->GetMyClientHandle());
     if (client == nullptr) {
@@ -1336,6 +1382,18 @@ SaAisErrorT
LogAgent::saLogStreamClose(SaLogStreamHandleT logStreamHandle) {
       return ais_rc;
     }
 
+    if (client->is_died() == true) {
+      ais_rc = SA_AIS_ERR_BAD_HANDLE;
+      return ais_rc;
+    }
+
+    if (stream->FetchAndDecreaseRefCounter(__func__, &sUpdated) != 0) {
+      // @stream is being used somewhere (>0), DO NOT delete this @stream.
+      // or @stream is being deleted on other thread (=-1)
+      ais_rc = SA_AIS_ERR_TRY_AGAIN;
+      return ais_rc;
+    }
+
     if (client->FetchAndIncreaseRefCounter(__func__, &cUpdated) == -1) {
       ais_rc = SA_AIS_ERR_TRY_AGAIN;
       return ais_rc;
@@ -1384,10 +1442,13 @@ SaAisErrorT
LogAgent::saLogStreamClose(SaLogStreamHandleT logStreamHandle) {
     }
 
     // This client is failed to do recover in Recovery thread.
-    // Remove this client from the database.
     if (client->is_recovery_failed() == true) {
       ScopeLock critical_section(get_delete_obj_sync_mutex_);
-      RemoveLogClient(&client);
+      // Mark this client died, and it will be removed from client db
later.
+      client->died();
+      client = nullptr;
+      stream = nullptr;
+      has_dead_clients = true;
       ais_rc = SA_AIS_ERR_BAD_HANDLE;
       return ais_rc;
     }
diff --git a/src/log/agent/lga_agent.h b/src/log/agent/lga_agent.h
index 9f1c370..817a357 100644
--- a/src/log/agent/lga_agent.h
+++ b/src/log/agent/lga_agent.h
@@ -159,6 +159,8 @@ class LogAgent {
   void EnterCriticalSection();
   void LeaveCriticalSection();
 
+  void RemoveAllDeadLogClients();
+
  private:
   // Not allow to create @LogAgent object, except the singleton object
@me_.
   LogAgent();
@@ -272,6 +274,9 @@ class LogAgent {
   // Hold list of current log clients
   std::vector<LogClient*> client_list_;
 
+  // true if there is any failed-to-recover client
+  bool has_dead_clients = false;
+
   // LGS LGA sync params
   NCS_SEL_OBJ lgs_sync_sel_;
 
diff --git a/src/log/agent/lga_client.cc b/src/log/agent/lga_client.cc
index 386c849..3c3f893 100644
--- a/src/log/agent/lga_client.cc
+++ b/src/log/agent/lga_client.cc
@@ -32,7 +32,7 @@
 // LogClient
 
//--------------------------------------------------------------------------
----
 LogClient::LogClient(const SaLogCallbacksT* cb, uint32_t id, SaVersionT
ver)
-    : client_id_{id}, ref_counter_object_{} {
+    : client_id_{id}, ref_counter_object_{}, died_{false} {
   TRACE_ENTER();
   // Reset registered callback info
   memset(&callbacks_, 0, sizeof(callbacks_));
diff --git a/src/log/agent/lga_client.h b/src/log/agent/lga_client.h
index 05349be..08137b7 100644
--- a/src/log/agent/lga_client.h
+++ b/src/log/agent/lga_client.h
@@ -173,6 +173,10 @@ class LogClient {
   // while the recovery thread is going at RecoveryState::kRecovery2.
   bool is_recovery_done() const;
 
+  // Invoke when the client is not recovered successfully.
+  void died() { died_ = true; }
+  bool is_died() const { return died_; }
+
  private:
   // Search for @LogStreamInfo object from @stream_list_ based on
@stream_id
   LogStreamInfo* SearchLogStreamInfoById(uint32_t);
@@ -257,6 +261,9 @@ class LogClient {
   // LOG handle (derived from hdl-mngr)
   SaLogHandleT handle_;
 
+  // true if this client is not successfully recovered.
+  bool died_;
+
   DELETE_COPY_AND_MOVE_OPERATORS(LogClient);
 };
 
-- 
1.9.1

--- Begin Message ---
Hi Canh

+      // [Canh1] Following the ticket #1396, As I remember that you, Ander,
Vu
+      // and me had discussed a lot of about ticket 1396 and we had final
agreement
+      // for the solution of ticket 1396: the mds connection never shutdown
after first client
+      // is initialized although this client(last client) is finalized
after that.
+      // This mean that  when mds connection is initialized successfully,
+      // it will never be shutdown. So ncs_agents_shutdown() should not be
called here
+      // if we finalize the last client, to keep the mds connection is
alive

Yes, you are right.  So Ack.
But please look at the minor comments I had as well

Thanks
Lennart

> -----Original Message-----
> From: Canh Van Truong [mailto:canh.v.tru...@dektech.com.au]
> Sent: den 6 mars 2018 05:19
> To: Lennart Lund <lennart.l...@ericsson.com>; Vu Minh Nguyen
> <vu.m.ngu...@dektech.com.au>
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash
> after recovery fails [#2670]
> 
> Hi Lennart,
> 
> Please see my reply comment [Canh1] in the attachment.
> 
> Thanks
> Canh
> 
> -----Original Message-----
> From: Lennart Lund [mailto:lennart.l...@ericsson.com]
> Sent: Monday, March 5, 2018 7:40 PM
> To: Canh Van Truong <canh.v.tru...@dektech.com.au>; Vu Minh Nguyen
> <vu.m.ngu...@dektech.com.au>; Lennart Lund
> <lennart.l...@ericsson.com>
> Cc: opensaf-devel@lists.sourceforge.net
> Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash
> after recovery fails [#2670]
> 
> Hi Canh,
> 
> I have added some new comments. See [Lennart1] in the attached .diff
> Note that this patch contains all comments so I shall be applied alone,
not
> on top of the other comment patches
> 
> Thanks
> Lennart
> 
> > -----Original Message-----
> > From: Canh Van Truong [mailto:canh.v.tru...@dektech.com.au]
> > Sent: den 2 mars 2018 07:29
> > To: Lennart Lund <lennart.l...@ericsson.com>; Vu Minh Nguyen
> > <vu.m.ngu...@dektech.com.au>
> > Cc: opensaf-devel@lists.sourceforge.net
> > Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash
> > after recovery fails [#2670]
> >
> > Hi Lennart,
> >
> > Please see my replied comment in attachment.
> >
> > Thanks
> > Canh
> >
> > -----Original Message-----
> > From: Lennart Lund [mailto:lennart.l...@ericsson.com]
> > Sent: Thursday, February 22, 2018 5:00 PM
> > To: Canh Van Truong <canh.v.tru...@dektech.com.au>; Vu Minh Nguyen
> > <vu.m.ngu...@dektech.com.au>
> > Cc: opensaf-devel@lists.sourceforge.net; Canh Van Truong
> > <canh.v.tru...@dektech.com.au>; Lennart Lund
> > <lennart.l...@ericsson.com>
> > Subject: RE: [PATCH 0/1] Review Request for log: fix log agent may crash
> > after recovery fails [#2670]
> >
> > Hi Canh
> >
> > I have done a review and have some comments. See attached diff
> >
> > Thanks
> > Lennart
> >
> > > -----Original Message-----
> > > From: Canh Van Truong [mailto:canh.v.tru...@dektech.com.au]
> > > Sent: den 9 november 2017 04:25
> > > To: Lennart Lund <lennart.l...@ericsson.com>; Vu Minh Nguyen
> > > <vu.m.ngu...@dektech.com.au>
> > > Cc: opensaf-devel@lists.sourceforge.net; Canh Van Truong
> > > <canh.v.tru...@dektech.com.au>
> > > Subject: [PATCH 0/1] Review Request for log: fix log agent may crash
> after
> > > recovery fails [#2670]
> > >
> > > Summary: log: fix log agent may crash after recovery fails [#2670]
> > > Review request for Ticket(s): 2670
> > > Peer Reviewer(s): Lennart, Vu
> > > Pull request to: Vu
> > > Affected branch(es): develop, release
> > > Development branch: ticket-2670
> > > Base revision: ce78275348c06f5d69577744f0dab525e79443e7
> > > Personal repository: git://git.code.sf.net/u/canht32/review
> > >
> > > --------------------------------
> > > Impacted area       Impact y/n
> > > --------------------------------
> > >  Docs                    n
> > >  Build system            n
> > >  RPM/packaging           n
> > >  Configuration files     n
> > >  Startup scripts         n
> > >  SAF services            y
> > >  OpenSAF services        n
> > >  Core libraries          n
> > >  Samples                 n
> > >  Tests                   n
> > >  Other                   n
> > >
> > >
> > > Comments (indicate scope for each "y" above):
> > > ---------------------------------------------
> > > *** EXPLAIN/COMMENT THE PATCH SERIES HERE ***
> > >
> > > revision 980226974b1ab266b068eded487e6ce26b9645c0
> > > Author:   Canh Van Truong <canh.v.tru...@dektech.com.au>
> > > Date:     Thu, 9 Nov 2017 10:14:53 +0700
> > >
> > > log: fix log agent may crash after recovery fails [#2670]
> > >
> > > In log api, the client is deleted from the list in the agent after
> > recovery fails.
> > > there is no check if this client is used by other user.
> > >
> > > The patch fix to make sure that the deletion is just processed when it
> is
> > not
> > > being used.
> > >
> > >
> > >
> > > Complete diffstat:
> > > ------------------
> > >  src/log/agent/lga_agent.cc | 62 ++++++++++++++++++++++++++++-----
> --
> > --
> > > ---------
> > >  src/log/agent/lga_agent.h  |  2 +-
> > >  src/log/agent/lga_client.h | 19 +++++---------
> > >  src/log/agent/lga_util.cc  | 25 +++----------------
> > >  src/log/agent/lga_util.h   |  3 +--
> > >  5 files changed, 50 insertions(+), 61 deletions(-)
> > >
> > >
> > > Testing Commands:
> > > -----------------
> > > *** LIST THE COMMAND LINE TOOLS/STEPS TO TEST YOUR CHANGES ***
> > >
> > >
> > > Testing, Expected Results:
> > > --------------------------
> > > *** PASTE COMMAND OUTPUTS / TEST RESULTS ***
> > >
> > >
> > > Conditions of Submission:
> > > -------------------------
> > > Ack from reviewers
> > >
> > >
> > > Arch      Built     Started    Linux distro
> > > -------------------------------------------
> > > mips        n          n
> > > mips64      n          n
> > > x86         n          n
> > > x86_64      n          n
> > > powerpc     n          n
> > > powerpc64   n          n
> > >
> > >
> > > Reviewer Checklist:
> > > -------------------
> > > [Submitters: make sure that your review doesn't trigger any
> checkmarks!]
> > >
> > >
> > > Your checkin has not passed review because (see checked entries):
> > >
> > > ___ Your RR template is generally incomplete; it has too many blank
> > entries
> > >     that need proper data filled in.
> > >
> > > ___ You have failed to nominate the proper persons for review and
push.
> > >
> > > ___ Your patches do not have proper short+long header
> > >
> > > ___ You have grammar/spelling in your header that is unacceptable.
> > >
> > > ___ You have exceeded a sensible line length in your
> > > headers/comments/text.
> > >
> > > ___ You have failed to put in a proper Trac Ticket # into your
commits.
> > >
> > > ___ You have incorrectly put/left internal data in your comments/files
> > >     (i.e. internal bug tracking tool IDs, product names etc)
> > >
> > > ___ You have not given any evidence of testing beyond basic build
tests.
> > >     Demonstrate some level of runtime or other sanity testing.
> > >
> > > ___ You have ^M present in some of your files. These have to be
> removed.
> > >
> > > ___ You have needlessly changed whitespace or added whitespace
> crimes
> > >     like trailing spaces, or spaces before tabs.
> > >
> > > ___ You have mixed real technical changes with whitespace and other
> > >     cosmetic code cleanup changes. These have to be separate commits.
> > >
> > > ___ You need to refactor your submission into logical chunks; there is
> > >     too much content into a single commit.
> > >
> > > ___ You have extraneous garbage in your review (merge commits etc)
> > >
> > > ___ You have giant attachments which should never have been sent;
> > >     Instead you should place your content in a public tree to be
pulled.
> > >
> > > ___ You have too many commits attached to an e-mail; resend as
> threaded
> > >     commits, or place in a public tree for a pull.
> > >
> > > ___ You have resent this content multiple times without a clear
> indication
> > >     of what has changed between each re-send.
> > >
> > > ___ You have failed to adequately and individually address all of the
> > >     comments and change requests that were proposed in the initial
> review.
> > >
> > > ___ You have a misconfigured ~/.gitconfig file (i.e. user.name,
> user.email
> > > etc)
> > >
> > > ___ Your computer have a badly configured date and time; confusing the
> > >     the threaded patch review.
> > >
> > > ___ Your changes affect IPC mechanism, and you don't present any
> results
> > >     for in-service upgradability test.
> > >
> > > ___ Your changes affect user manual and documentation, your patch
> series
> > >     do not contain the patch that updates the Doxygen manual.


--- End Message ---
------------------------------------------------------------------------------
Check out the vibrant tech community on one of the world's most
engaging tech sites, Slashdot.org! http://sdm.link/slashdot
_______________________________________________
Opensaf-devel mailing list
Opensaf-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/opensaf-devel

Reply via email to