[ 
https://issues.apache.org/jira/browse/PROTON-2931?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=18077720#comment-18077720
 ] 

ASF GitHub Bot commented on PROTON-2931:
----------------------------------------

astitcher commented on code in PR #444:
URL: https://github.com/apache/qpid-proton/pull/444#discussion_r3174824981


##########
c/src/proactor/epoll.c:
##########
@@ -1464,6 +1469,7 @@ static void connection_lookup_done_lh(pconnection_t *pc, 
struct addrinfo *ai, in
 static void connection_done_cb(void *user_data, struct addrinfo *ai, int 
gai_error) {
   pconnection_t *pc = (pconnection_t *)user_data;
   lock(&pc->task.mutex);
+  pc->name_lookup_pending = false;

Review Comment:
   If this is within the lock, maybe it should be in the _lh function not here? 
Especially as setting the flag is done in a _lh function.



##########
c/src/proactor/epoll.c:
##########
@@ -1472,10 +1478,28 @@ static void connection_done_cb(void *user_data, struct 
addrinfo *ai, int gai_err
 // Return true if the socket is connecting and there are no Proton events to 
deliver.
 static bool pconnection_first_connect_lh(pconnection_t *pc) {
   pn_proactor_t *p = pc->task.proactor;
+  pn_transport_t *tp = pc->driver.transport;
+  pc->name_lookup_pending = true;
+
   unlock(&pc->task.mutex);
   bool rc = pni_name_lookup_start(&p->name_lookup, pc->host, pc->port, pc, 
connection_done_cb);
   lock(&pc->task.mutex);
-  return rc;
+
+  if (!rc) {
+    // Either the callback was synchronous or no callback was possible
+    if (pc->name_lookup_pending) {
+      // Clean up since there will be no callback.
+      pc->name_lookup_pending = false;
+      psocket_error(&pc->psocket, EAI_FAIL, "internal error on connect");
+    }
+    return false;
+  }
+  if (!pc->name_lookup_pending) {
+    // connection_done_cb already completed
+    if (pn_condition_is_set(pn_transport_condition(tp)))
+      return false;
+  }
+  return !pc->queued_disconnect && !pni_task_wake_pending(&pc->task);

Review Comment:
   This is pretty obscure compared to the previous return value (which would 
essentially be true at his point I think if there wer no errors from starting 
the lookup), perhaps an explanation in a comment is in order.
   
   



##########
c/src/proactor/epoll.c:
##########
@@ -1447,7 +1448,11 @@ bool schedule_if_inactive(pn_proactor_t *p) {
 static void connection_lookup_done_lh(pconnection_t *pc, struct addrinfo *ai, 
int gai_error) {
   pn_proactor_t *p = pc->task.proactor;
   bool notify = false;
-  if (gai_error) {
+
+  if (pconnection_rclosed(pc) && pconnection_wclosed(pc)) {

Review Comment:
   Is it possible to only have one of these set in this condition?



##########
c/src/proactor/epoll_raw_connection.c:
##########
@@ -224,21 +240,30 @@ pn_raw_connection_t *pn_raw_connection(void) {
   return &conn->raw_connection;
 }
 
-// Call from pconnection_process with task lock held.
-// Return true if the socket is connecting and there are no Proton events to 
deliver.
-static bool praw_connection_first_connect_lh(praw_connection_t *prc) {
+// Call from pconnection_process with no locks.
+// Callback may complete before pni_name_lookup_start returns.
+static void praw_connection_first_connect(praw_connection_t *prc) {
   const char *host;
   const char *port;

Review Comment:
   Actually these lines should also be just before they are used in (new) line 
253



##########
c/src/proactor/epoll_raw_connection.c:
##########
@@ -211,6 +224,9 @@ static void praw_initiate_cleanup(praw_connection_t *prc) {
     shutdown(prc->psocket.epoll_io.fd, SHUT_RDWR);
     return;
   }
+  if (prc->name_lookup_pending) {

Review Comment:
   What about the protecting lock?



##########
c/src/proactor/epoll_raw_connection.c:
##########
@@ -161,6 +173,7 @@ static void raw_connection_lookup_done_lh(praw_connection_t 
*prc, struct addrinf
 static void raw_connection_done_cb(void *user_data, struct addrinfo *ai, int 
gai_error) {
   praw_connection_t *prc = (praw_connection_t *)user_data;
   lock(&prc->task.mutex);
+  prc->name_lookup_pending = false;

Review Comment:
   As above shouldn't this be inside the _lh function? Even though there is no 
_lh function anymore for lookup start here.



##########
c/src/proactor/epoll_raw_connection.c:
##########
@@ -110,8 +111,14 @@ static void praw_connection_start(praw_connection_t *prc, 
int fd) {
 }
 
 /* Called on initial connect, and if connection fails to try another address */
+/* May be called within the praw_connection task or from an external 
name_lookup task */
 static void praw_connection_maybe_connect_lh(praw_connection_t *prc) {
+  int err = 0;

Review Comment:
   Move this to just before the while loop where it belongs, also no need to 
set it to 0 here - it is initialised first thing in the loop.



##########
c/src/proactor/epoll_raw_connection.c:
##########
@@ -224,21 +240,30 @@ pn_raw_connection_t *pn_raw_connection(void) {
   return &conn->raw_connection;
 }
 
-// Call from pconnection_process with task lock held.
-// Return true if the socket is connecting and there are no Proton events to 
deliver.
-static bool praw_connection_first_connect_lh(praw_connection_t *prc) {
+// Call from pconnection_process with no locks.
+// Callback may complete before pni_name_lookup_start returns.
+static void praw_connection_first_connect(praw_connection_t *prc) {
   const char *host;
   const char *port;
   pn_proactor_t *p = prc->task.proactor;
+  bool notify = false;

Review Comment:
   Move this down to where it's used - I think inside the if (!rc) block



##########
c/src/proactor/epoll_raw_connection.c:
##########
@@ -443,17 +468,30 @@ pn_event_batch_t *pni_raw_connection_process(task_t *t, 
uint32_t io_events, bool
     praw_initiate_cleanup(rc);
     return NULL;
   }
+  if (rc->task.closing) {
+    // rclosed and wclosed.  Allow final events to be processed.
+    unlock(&rc->task.mutex);
+    return &rc->batch;
+  }
   int events = io_events;
   int fd = rc->psocket.epoll_io.fd;
 
   if (rc->first_schedule) {
     rc->first_schedule = false;
     assert(!events); // No socket yet.
     assert(!rc->connected);
-    if (praw_connection_first_connect_lh(rc)) {
-      unlock(&rc->task.mutex);
-      return NULL;
+    bool wake_event = pni_task_wake_pending(&rc->task);
+
+    t->working = false;
+    rc->name_lookup_pending = true;
+    unlock(&rc->task.mutex);
+    praw_connection_first_connect(rc);
+    if (wake_event) {
+      lock(&rc->task.mutex);
+      t->working = true;
+      return &rc->batch;

Review Comment:
   It looks wrong to return without unlocking here - if it actually correct 
then a comment explaining why is needed.





> Epoll proactor has race conditions with the async c-ares name resolver library
> ------------------------------------------------------------------------------
>
>                 Key: PROTON-2931
>                 URL: https://issues.apache.org/jira/browse/PROTON-2931
>             Project: Qpid Proton
>          Issue Type: Bug
>          Components: proton-c
>    Affects Versions: proton-c-0.41.0
>            Reporter: Clifford Jansen
>            Assignee: Clifford Jansen
>            Priority: Blocker
>
> If the c-ares callback is very quick, the pn_raw_connection_t can sometimes 
> fail to schedule itself and hang while still in the connecting phase.  This 
> can be easily reproduced with a ulimit for open files of 1024 or less and the 
> following reproducer.
>   https://github.com/fgiorgetti/router-locust
> Conversely, if the callback is extremely slow, the connection can wind up and 
> free resources before the callback tries to reference through an invalid 
> pointer.  The connection should remember if a callback is pending and defer 
> any cleanup until this concludes.  This applies to raw and AMQP connections.



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to