pbrezina's pull request #12: "failover: proceed normally when no new server is 
found" was opened

PR body:
"""
https://fedorahosted.org/sssd/ticket/3131

I couldn't reproduce manually so I used the second patch as a by-code 
reproducer. If you apply the patch then sssd will try to resolve meta server 
twice simultaneously and triggering the problematic code path. You can search 
for RESOLV SRV DONE in logs.
"""

See the full pull-request at https://github.com/SSSD/sssd/pull/12
... or pull the PR as Git branch:
git remote add ghsssd https://github.com/SSSD/sssd
git fetch ghsssd pull/12/head:pr12
git checkout pr12
From 11b6d2b7623208c7a3e148f4128e9f49c35a6ce1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20B=C5=99ezina?= <pbrez...@redhat.com>
Date: Wed, 24 Aug 2016 14:21:12 +0200
Subject: [PATCH 1/2] failover: proceed normally when no new server is found

Multiple failover requests come in same time, the first one will
result in collapsing the meta server but multiple resolution of
SRV records are triggered. The first one finishes normally but the
others won't find any new server thus ends with an error.

This patch makes failover to proceed normally even in such case.

Resolves:
https://fedorahosted.org/sssd/ticket/3131
---
 src/providers/fail_over.c | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/src/providers/fail_over.c b/src/providers/fail_over.c
index 8ab39f2..7708409 100644
--- a/src/providers/fail_over.c
+++ b/src/providers/fail_over.c
@@ -1112,7 +1112,9 @@ fo_resolve_service_cont(struct tevent_req *subreq)
     ret = resolve_srv_recv(subreq, &state->server);
     talloc_zfree(subreq);
 
-    if (ret) {
+    /* We will proceed normally on ERR_SRV_DUPLICATES and if the server
+     * is already being resolved, we hook to that request. */
+    if (ret != EOK && ret != ERR_SRV_DUPLICATES) {
         tevent_req_error(req, ret);
         return;
     }
@@ -1398,11 +1400,23 @@ resolve_srv_done(struct tevent_req *subreq)
         }
 
         if (last_server == state->meta) {
-            /* SRV lookup returned only those servers
-             * that are already present. */
+            /* SRV lookup returned only those servers that are already present.
+             * This may happen only when an ongoing SRV resolution already
+             * exist. We will return server, but won't set any state. */
             DEBUG(SSSDBG_TRACE_FUNC, "SRV lookup did not return "
                                       "any new server.\n");
             ret = ERR_SRV_DUPLICATES;
+
+            /* Since no new server is returned, state->meta->next is NULL.
+             * We return last tried server if possible which is server
+             * from previous resolution of SRV record, and first server
+             * otherwise. */
+            if (state->service->last_tried_server != NULL) {
+                state->out = state->service->last_tried_server;
+                goto done;
+            }
+
+            state->out = state->service->server_list;
             goto done;
         }
 
@@ -1438,7 +1452,10 @@ resolve_srv_done(struct tevent_req *subreq)
     }
 
 done:
-    if (ret != EOK) {
+    if (ret == ERR_SRV_DUPLICATES) {
+        tevent_req_error(req, ret);
+        return;
+    } else if (ret != EOK) {
         state->out = state->meta;
         set_srv_data_status(state->meta->srv_data, SRV_RESOLVE_ERROR);
         tevent_req_error(req, ret);

From 4034fbb62417cf6ff0e13cb61a4401a488134a8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Pavel=20B=C5=99ezina?= <pbrez...@redhat.com>
Date: Wed, 31 Aug 2016 13:01:31 +0200
Subject: [PATCH 2/2] debug

---
 src/providers/fail_over.c                  | 4 ++++
 src/providers/ldap/sdap_async_connection.c | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/src/providers/fail_over.c b/src/providers/fail_over.c
index 7708409..f83b5a6 100644
--- a/src/providers/fail_over.c
+++ b/src/providers/fail_over.c
@@ -1274,6 +1274,8 @@ resolve_srv_send(TALLOC_CTX *mem_ctx, struct tevent_context *ev,
     state->fo_ctx = ctx;
     state->meta = server->srv_data->meta;
 
+    DEBUG(SSSDBG_IMPORTANT_INFO, "RESOLV SRV SEND\n");
+
     status = get_srv_data_status(server->srv_data);
     DEBUG(SSSDBG_FUNC_DATA, "The status of SRV lookup is %s\n",
           str_srv_data_status(status));
@@ -1356,6 +1358,8 @@ resolve_srv_done(struct tevent_req *subreq)
     int ret;
     uint32_t ttl;
 
+    DEBUG(SSSDBG_IMPORTANT_INFO, "RESOLV SRV DONE\n");
+
     ret = state->fo_ctx->srv_recv_fn(state, subreq, &dns_domain, &ttl,
                                      &primary_servers, &num_primary_servers,
                                      &backup_servers, &num_backup_servers);
diff --git a/src/providers/ldap/sdap_async_connection.c b/src/providers/ldap/sdap_async_connection.c
index a8d4262..67604e3 100644
--- a/src/providers/ldap/sdap_async_connection.c
+++ b/src/providers/ldap/sdap_async_connection.c
@@ -1509,6 +1509,9 @@ static int sdap_cli_resolve_next(struct tevent_req *req)
     subreq = be_resolve_server_send(state, state->ev,
                                     state->be, state->service->name,
                                     state->srv == NULL ? true : false);
+    subreq = be_resolve_server_send(state, state->ev,
+                                    state->be, state->service->name,
+                                    state->srv == NULL ? true : false);
     if (!subreq) {
         return ENOMEM;
     }
_______________________________________________
sssd-devel mailing list
sssd-devel@lists.fedorahosted.org
https://lists.fedorahosted.org/admin/lists/sssd-devel@lists.fedorahosted.org

Reply via email to