James,

Please review the following uDAPL patch. Fixes my broken uAT retry code. 

Thanks,

-arlin

 
Signed-off-by: Arlin Davis <[EMAIL PROTECTED]>


Index: dapl/openib/dapl_ib_util.c
===================================================================
--- dapl/openib/dapl_ib_util.c  (revision 2970)
+++ dapl/openib/dapl_ib_util.c  (working copy)
@@ -128,21 +128,34 @@ int dapli_get_hca_addr( struct dapl_hca 
        at_comp.context = &at_rec; 
        at_rec.addr = &hca_ptr->hca_address;
        at_rec.wait_object = &hca_ptr->ib_trans.wait_object;
+       at_rec.hca_ptr = hca_ptr;
+       at_rec.retries = 0;
 
        /*  call with async_comp until the sync version works */
        status = ib_at_ips_by_gid(&hca_ptr->ib_trans.gid, 
&ipv4_addr->sin_addr.s_addr, 1, 
                                  &at_comp, &at_rec.req_id);
        
-       if (status < 0) 
+       if (status < 0) {
+               dapl_dbg_log (DAPL_DBG_TYPE_ERR, 
+                             " get_hca_addr: ERR ips_by_gid %d %s \n",
+                               status, strerror(errno));
                return 1;
+       }
  
-        if (status > 0)
-                dapli_ip_comp_handler(at_rec.req_id, (void*)ipv4_addr, status);
-       
-       /* wait for answer, 5 seconds max */
-       dat_status = dapl_os_wait_object_wait 
(&hca_ptr->ib_trans.wait_object,5000000);
-        
-       if ((dat_status != DAT_SUCCESS ) || (!ipv4_addr->sin_addr.s_addr)) 
+       dapl_dbg_log (DAPL_DBG_TYPE_UTIL, 
+                     " get_hca_addr: ips_by_gid ret %d at_rec %p -> id %lld\n",
+                       status, &at_rec, at_rec.req_id );
+
+        if (status > 0) { 
+                dapli_ip_comp_handler(at_rec.req_id, (void*)&at_rec, status);
+       } else {
+               dat_status = dapl_os_wait_object_wait(&hca_ptr->ib_trans.wait_object,500000);
+               /* wait did not succeed: cancel the request, then fall through to the address check */
+               if (dat_status != DAT_SUCCESS)
+                       ib_at_cancel(at_rec.req_id);
+       }
+
+       if (!ipv4_addr->sin_addr.s_addr) 
                return 1;
                
        return 0;
@@ -252,6 +265,13 @@ DAT_RETURN dapls_ib_open_hca (
                              ibv_get_device_name(hca_ptr->ib_trans.ib_dev) );
                goto bail;
        }
+
+       dapl_dbg_log(DAPL_DBG_TYPE_CM,
+                    " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
+                    hca_ptr->ib_trans.lid,
+                    (unsigned long 
long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
+                    (unsigned long 
long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
+
        /* get the IP address of the device */
        if (dapli_get_hca_addr(hca_ptr)) {
                dapl_dbg_log (DAPL_DBG_TYPE_ERR, 
@@ -282,11 +302,6 @@ DAT_RETURN dapls_ib_open_hca (
                      ((struct sockaddr_in 
*)&hca_ptr->hca_address)->sin_addr.s_addr >> 24 & 0xff,
                      hca_ptr->ib_trans.max_inline_send );
 
-       dapl_dbg_log(DAPL_DBG_TYPE_CM,
-                    " open_hca: LID 0x%x GID subnet %016llx id %016llx\n",
-                    hca_ptr->ib_trans.lid,
-                    (unsigned long 
long)bswap_64(hca_ptr->ib_trans.gid.global.subnet_prefix),
-                    (unsigned long 
long)bswap_64(hca_ptr->ib_trans.gid.global.interface_id) );
 
        return DAT_SUCCESS;
 
Index: dapl/openib/dapl_ib_cm.c
===================================================================
--- dapl/openib/dapl_ib_cm.c    (revision 2970)
+++ dapl/openib/dapl_ib_cm.c    (working copy)
@@ -158,19 +158,49 @@ void dapli_at_thread_destroy(void)
 void dapli_ip_comp_handler(uint64_t req_id, void *context, int rec_num)
 {
        struct dapl_at_record   *at_rec = context;
+       struct sockaddr_in      *ipv4_addr = (struct sockaddr_in*)at_rec->addr;
+       int                     status;
 
        dapl_dbg_log(DAPL_DBG_TYPE_CM,
-                    " ip_comp_handler: ctxt %p, req_id %lld rec_num %d\n",
-                    context, req_id, rec_num);
+                    " ip_comp_handler: at_rec %p ->id %lld id %lld rec_num %d 
%x\n",
+                    context, at_rec->req_id, req_id, rec_num,
+                    ipv4_addr->sin_addr.s_addr);
+
+        if (rec_num <= 0) {
+               struct ib_at_completion at_comp;
+
+                dapl_dbg_log(DAPL_DBG_TYPE_CM,
+                            " ip_comp_handler: resolution err %d retry %d\n",
+                            rec_num, at_rec->retries + 1);
+
+                if (++at_rec->retries > IB_MAX_AT_RETRY) 
+                        goto bail;
+
+               at_comp.fn = dapli_ip_comp_handler;
+               at_comp.context = at_rec;
+               ipv4_addr->sin_addr.s_addr = 0;
+
+               status = ib_at_ips_by_gid(&at_rec->hca_ptr->ib_trans.gid, 
+                                         &ipv4_addr->sin_addr.s_addr, 1,
+                                         &at_comp, &at_rec->req_id);
+               if (status < 0) 
+                       goto bail;
+
+               dapl_dbg_log (DAPL_DBG_TYPE_UTIL,
+                             " ip_comp_handler: NEW ips_by_gid ret %d at_rec 
%p -> id %lld\n",
+                             status, at_rec, at_rec->req_id );
+        } 
 
-       if ((at_rec) && ( at_rec->req_id == req_id)) {
+       if (ipv4_addr->sin_addr.s_addr)
                dapl_os_wait_object_wakeup(at_rec->wait_object);
-               return;
-       }
-       
-       dapl_dbg_log(DAPL_DBG_TYPE_ERR,
-                    " ip_comp_handler: at_rec->req_id %lld != req_id %lld\n",
-                    at_rec->req_id, req_id );
+
+       return;
+bail:
+       dapl_dbg_log(DAPL_DBG_TYPE_CM,
+                    " ip_comp_handler: ERR: at_rec  %p, req_id %lld rec_num 
%d\n",
+                    at_rec, req_id, rec_num);
+
+       dapl_os_wait_object_wakeup(at_rec->wait_object);
 }
 
 static void dapli_path_comp_handler(uint64_t req_id, void *context, int 
rec_num)
@@ -622,20 +652,21 @@ void cm_thread(void *arg) 
 
                dapl_os_unlock(&g_cm_lock);
                 ret = poll(&ufds, 1, -1); 
-               if ((ret <= 0) || (g_cm_destroy)) {
+               if (ret <= 0) {
                        dapl_dbg_log(DAPL_DBG_TYPE_CM,
                                     " cm_thread(%d): ERR %s poll\n",
                                     getpid(),strerror(errno));
                        dapl_os_lock(&g_cm_lock);
-                       break;
+                       continue;
                }
 
                dapl_dbg_log(DAPL_DBG_TYPE_CM,
                        " cm_thread: GET EVENT fd=%d n=%d\n",
                        ib_cm_get_fd(),ret);
+
                if (ib_cm_event_get_timed(0,&event)) { 
                        dapl_dbg_log(DAPL_DBG_TYPE_CM,
-                               " cm_thread: ERR %s eventi_get on %d\n", 
+                               " cm_thread: ERR %s event_get on %d\n", 
                                strerror(errno), ib_cm_get_fd() );
                        dapl_os_lock(&g_cm_lock);
                        continue;
Index: dapl/openib/dapl_ib_util.h
===================================================================
--- dapl/openib/dapl_ib_util.h  (revision 2970)
+++ dapl/openib/dapl_ib_util.h  (working copy)
@@ -97,6 +97,8 @@ struct dapl_at_record {
        uint64_t                req_id;
        DAT_SOCK_ADDR6          *addr;
        DAPL_OS_WAIT_OBJECT     *wait_object;
+       struct dapl_hca         *hca_ptr;
+       int                     retries;
 };
 
 /* 



_______________________________________________
openib-general mailing list
[email protected]
http://openib.org/mailman/listinfo/openib-general

To unsubscribe, please visit http://openib.org/mailman/listinfo/openib-general

Reply via email to