Hi, Josh, George,

Thank you very much for your answers to my questions.

I now have three more questions.
I would like to post them in the next few days.

From now on, I will try to use checkpoint/restart as much as possible.

I tried to fix some of the bugs,
but I could not find clean fixes for them.
For reference, I have included a temporary, rough proof-of-concept patch in this mail.

Regarding my 7th question,
my debugging was mistaken.

Using ompi_ddt_type_size may be correct after all,
and the receiver did receive the 14-byte message correctly in the bkmrk component.

DEBUG:wait_quiesce_drained:WCH x=1 a=4329.100098 y=2 b=8474.730469 c=48
--> Received messages are contiguous.

DEBUG:drain_message_find:My=1 drain_msg=184a280 [peer=0/0 count=1/1 comm=602500 ID 
0/0/0 R=1/1 tag=1000/1000 ddt_size=14/14 [datatype->size=1]]
[done=1 active=0 already_posted=0]
--> Matching is correct.

DEBUG:ompi_ddt_copy_content_same_ddt:Start size=14 flag=102/4 count=1

I think the problem is in the ompi_ddt_copy_content_same_ddt function,
when the message is copied from the drain_list to the user buffer.

Please also check my 8th question later.
I think the 8th question (problem) is similar to the 7th.
--- crcp_bkmrk_pml.c_orig       2009-12-09 05:36:26.000000000 +0900
+++ crcp_bkmrk_pml.c    2010-03-23 17:59:48.412809557 +0900
@@ -175,7 +175,10 @@
                                 opal_list_t * to_list,
                                 ompi_crcp_bkmrk_pml_traffic_message_ref_t 
**new_msg_ref,
                                 bool keep_active, /* If you have to create a 
new context, should it be initialized to active? */
-                                bool remove); /* Remove the original? - false 
= copy() */
+                                bool remove
+,int tag  /* seki : for Q5 */
+,int rank /* seki : for Q5 */
+); /* Remove the original? - false = copy() */

 /*
  * Traffic Message: Strip off the first matching request
@@ -829,6 +832,9 @@
     msg_ref->tag        = 0;
     msg_ref->rank       = 0;
     msg_ref->comm       = NULL;
+msg_ref->c_contextid = 0; /* seki  : for Q4 */
+msg_ref->c_my_rank   = 0; /* seki  : for Q4 */
+msg_ref->c_flags     = 0; /* seki  : for Q4 */

     OBJ_CONSTRUCT(&msg_ref->msg_contents, opal_list_t);

@@ -853,6 +859,9 @@
     msg_ref->tag        = 0;
     msg_ref->rank       = 0;
     msg_ref->comm       = NULL;
+msg_ref->c_contextid = 0; /* seki  : for Q4 */
+msg_ref->c_my_rank   = 0; /* seki  : for Q4 */
+msg_ref->c_flags     = 0; /* seki  : for Q4 */

     while( NULL != (item = opal_list_remove_first(&(msg_ref->msg_contents)) ) 
) {
         HOKE_CONTENT_REF_RETURN(item);
@@ -917,6 +926,9 @@
     msg_ref->tag        = 0;
     msg_ref->rank       = 0;
     msg_ref->comm       = NULL;
+msg_ref->c_contextid = 0; /* seki  : for Q4 */
+msg_ref->c_my_rank   = 0; /* seki  : for Q4 */
+msg_ref->c_flags     = 0; /* seki  : for Q4 */

     while( NULL != (item = opal_list_remove_first(&(msg_ref->msg_contents)) ) 
) {
         HOKE_CONTENT_REF_RETURN(item);
@@ -1019,6 +1031,8 @@
    msg_ref->tag     = v_tag;                                     \
    msg_ref->rank    = v_rank;                                    \
    msg_ref->comm    = v_comm;                                    \
+msg_ref->c_contextid = v_comm->c_contextid; /* seki : for Q4 */\
+msg_ref->c_my_rank   = v_comm->c_my_rank;   /* seki : for Q4 */\
                                                                  \
    msg_ref->proc_name.jobid  = p_jobid;                          \
    msg_ref->proc_name.vpid   = p_vpid;                           \
@@ -1047,6 +1061,8 @@
    msg_ref->tag     = v_tag;                                     \
    msg_ref->rank    = v_rank;                                    \
    msg_ref->comm    = v_comm;                                    \
+msg_ref->c_contextid = v_comm->c_contextid; /* seki : for Q4 */\
+msg_ref->c_my_rank   = v_comm->c_my_rank;   /* seki : for Q4 */\
                                                                  \
    msg_ref->proc_name.jobid  = p_jobid;                          \
    msg_ref->proc_name.vpid   = p_vpid;                           \
@@ -2313,7 +2329,10 @@
                                  peer_ref, &(peer_ref->recv_init_list),
                                  &new_msg_ref,
                                  true,
-                                 false);
+                                 false
+,(int)request->req_status.MPI_TAG    /* seki for Q5 */
+,(int)request->req_status.MPI_SOURCE /* seki for Q5 */
+);
             msg_ref = new_msg_ref;
         }
     }
@@ -2558,7 +2577,10 @@
                                  peer_ref, &(peer_ref->irecv_list),
                                  &new_msg_ref,
                                  true,
-                                 true);
+                                 true
+,(int)request->req_status.MPI_TAG    /* seki : for Q5 */
+,(int)request->req_status.MPI_SOURCE /* seki : for Q5 */
+);
             msg_ref = new_msg_ref;
         }
     }
@@ -2735,7 +2757,10 @@
                                  peer_ref, &(peer_ref->recv_list),
                                  &new_msg_ref,
                                  false,
-                                 true);
+                                 true
+,(int)status->MPI_TAG
+,(int)status->MPI_SOURCE
+);
             new_msg_ref->done++;
             new_msg_ref->active--;
         } else {
@@ -3269,12 +3294,16 @@
                                 opal_list_t * to_list,
                                 ompi_crcp_bkmrk_pml_traffic_message_ref_t 
**new_msg_ref,
                                 bool keep_active,
-                                bool remove)
+                                bool remove
+,int work_tag  /* seki : for Q5 */
+,int work_rank /* seki : for Q5 */
+)
 {
     int ret, exit_status = ORTE_SUCCESS;
     ompi_crcp_bkmrk_pml_message_content_ref_t *new_content = NULL, 
*prev_content = NULL;
     ompi_request_t *request = NULL;
     bool loc_already_drained = false;
+int temp_tag,temp_rank; /* seki : for Q5 */

     /* Append to the to_peer_ref */
     if( COORD_MSG_TYPE_B_RECV != msg_type ) {
@@ -3289,13 +3318,15 @@
         }
     }

+temp_tag  = (old_msg_ref->tag == MPI_ANY_TAG)     ? work_tag  : 
old_msg_ref->tag; /* seki : for Q5 */
+temp_rank = (old_msg_ref->rank == MPI_ANY_SOURCE) ? work_rank : 
old_msg_ref->rank; /* seki : for Q5 */
     ret = traffic_message_append(to_peer_ref, to_list,
                                  old_msg_ref->msg_type,
                                  old_msg_ref->count,
                                  NULL,
                                  old_msg_ref->ddt_size,
-                                 old_msg_ref->tag,
-                                 old_msg_ref->rank,
+                                 temp_tag,   /* seki : for Q5 */
+                                 temp_rank,  /* seki : for Q5 */
                                  old_msg_ref->comm,
                                  new_msg_ref);

@@ -3713,7 +3744,7 @@
         }

         if(msg_ref->count == count  &&
-           (NULL != msg_ref->comm && msg_ref->comm->c_contextid == comm_id) && 
+(msg_ref->c_contextid == comm_id) && /* seki : for Q4 */
            (msg_ref->tag  == MPI_ANY_TAG || msg_ref->tag  == tag)   &&
            (peer          == INVALID_INT || msg_ref->rank == peer)  &&
            msg_ref->ddt_size == ddt_size) {
@@ -4036,6 +4067,7 @@

     memcpy(&(content_ref->status), &drain_content_ref->status, 
sizeof(ompi_status_public_t)); 

+#if 0 /* seki */
     if( 0 != (ret = ompi_ddt_copy_content_same_ddt(drain_msg_ref->datatype,
                                                    drain_msg_ref->count,
                                                    content_ref->buffer,
@@ -4045,6 +4077,10 @@
                      ret);
         exit_status = ret;
     }
+#else
+/* seki : for Q2 : it is CONTIGUOUS message only! */
+memcpy((char *)content_ref->buffer,(char 
*)drain_content_ref->buffer,drain_msg_ref->ddt_size);
+#endif

     /* Remove the message from the list */
     drain_content_ref->request = NULL;
@@ -4081,7 +4117,7 @@
     }

     /* The buffer could be NULL - More likely when doing a count=0 type of 
message (e.g., Barrier) */
-    if( OPAL_LIKELY(NULL != buf) ) {
+    if( OPAL_LIKELY(NULL != buf) && (count != 0) ) { /* seki : for Q10 */
         if( 0 != (ret = ompi_ddt_copy_content_same_ddt(datatype, count,
                                                        (void*)buf, 
drain_content_ref->buffer) ) ) {
             opal_output( mca_crcp_bkmrk_component.super.output_handle,
@@ -4332,8 +4368,8 @@
         }

         /* Clear send_init_list */
-        for(rm_item  = opal_list_get_last(&peer_ref->send_list);
-            rm_item != opal_list_get_begin(&peer_ref->send_list);
+        for(rm_item  = opal_list_get_last(&peer_ref->send_init_list); /* seki 
: for Q1 */
+            rm_item != opal_list_get_begin(&peer_ref->send_init_list); /* seki 
: for Q1 */
             rm_item  = opal_list_get_prev(rm_item) ) {
             msg_ref = (ompi_crcp_bkmrk_pml_traffic_message_ref_t*)rm_item;
             msg_ref->matched = 0;
@@ -4370,8 +4406,8 @@
         }

         /* Clear recv_init_list */
-        for(rm_item  = opal_list_get_last(&peer_ref->recv_list);
-            rm_item != opal_list_get_begin(&peer_ref->recv_list);
+        for(rm_item  = opal_list_get_last(&peer_ref->recv_init_list); /* seki 
: for Q1 */
+            rm_item != opal_list_get_begin(&peer_ref->recv_init_list); /* seki 
: for Q1 */
             rm_item  = opal_list_get_prev(rm_item) ) {
             msg_ref = (ompi_crcp_bkmrk_pml_traffic_message_ref_t*)rm_item;
             msg_ref->matched = 0;
@@ -5523,9 +5559,9 @@
      * - Communicator Context ID
      * - My Rank in Communicator
      */
-    comm_my_rank  = ompi_comm_rank(msg_ref->comm);
+comm_my_rank  = msg_ref->c_my_rank; /* seki : for Q4 */

-    PACK_BUFFER(buffer, msg_ref->comm->c_contextid, 1, OPAL_UINT32,
+    PACK_BUFFER(buffer, msg_ref->c_contextid, 1, OPAL_UINT32, /* seki : for Q4 
*/
                 "crcp:bkmrk: send_msg_details: Unable to pack communicator 
ID");
     PACK_BUFFER(buffer, comm_my_rank, 1, OPAL_INT,
                 "crcp:bkmrk: send_msg_details: Unable to pack comm rank ID");
@@ -5794,6 +5830,14 @@
     return exit_status;
 }

+/* seki : for Q3 This fix may be not correct. */
+#define WORK_GET_UNRES(n_l_u,d,m)       \
+{                                       \
+work1 = (n_l_u < (d-m)) ? n_l_u : (d-m);\
+if (work1 < 0) { work1=0; }             \
+  m     += work1;                       \
+  n_l_u -= work1;                       \
+}
 static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t 
*peer_ref,
                                     int rank, uint32_t comm_id, int tag,
                                     size_t count, size_t datatype_size,
@@ -5813,6 +5857,7 @@
     int num_still_active = 0;
     /* Number of drain messages posted */
     int num_posted = 0;
+int work1; /* seki : for Q3 This fix may be not correct. */

     *num_resolved = 0;
     num_left_unresolved = p_num_sent;
@@ -5865,28 +5910,28 @@
      * First pass: Count all 'done'
      */
     if( NULL != posted_recv_msg_ref ) {
-        posted_recv_msg_ref->matched += posted_recv_msg_ref->done;
-        num_left_unresolved          -= posted_recv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_recv_msg_ref->done,posted_recv_msg_ref->matched)
         TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_recv_msg_ref, "Ck.  Recv", 
true));
     }
     if( NULL != posted_irecv_msg_ref ) {
-        posted_irecv_msg_ref->matched += posted_irecv_msg_ref->done;
-        num_left_unresolved           -= posted_irecv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_irecv_msg_ref->done,posted_irecv_msg_ref->matched)
         TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_irecv_msg_ref, "Ck. iRecv", 
true));
     }
     if( NULL != posted_precv_msg_ref ) {
-        posted_precv_msg_ref->matched += posted_precv_msg_ref->done;
-        num_left_unresolved           -= posted_precv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_precv_msg_ref->done,posted_precv_msg_ref->matched)
         TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_precv_msg_ref, "Ck. pRecv", 
true));
     }
     if( NULL != posted_unknown_recv_msg_ref ) {
-        posted_unknown_recv_msg_ref->matched += 
posted_unknown_recv_msg_ref->done;
-        num_left_unresolved                  -= 
posted_unknown_recv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_unknown_recv_msg_ref->done,posted_unknown_recv_msg_ref->matched)
         TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_unknown_recv_msg_ref, "Ck. 
uRecv", true));
     }
     if( NULL != posted_unknown_precv_msg_ref ) {
-        posted_unknown_precv_msg_ref->matched += 
posted_unknown_precv_msg_ref->done;
-        num_left_unresolved                   -= 
posted_unknown_precv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_unknown_precv_msg_ref->done,posted_unknown_precv_msg_ref->matched)
         TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_unknown_precv_msg_ref, "Ck. 
upRecv", true));
     }

--- crcp_bkmrk_pml.h_orig       2009-12-09 05:36:26.000000000 +0900
+++ crcp_bkmrk_pml.h    2010-03-23 10:24:42.531147723 +0900
@@ -222,6 +222,9 @@

         /** Communicator pointer */
         ompi_communicator_t* comm;
+uint32_t c_contextid;  /* seki: for Q4 */
+int c_my_rank;         /* seki: for Q4 */
+uint32_t c_flags;      /* seki: for Q4 */

         /** Message Contents */
         opal_list_t msg_contents;
@@ -303,6 +306,9 @@

         /** Communicator pointer */
         ompi_communicator_t* comm;
+uint32_t c_contextid;  /* seki: for Q4 */
+int c_my_rank;         /* seki: for Q4 */
+uint32_t c_flags;      /* seki: for Q4 */

         /** Message Contents */
         opal_list_t msg_contents;

Reply via email to