Hi, Josh, George,
Thank you very much for answering my questions.
I now have three more questions,
which I would like to post in the next few days.
I will try to use checkpoint/restart after this as much as possible.
I tried to fix some bugs,
but I could not find clean fixes for them.
For reference, I have included a temporary, rough proof-of-concept patch in this mail.
Regarding my 7th question:
my debugging was mistaken.
Using ompi_ddt_type_size may be correct,
and the receiver received the 14-byte messages correctly in bkmrk.
DEBUG:wait_quiesce_drained:WCH x=1 a=4329.100098 y=2 b=8474.730469 c=48
--> Received messages are contiguous.
DEBUG:drain_message_find:My=1 drain_msg=184a280 [peer=0/0 count=1/1 comm=602500 ID
0/0/0 R=1/1 tag=1000/1000 ddt_size=14/14 [datatype->size=1]]
[done=1 active=0 already_posted=0]
--> Matching is correct.
DEBUG:ompi_ddt_copy_content_same_ddt:Start size=14 flag=102/4 count=1
I think the problem is in the ompi_ddt_copy_content_same_ddt function,
when the message is copied from the drain list to the user buffer.
Please also check my 8th question later.
I think the 8th question (problem) is similar to the 7th.
--- crcp_bkmrk_pml.c_orig 2009-12-09 05:36:26.000000000 +0900
+++ crcp_bkmrk_pml.c 2010-03-23 17:59:48.412809557 +0900
@@ -175,7 +175,10 @@
opal_list_t * to_list,
ompi_crcp_bkmrk_pml_traffic_message_ref_t
**new_msg_ref,
bool keep_active, /* If you have to create a
new context, should it be initialized to active? */
- bool remove); /* Remove the original? - false
= copy() */
+ bool remove
+,int tag /* seki : for Q5 */
+,int rank /* seki : for Q5 */
+); /* Remove the original? - false = copy() */
/*
* Traffic Message: Strip off the first matching request
@@ -829,6 +832,9 @@
msg_ref->tag = 0;
msg_ref->rank = 0;
msg_ref->comm = NULL;
+msg_ref->c_contextid = 0; /* seki : for Q4 */
+msg_ref->c_my_rank = 0; /* seki : for Q4 */
+msg_ref->c_flags = 0; /* seki : for Q4 */
OBJ_CONSTRUCT(&msg_ref->msg_contents, opal_list_t);
@@ -853,6 +859,9 @@
msg_ref->tag = 0;
msg_ref->rank = 0;
msg_ref->comm = NULL;
+msg_ref->c_contextid = 0; /* seki : for Q4 */
+msg_ref->c_my_rank = 0; /* seki : for Q4 */
+msg_ref->c_flags = 0; /* seki : for Q4 */
while( NULL != (item = opal_list_remove_first(&(msg_ref->msg_contents)) )
) {
HOKE_CONTENT_REF_RETURN(item);
@@ -917,6 +926,9 @@
msg_ref->tag = 0;
msg_ref->rank = 0;
msg_ref->comm = NULL;
+msg_ref->c_contextid = 0; /* seki : for Q4 */
+msg_ref->c_my_rank = 0; /* seki : for Q4 */
+msg_ref->c_flags = 0; /* seki : for Q4 */
while( NULL != (item = opal_list_remove_first(&(msg_ref->msg_contents)) )
) {
HOKE_CONTENT_REF_RETURN(item);
@@ -1019,6 +1031,8 @@
msg_ref->tag = v_tag; \
msg_ref->rank = v_rank; \
msg_ref->comm = v_comm; \
+msg_ref->c_contextid = v_comm->c_contextid; /* seki : for Q4 */\
+msg_ref->c_my_rank = v_comm->c_my_rank; /* seki : for Q4 */\
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
@@ -1047,6 +1061,8 @@
msg_ref->tag = v_tag; \
msg_ref->rank = v_rank; \
msg_ref->comm = v_comm; \
+msg_ref->c_contextid = v_comm->c_contextid; /* seki : for Q4 */\
+msg_ref->c_my_rank = v_comm->c_my_rank; /* seki : for Q4 */\
\
msg_ref->proc_name.jobid = p_jobid; \
msg_ref->proc_name.vpid = p_vpid; \
@@ -2313,7 +2329,10 @@
peer_ref, &(peer_ref->recv_init_list),
&new_msg_ref,
true,
- false);
+ false
+,(int)request->req_status.MPI_TAG /* seki for Q5 */
+,(int)request->req_status.MPI_SOURCE /* seki for Q5 */
+);
msg_ref = new_msg_ref;
}
}
@@ -2558,7 +2577,10 @@
peer_ref, &(peer_ref->irecv_list),
&new_msg_ref,
true,
- true);
+ true
+,(int)request->req_status.MPI_TAG /* seki : for Q5 */
+,(int)request->req_status.MPI_SOURCE /* seki : for Q5 */
+);
msg_ref = new_msg_ref;
}
}
@@ -2735,7 +2757,10 @@
peer_ref, &(peer_ref->recv_list),
&new_msg_ref,
false,
- true);
+ true
+,(int)status->MPI_TAG
+,(int)status->MPI_SOURCE
+);
new_msg_ref->done++;
new_msg_ref->active--;
} else {
@@ -3269,12 +3294,16 @@
opal_list_t * to_list,
ompi_crcp_bkmrk_pml_traffic_message_ref_t
**new_msg_ref,
bool keep_active,
- bool remove)
+ bool remove
+,int work_tag /* seki : for Q5 */
+,int work_rank /* seki : for Q5 */
+)
{
int ret, exit_status = ORTE_SUCCESS;
ompi_crcp_bkmrk_pml_message_content_ref_t *new_content = NULL,
*prev_content = NULL;
ompi_request_t *request = NULL;
bool loc_already_drained = false;
+int temp_tag,temp_rank; /* seki : for Q5 */
/* Append to the to_peer_ref */
if( COORD_MSG_TYPE_B_RECV != msg_type ) {
@@ -3289,13 +3318,15 @@
}
}
+temp_tag = (old_msg_ref->tag == MPI_ANY_TAG) ? work_tag :
old_msg_ref->tag; /* seki : for Q5 */
+temp_rank = (old_msg_ref->rank == MPI_ANY_SOURCE) ? work_rank :
old_msg_ref->rank; /* seki : for Q5 */
ret = traffic_message_append(to_peer_ref, to_list,
old_msg_ref->msg_type,
old_msg_ref->count,
NULL,
old_msg_ref->ddt_size,
- old_msg_ref->tag,
- old_msg_ref->rank,
+ temp_tag, /* seki : for Q5 */
+ temp_rank, /* seki : for Q5 */
old_msg_ref->comm,
new_msg_ref);
@@ -3713,7 +3744,7 @@
}
if(msg_ref->count == count &&
- (NULL != msg_ref->comm && msg_ref->comm->c_contextid == comm_id) &&
+(msg_ref->c_contextid == comm_id) && /* seki : for Q4 */
(msg_ref->tag == MPI_ANY_TAG || msg_ref->tag == tag) &&
(peer == INVALID_INT || msg_ref->rank == peer) &&
msg_ref->ddt_size == ddt_size) {
@@ -4036,6 +4067,7 @@
memcpy(&(content_ref->status), &drain_content_ref->status,
sizeof(ompi_status_public_t));
+#if 0 /* seki */
if( 0 != (ret = ompi_ddt_copy_content_same_ddt(drain_msg_ref->datatype,
drain_msg_ref->count,
content_ref->buffer,
@@ -4045,6 +4077,10 @@
ret);
exit_status = ret;
}
+#else
+/* seki : for Q2 : it is CONTIGUOUS message only! */
+memcpy((char *)content_ref->buffer,(char
*)drain_content_ref->buffer,drain_msg_ref->ddt_size);
+#endif
/* Remove the message from the list */
drain_content_ref->request = NULL;
@@ -4081,7 +4117,7 @@
}
/* The buffer could be NULL - More likely when doing a count=0 type of
message (e.g., Barrier) */
- if( OPAL_LIKELY(NULL != buf) ) {
+ if( OPAL_LIKELY(NULL != buf) && (count != 0) ) { /* seki : for Q10 */
if( 0 != (ret = ompi_ddt_copy_content_same_ddt(datatype, count,
(void*)buf,
drain_content_ref->buffer) ) ) {
opal_output( mca_crcp_bkmrk_component.super.output_handle,
@@ -4332,8 +4368,8 @@
}
/* Clear send_init_list */
- for(rm_item = opal_list_get_last(&peer_ref->send_list);
- rm_item != opal_list_get_begin(&peer_ref->send_list);
+ for(rm_item = opal_list_get_last(&peer_ref->send_init_list); /* seki
: for Q1 */
+ rm_item != opal_list_get_begin(&peer_ref->send_init_list); /* seki
: for Q1 */
rm_item = opal_list_get_prev(rm_item) ) {
msg_ref = (ompi_crcp_bkmrk_pml_traffic_message_ref_t*)rm_item;
msg_ref->matched = 0;
@@ -4370,8 +4406,8 @@
}
/* Clear recv_init_list */
- for(rm_item = opal_list_get_last(&peer_ref->recv_list);
- rm_item != opal_list_get_begin(&peer_ref->recv_list);
+ for(rm_item = opal_list_get_last(&peer_ref->recv_init_list); /* seki
: for Q1 */
+ rm_item != opal_list_get_begin(&peer_ref->recv_init_list); /* seki
: for Q1 */
rm_item = opal_list_get_prev(rm_item) ) {
msg_ref = (ompi_crcp_bkmrk_pml_traffic_message_ref_t*)rm_item;
msg_ref->matched = 0;
@@ -5523,9 +5559,9 @@
* - Communicator Context ID
* - My Rank in Communicator
*/
- comm_my_rank = ompi_comm_rank(msg_ref->comm);
+comm_my_rank = msg_ref->c_my_rank; /* seki : for Q4 */
- PACK_BUFFER(buffer, msg_ref->comm->c_contextid, 1, OPAL_UINT32,
+ PACK_BUFFER(buffer, msg_ref->c_contextid, 1, OPAL_UINT32, /* seki : for Q4
*/
"crcp:bkmrk: send_msg_details: Unable to pack communicator
ID");
PACK_BUFFER(buffer, comm_my_rank, 1, OPAL_INT,
"crcp:bkmrk: send_msg_details: Unable to pack comm rank ID");
@@ -5794,6 +5830,14 @@
return exit_status;
}
+/* seki : for Q3 This fix may be not correct. Moves min(n_l_u, d-m), floored at 0, from n_l_u into m; assigns to a variable work1 that must exist in the caller's scope; arguments are expanded unparenthesized and more than once. */
+#define WORK_GET_UNRES(n_l_u,d,m) \
+{ \
+work1 = (n_l_u < (d-m)) ? n_l_u : (d-m);\
+if (work1 < 0) { work1=0; } \
+ m += work1; \
+ n_l_u -= work1; \
+}
static int do_recv_msg_detail_check_drain(ompi_crcp_bkmrk_pml_peer_ref_t
*peer_ref,
int rank, uint32_t comm_id, int tag,
size_t count, size_t datatype_size,
@@ -5813,6 +5857,7 @@
int num_still_active = 0;
/* Number of drain messages posted */
int num_posted = 0;
+int work1; /* seki : for Q3 This fix may be not correct. */
*num_resolved = 0;
num_left_unresolved = p_num_sent;
@@ -5865,28 +5910,28 @@
* First pass: Count all 'done'
*/
if( NULL != posted_recv_msg_ref ) {
- posted_recv_msg_ref->matched += posted_recv_msg_ref->done;
- num_left_unresolved -= posted_recv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_recv_msg_ref->done,posted_recv_msg_ref->matched)
TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_recv_msg_ref, "Ck. Recv",
true));
}
if( NULL != posted_irecv_msg_ref ) {
- posted_irecv_msg_ref->matched += posted_irecv_msg_ref->done;
- num_left_unresolved -= posted_irecv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_irecv_msg_ref->done,posted_irecv_msg_ref->matched)
TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_irecv_msg_ref, "Ck. iRecv",
true));
}
if( NULL != posted_precv_msg_ref ) {
- posted_precv_msg_ref->matched += posted_precv_msg_ref->done;
- num_left_unresolved -= posted_precv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_precv_msg_ref->done,posted_precv_msg_ref->matched)
TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_precv_msg_ref, "Ck. pRecv",
true));
}
if( NULL != posted_unknown_recv_msg_ref ) {
- posted_unknown_recv_msg_ref->matched +=
posted_unknown_recv_msg_ref->done;
- num_left_unresolved -=
posted_unknown_recv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_unknown_recv_msg_ref->done,posted_unknown_recv_msg_ref->matched)
TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_unknown_recv_msg_ref, "Ck.
uRecv", true));
}
if( NULL != posted_unknown_precv_msg_ref ) {
- posted_unknown_precv_msg_ref->matched +=
posted_unknown_precv_msg_ref->done;
- num_left_unresolved -=
posted_unknown_precv_msg_ref->done;
+/* seki : for Q3 This fix may be not correct. */
+WORK_GET_UNRES(num_left_unresolved,posted_unknown_precv_msg_ref->done,posted_unknown_precv_msg_ref->matched)
TRAFFIC_MSG_DUMP_MSG_INDV(11, (posted_unknown_precv_msg_ref, "Ck.
upRecv", true));
}
--- crcp_bkmrk_pml.h_orig 2009-12-09 05:36:26.000000000 +0900
+++ crcp_bkmrk_pml.h 2010-03-23 10:24:42.531147723 +0900
@@ -222,6 +222,9 @@
/** Communicator pointer */
ompi_communicator_t* comm;
+uint32_t c_contextid; /* seki: for Q4 */
+int c_my_rank; /* seki: for Q4 */
+uint32_t c_flags; /* seki: for Q4 */
/** Message Contents */
opal_list_t msg_contents;
@@ -303,6 +306,9 @@
/** Communicator pointer */
ompi_communicator_t* comm;
+uint32_t c_contextid; /* seki: for Q4 */
+int c_my_rank; /* seki: for Q4 */
+uint32_t c_flags; /* seki: for Q4 */
/** Message Contents */
opal_list_t msg_contents;