Re: [OMPI devel] [PATCH v2 2/2] Trying to get the C/R code to compile again. (send_*_nb)

2013-12-19 Thread Adrian Reber
Thanks for the review. I am re-spinning the patches and sending the new
version in a few moments.

On Wed, Dec 18, 2013 at 06:56:47AM -0800, Ralph Castain wrote:
> In the case of the send, there really isn't any problem with just replacing 
> things - the non-blocking change won't impact anything, so no need to retain 
> the old code. People were only concerned about the recv's as those places 
> will require further repair, and they wanted to ensure we know where those 
> places are located.
> 
> You also need to change those comparisons, however, as the return code isn't 
> the number of bytes sent any more - it is just ORTE_SUCCESS or else an error 
> code, so you should be testing for ORTE_SUCCESS ==
> 
> 
> 
> 
> On Dec 18, 2013, at 6:42 AM, Adrian Reber  wrote:
> 
> > From: Adrian Reber 
> > 
> > This patch changes all send/send_buffer occurrences in the C/R code
> > to send_nb/send_buffer_nb.
> > The old code is still there but disabled using ifdefs (ENABLE_FT_FIXED).
> > The new code compiles but does not work.
> > 
> > Changes from V1:
> > * #ifdef out the code (so it is preserved for later re-design)
> > * marked the broken C/R code with ENABLE_FT_FIXED
> > 
> > Signed-off-by: Adrian Reber 
> > ---
> > ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c| 18 +++
> > orte/mca/errmgr/base/errmgr_base_tool.c |  4 ++
> > orte/mca/rml/ftrm/rml_ftrm.h| 19 
> > orte/mca/rml/ftrm/rml_ftrm_component.c  |  2 -
> > orte/mca/rml/ftrm/rml_ftrm_module.c | 63 
> > +
> > orte/mca/snapc/full/snapc_full_app.c| 20 
> > orte/mca/snapc/full/snapc_full_global.c | 12 +
> > orte/mca/snapc/full/snapc_full_local.c  |  4 ++
> > orte/mca/sstore/central/sstore_central_app.c|  8 
> > orte/mca/sstore/central/sstore_central_global.c |  4 ++
> > orte/mca/sstore/central/sstore_central_local.c  | 12 +
> > orte/mca/sstore/stage/sstore_stage_app.c|  8 
> > orte/mca/sstore/stage/sstore_stage_global.c |  4 ++
> > orte/mca/sstore/stage/sstore_stage_local.c  | 16 +++
> > orte/tools/orte-checkpoint/orte-checkpoint.c|  4 ++
> > orte/tools/orte-migrate/orte-migrate.c  |  4 ++
> > 16 files changed, 130 insertions(+), 72 deletions(-)
> > 
> > diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 
> > b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
> > index cba7586..4f7bd7f 100644
> > --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
> > +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
> > @@ -5102,7 +5102,11 @@ static int wait_quiesce_drained(void)
> > PACK_BUFFER(buffer, response, 1, OPAL_SIZE, "");
> > 
> > /* JJH - Performance Optimization? - Why not post all isends, 
> > then wait? */
> > +#ifdef ENABLE_FT_FIXED
> > +/* This is the old, now broken code */
> > if ( 0 > ( ret = 
> > ompi_rte_send_buffer(&(cur_peer_ref->proc_name), buffer, 
> > OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
> > +#endif /* ENABLE_FT_FIXED */
> > +if ( 0 > ( ret = 
> > ompi_rte_send_buffer_nb(&(cur_peer_ref->proc_name), buffer, 
> > OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
> > exit_status = ret;
> > goto cleanup;
> > }
> > @@ -5303,7 +5307,11 @@ static int send_bookmarks(int peer_idx)
> > PACK_BUFFER(buffer, (peer_ref->total_msgs_recvd), 1, OPAL_UINT32,
> > "crcp:bkmrk: send_bookmarks: Unable to pack 
> > total_msgs_recvd");
> > 
> > +#ifdef ENABLE_FT_FIXED
> > +/* This is the old, now broken code */
> > if ( 0 > ( ret = ompi_rte_send_buffer(_name, buffer, 
> > OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
> > +#endif /* ENABLE_FT_FIXED */
> > +if ( 0 > ( ret = ompi_rte_send_buffer_nb(_name, buffer, 
> > OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
> > opal_output(mca_crcp_bkmrk_component.super.output_handle,
> > "crcp:bkmrk: send_bookmarks: Failed to send bookmark to 
> > peer %s: Return %d\n",
> > OMPI_NAME_PRINT(_name),
> > @@ -5599,8 +5607,13 @@ static int 
> > do_send_msg_detail(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
> > /*
> >  * Do the send...
> >  */
> > +#ifdef ENABLE_FT_FIXED
> > +/* This is the old, now broken code */
> > if ( 0 > ( ret = ompi_rte_send_buffer(_ref->proc_name, buffer,
> >   OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) 
> > ) {
> > +#endif /* ENABLE_FT_FIXED */
> > +if ( 0 > ( ret = ompi_rte_send_buffer_nb(_ref->proc_name, buffer,
> > +  OMPI_CRCP_COORD_BOOKMARK_TAG, 
> > orte_rml_send_callback, NULL)) ) {
> > opal_output(mca_crcp_bkmrk_component.super.output_handle,
> > "crcp:bkmrk: do_send_msg_detail: Unable to send message 
> > details to peer %s: Return %d\n",
> >  

Re: [OMPI devel] [PATCH v2 2/2] Trying to get the C/R code to compile again. (send_*_nb)

2013-12-18 Thread Ralph Castain
In the case of the send, there really isn't any problem with just replacing 
things - the non-blocking change won't impact anything, so no need to retain 
the old code. People were only concerned about the recv's as those places will 
require further repair, and they wanted to ensure we know where those places 
are located.

You also need to change those comparisons, however, as the return code isn't 
the number of bytes sent any more - it is just ORTE_SUCCESS or else an error 
code, so you should be testing for ORTE_SUCCESS ==




On Dec 18, 2013, at 6:42 AM, Adrian Reber  wrote:

> From: Adrian Reber 
> 
> This patch changes all send/send_buffer occurrences in the C/R code
> to send_nb/send_buffer_nb.
> The old code is still there but disabled using ifdefs (ENABLE_FT_FIXED).
> The new code compiles but does not work.
> 
> Changes from V1:
> * #ifdef out the code (so it is preserved for later re-design)
> * marked the broken C/R code with ENABLE_FT_FIXED
> 
> Signed-off-by: Adrian Reber 
> ---
> ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c| 18 +++
> orte/mca/errmgr/base/errmgr_base_tool.c |  4 ++
> orte/mca/rml/ftrm/rml_ftrm.h| 19 
> orte/mca/rml/ftrm/rml_ftrm_component.c  |  2 -
> orte/mca/rml/ftrm/rml_ftrm_module.c | 63 +
> orte/mca/snapc/full/snapc_full_app.c| 20 
> orte/mca/snapc/full/snapc_full_global.c | 12 +
> orte/mca/snapc/full/snapc_full_local.c  |  4 ++
> orte/mca/sstore/central/sstore_central_app.c|  8 
> orte/mca/sstore/central/sstore_central_global.c |  4 ++
> orte/mca/sstore/central/sstore_central_local.c  | 12 +
> orte/mca/sstore/stage/sstore_stage_app.c|  8 
> orte/mca/sstore/stage/sstore_stage_global.c |  4 ++
> orte/mca/sstore/stage/sstore_stage_local.c  | 16 +++
> orte/tools/orte-checkpoint/orte-checkpoint.c|  4 ++
> orte/tools/orte-migrate/orte-migrate.c  |  4 ++
> 16 files changed, 130 insertions(+), 72 deletions(-)
> 
> diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 
> b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
> index cba7586..4f7bd7f 100644
> --- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
> +++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
> @@ -5102,7 +5102,11 @@ static int wait_quiesce_drained(void)
> PACK_BUFFER(buffer, response, 1, OPAL_SIZE, "");
> 
> /* JJH - Performance Optimization? - Why not post all isends, 
> then wait? */
> +#ifdef ENABLE_FT_FIXED
> +/* This is the old, now broken code */
> if ( 0 > ( ret = ompi_rte_send_buffer(&(cur_peer_ref->proc_name), 
> buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
> +#endif /* ENABLE_FT_FIXED */
> +if ( 0 > ( ret = 
> ompi_rte_send_buffer_nb(&(cur_peer_ref->proc_name), buffer, 
> OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
> exit_status = ret;
> goto cleanup;
> }
> @@ -5303,7 +5307,11 @@ static int send_bookmarks(int peer_idx)
> PACK_BUFFER(buffer, (peer_ref->total_msgs_recvd), 1, OPAL_UINT32,
> "crcp:bkmrk: send_bookmarks: Unable to pack 
> total_msgs_recvd");
> 
> +#ifdef ENABLE_FT_FIXED
> +/* This is the old, now broken code */
> if ( 0 > ( ret = ompi_rte_send_buffer(_name, buffer, 
> OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
> +#endif /* ENABLE_FT_FIXED */
> +if ( 0 > ( ret = ompi_rte_send_buffer_nb(_name, buffer, 
> OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
> opal_output(mca_crcp_bkmrk_component.super.output_handle,
> "crcp:bkmrk: send_bookmarks: Failed to send bookmark to 
> peer %s: Return %d\n",
> OMPI_NAME_PRINT(_name),
> @@ -5599,8 +5607,13 @@ static int 
> do_send_msg_detail(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
> /*
>  * Do the send...
>  */
> +#ifdef ENABLE_FT_FIXED
> +/* This is the old, now broken code */
> if ( 0 > ( ret = ompi_rte_send_buffer(_ref->proc_name, buffer,
>   OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) 
> {
> +#endif /* ENABLE_FT_FIXED */
> +if ( 0 > ( ret = ompi_rte_send_buffer_nb(_ref->proc_name, buffer,
> +  OMPI_CRCP_COORD_BOOKMARK_TAG, 
> orte_rml_send_callback, NULL)) ) {
> opal_output(mca_crcp_bkmrk_component.super.output_handle,
> "crcp:bkmrk: do_send_msg_detail: Unable to send message 
> details to peer %s: Return %d\n",
> OMPI_NAME_PRINT(_ref->proc_name),
> @@ -6217,8 +6230,13 @@ static int 
> do_recv_msg_detail_resp(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
> "crcp:bkmrk: recv_msg_details: Unable to ask peer for more 
> messages");
> PACK_BUFFER(buffer, total_found, 1, OPAL_UINT32,
> "crcp:bkmrk: recv_msg_details: Unable to ask peer for more 
> 

[OMPI devel] [PATCH v2 2/2] Trying to get the C/R code to compile again. (send_*_nb)

2013-12-18 Thread Adrian Reber
From: Adrian Reber 

This patch changes all send/send_buffer occurrences in the C/R code
to send_nb/send_buffer_nb.
The old code is still there but disabled using ifdefs (ENABLE_FT_FIXED).
The new code compiles but does not work.

Changes from V1:
* #ifdef out the code (so it is preserved for later re-design)
* marked the broken C/R code with ENABLE_FT_FIXED

Signed-off-by: Adrian Reber 
---
 ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c| 18 +++
 orte/mca/errmgr/base/errmgr_base_tool.c |  4 ++
 orte/mca/rml/ftrm/rml_ftrm.h| 19 
 orte/mca/rml/ftrm/rml_ftrm_component.c  |  2 -
 orte/mca/rml/ftrm/rml_ftrm_module.c | 63 +
 orte/mca/snapc/full/snapc_full_app.c| 20 
 orte/mca/snapc/full/snapc_full_global.c | 12 +
 orte/mca/snapc/full/snapc_full_local.c  |  4 ++
 orte/mca/sstore/central/sstore_central_app.c|  8 
 orte/mca/sstore/central/sstore_central_global.c |  4 ++
 orte/mca/sstore/central/sstore_central_local.c  | 12 +
 orte/mca/sstore/stage/sstore_stage_app.c|  8 
 orte/mca/sstore/stage/sstore_stage_global.c |  4 ++
 orte/mca/sstore/stage/sstore_stage_local.c  | 16 +++
 orte/tools/orte-checkpoint/orte-checkpoint.c|  4 ++
 orte/tools/orte-migrate/orte-migrate.c  |  4 ++
 16 files changed, 130 insertions(+), 72 deletions(-)

diff --git a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c 
b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
index cba7586..4f7bd7f 100644
--- a/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
+++ b/ompi/mca/crcp/bkmrk/crcp_bkmrk_pml.c
@@ -5102,7 +5102,11 @@ static int wait_quiesce_drained(void)
 PACK_BUFFER(buffer, response, 1, OPAL_SIZE, "");

 /* JJH - Performance Optimization? - Why not post all isends, then 
wait? */
+#ifdef ENABLE_FT_FIXED
+/* This is the old, now broken code */
 if ( 0 > ( ret = ompi_rte_send_buffer(&(cur_peer_ref->proc_name), 
buffer, OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
+#endif /* ENABLE_FT_FIXED */
+if ( 0 > ( ret = 
ompi_rte_send_buffer_nb(&(cur_peer_ref->proc_name), buffer, 
OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
 exit_status = ret;
 goto cleanup;
 }
@@ -5303,7 +5307,11 @@ static int send_bookmarks(int peer_idx)
 PACK_BUFFER(buffer, (peer_ref->total_msgs_recvd), 1, OPAL_UINT32,
 "crcp:bkmrk: send_bookmarks: Unable to pack total_msgs_recvd");

+#ifdef ENABLE_FT_FIXED
+/* This is the old, now broken code */
 if ( 0 > ( ret = ompi_rte_send_buffer(_name, buffer, 
OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
+#endif /* ENABLE_FT_FIXED */
+if ( 0 > ( ret = ompi_rte_send_buffer_nb(_name, buffer, 
OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
 opal_output(mca_crcp_bkmrk_component.super.output_handle,
 "crcp:bkmrk: send_bookmarks: Failed to send bookmark to 
peer %s: Return %d\n",
 OMPI_NAME_PRINT(_name),
@@ -5599,8 +5607,13 @@ static int 
do_send_msg_detail(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
 /*
  * Do the send...
  */
+#ifdef ENABLE_FT_FIXED
+/* This is the old, now broken code */
 if ( 0 > ( ret = ompi_rte_send_buffer(_ref->proc_name, buffer,
   OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
+#endif /* ENABLE_FT_FIXED */
+if ( 0 > ( ret = ompi_rte_send_buffer_nb(_ref->proc_name, buffer,
+  OMPI_CRCP_COORD_BOOKMARK_TAG, 
orte_rml_send_callback, NULL)) ) {
 opal_output(mca_crcp_bkmrk_component.super.output_handle,
 "crcp:bkmrk: do_send_msg_detail: Unable to send message 
details to peer %s: Return %d\n",
 OMPI_NAME_PRINT(_ref->proc_name),
@@ -6217,8 +6230,13 @@ static int 
do_recv_msg_detail_resp(ompi_crcp_bkmrk_pml_peer_ref_t *peer_ref,
 "crcp:bkmrk: recv_msg_details: Unable to ask peer for more 
messages");
 PACK_BUFFER(buffer, total_found, 1, OPAL_UINT32,
 "crcp:bkmrk: recv_msg_details: Unable to ask peer for more 
messages");
+#ifdef ENABLE_FT_FIXED
+/* This is the old, now broken code */

 if ( 0 > ( ret = ompi_rte_send_buffer(_ref->proc_name, buffer, 
OMPI_CRCP_COORD_BOOKMARK_TAG, 0)) ) {
+#endif /* ENABLE_FT_FIXED */
+
+if ( 0 > ( ret = ompi_rte_send_buffer_nb(_ref->proc_name, buffer, 
OMPI_CRCP_COORD_BOOKMARK_TAG, orte_rml_send_callback, NULL)) ) {
 opal_output(mca_crcp_bkmrk_component.super.output_handle,
 "crcp:bkmrk: recv_msg_detail_resp: Unable to send message 
detail response to peer %s: Return %d\n",
 OMPI_NAME_PRINT(_ref->proc_name),
diff --git a/orte/mca/errmgr/base/errmgr_base_tool.c 
b/orte/mca/errmgr/base/errmgr_base_tool.c
index b982e46..e274bae 100644
---