Hefty, Sean wrote: > committed to trunk - needs to be added to winof 2.2
Hello, w.r.t. WinOF 2.2, 1) what larger set of application problems does this patch address? 2) what type/degree of testing has been successfully passed with this patch applied? Stan. > >> -----Original Message----- >> From: [email protected] [mailto:ofw- >> [email protected]] On Behalf Of Sean Hefty >> Sent: Thursday, December 17, 2009 2:53 PM >> To: [email protected] >> Subject: [ofw] [PATCH] ib/cm: fix handling failed send completions >> >> __cep_mad_send_cb() assumes that the mad being processed is >> associated with the current state of the CEP. This may not be >> the case. >> >> For example, for a short lived connection, it was observed that >> a REP mad completed with status canceled. This is normal. However, >> the user already attempted to disconnect the connection by sending >> a DREQ. This left the cep in the DREQ_SENT state by the time that >> the REP mad completed. Since the REP status was not success, but the >> state was DREQ_SENT, the code assumed that the DREQ had failed and >> transitioned the cep into TIMEWAIT. The result is that the DREQ is >> never matched with a DREP or canceled, but holds a reference on the >> CEP. >> >> Until the DREQ times out (time depends on connection, but easily >> up to a minute), attempts to destroy the CEP are blocked. >> >> Fix this by simply discarding any completed sends that were not >> sent from the current state of the cep when the completion handler >> is invoked. >> >> Signed-off-by: Sean Hefty <[email protected]> >> --- >> trunk/core/al/kernel/al_cm.c | 2 - >> trunk/core/al/kernel/al_cm_cep.c | 140 >> +++++++++++++++----------------------- 2 files changed, 56 >> insertions(+), 86 deletions(-) >> >> diff --git a/trunk/core/al/kernel/al_cm.c >> b/trunk/core/al/kernel/al_cm.c >> index 48b0cb5..955985a 100644 >> --- a/trunk/core/al/kernel/al_cm.c >> +++ b/trunk/core/al/kernel/al_cm.c >> @@ -37,7 +37,7 @@ >> typedef struct _iba_cm_id_priv >> { >> iba_cm_id id; >> - KEVENT destroy_event; >> + KEVENT destroy_event; >> >> } iba_cm_id_priv; >> >> diff --git a/trunk/core/al/kernel/al_cm_cep.c >> b/trunk/core/al/kernel/al_cm_cep.c >> index 49fa417..4749e1d 100644 >> --- a/trunk/core/al/kernel/al_cm_cep.c >> +++ b/trunk/core/al/kernel/al_cm_cep.c >> @@ -27,7 +27,7 @@ >> * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE >> * SOFTWARE. >> * >> - * $Id$ >> + * $Id: al_cm_cep.c 2540 2009-11-03 17:23:09Z shefty $ */ >> >> >> @@ -2213,105 +2213,79 @@ __cep_mad_send_cb( >> >> p_cep = (kcep_t*)p_mad->context1; >> >> - /* >> - * The connection context is not set when performing immediate >> responses, >> - * such as repeating MADS. >> - */ >> - if( !p_cep ) >> + /* The cep context is only set for MADs that are retried. */ + if( >> !p_cep ) { >> ib_put_mad( p_mad ); >> AL_EXIT( AL_DBG_CM ); >> return; >> } >> >> + CL_ASSERT( p_mad->status != IB_WCS_SUCCESS ); >> p_mad->context1 = NULL; >> >> KeAcquireInStackQueuedSpinLockAtDpcLevel( &gp_cep_mgr->lock, &hdl ); >> - /* Clear the sent MAD pointer so that we don't try cancelling >> again. */ >> - if( p_cep->p_send_mad == p_mad ) >> - p_cep->p_send_mad = NULL; >> - >> - switch( p_mad->status ) >> + if( p_cep->p_send_mad != p_mad ) >> { >> - case IB_WCS_SUCCESS: >> KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl ); >> ib_put_mad( >> p_mad ); >> - break; >> - >> - case IB_WCS_CANCELED: >> - if( p_cep->state != CEP_STATE_REQ_SENT && >> - p_cep->state != CEP_STATE_REQ_MRA_RCVD && >> - p_cep->state != CEP_STATE_REP_SENT && >> - p_cep->state != CEP_STATE_REP_MRA_RCVD && >> - p_cep->state != CEP_STATE_LAP_SENT && >> - p_cep->state != CEP_STATE_LAP_MRA_RCVD && >> - p_cep->state != CEP_STATE_DREQ_SENT && >> - p_cep->state != CEP_STATE_SREQ_SENT ) >> - { >> - KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl ); >> - ib_put_mad( p_mad ); >> - break; >> - } >> - /* Treat as a timeout so we don't stall the state machine. */ >> - p_mad->status = IB_WCS_TIMEOUT_RETRY_ERR; >> - >> - /* Fall through. */ >> - case IB_WCS_TIMEOUT_RETRY_ERR: >> - default: >> - /* Timeout. Reject the connection. */ >> - switch( p_cep->state ) >> - { >> - case CEP_STATE_REQ_SENT: >> - case CEP_STATE_REQ_MRA_RCVD: >> - case CEP_STATE_REP_SENT: >> - case CEP_STATE_REP_MRA_RCVD: >> - /* Send the REJ. */ >> - __reject_timeout( p_port_cep, p_cep, p_mad ); >> - __remove_cep( p_cep ); >> - p_cep->state = CEP_STATE_IDLE; >> - break; >> - >> - case CEP_STATE_DREQ_DESTROY: >> - p_cep->state = CEP_STATE_DESTROY; >> - __insert_timewait( p_cep ); >> - /* Fall through. */ >> + goto done; >> + } >> >> - case CEP_STATE_DESTROY: >> - KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl ); >> - ib_put_mad( p_mad ); >> - goto done; >> + /* Clear the sent MAD pointer so that we don't try cancelling >> again. */ + p_cep->p_send_mad = NULL; >> >> - case CEP_STATE_DREQ_SENT: >> - /* >> - * Make up a DREP mad so we can respond if we receive >> - * a DREQ while in timewait. >> - */ >> - __format_mad_hdr( &p_cep->mads.drep.hdr, p_cep, >> CM_DREP_ATTR_ID ); >> - __format_drep( p_cep, NULL, 0, &p_cep->mads.drep ); >> - p_cep->state = CEP_STATE_TIMEWAIT; >> - __insert_timewait( p_cep ); >> - break; >> + switch( p_cep->state ) >> + { >> + case CEP_STATE_REQ_SENT: >> + case CEP_STATE_REQ_MRA_RCVD: >> + case CEP_STATE_REP_SENT: >> + case CEP_STATE_REP_MRA_RCVD: >> + /* Send the REJ. */ >> + __reject_timeout( p_port_cep, p_cep, p_mad ); >> + __remove_cep( p_cep ); >> + p_cep->state = CEP_STATE_IDLE; >> + break; >> >> - case CEP_STATE_LAP_SENT: >> - /* >> - * Before CEP was sent, we have been in >> CEP_STATE_ESTABLISHED >> as we >> - * failed to send, we return to that state. >> - */ >> - p_cep->state = CEP_STATE_ESTABLISHED; >> - break; >> - default: >> - break; >> - } >> + case CEP_STATE_DREQ_DESTROY: >> + p_cep->state = CEP_STATE_DESTROY; >> + __insert_timewait( p_cep ); >> + /* Fall through. */ >> >> - status = __cep_queue_mad( p_cep, p_mad ); >> - CL_ASSERT( status != IB_INVALID_STATE ); >> + case CEP_STATE_DESTROY: >> KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl ); + >> ib_put_mad( >> p_mad ); + goto done; >> >> - if( status == IB_SUCCESS ) >> - __process_cep( p_cep ); >> + case CEP_STATE_DREQ_SENT: >> + /* >> + * Make up a DREP mad so we can respond if we receive >> + * a DREQ while in timewait. >> + */ >> + __format_mad_hdr( &p_cep->mads.drep.hdr, p_cep, CM_DREP_ATTR_ID >> ); >> + __format_drep( p_cep, NULL, 0, &p_cep->mads.drep ); >> + p_cep->state = CEP_STATE_TIMEWAIT; >> + __insert_timewait( p_cep ); >> + break; >> + >> + case CEP_STATE_LAP_SENT: >> + /* >> + * Before CEP was sent, we have been in CEP_STATE_ESTABLISHED as >> we + * failed to send, we return to that state. >> + */ >> + p_cep->state = CEP_STATE_ESTABLISHED; >> + break; >> + default: >> break; >> } >> >> + status = __cep_queue_mad( p_cep, p_mad ); >> + CL_ASSERT( status != IB_INVALID_STATE ); >> + KeReleaseInStackQueuedSpinLockFromDpcLevel( &hdl ); + >> + if( status == IB_SUCCESS ) >> + __process_cep( p_cep ); >> + >> done: >> pfn_destroy_cb = p_cep->pfn_destroy_cb; >> cep_context = p_cep->context; >> @@ -3938,12 +3912,8 @@ __cleanup_cep( >> CL_ASSERT( KeGetCurrentIrql() == DISPATCH_LEVEL ); >> >> /* If we've already come through here, we're done. */ >> - if( p_cep->state == CEP_STATE_DESTROY || >> - p_cep->state == CEP_STATE_DREQ_DESTROY ) >> - { >> - AL_EXIT( AL_DBG_CM ); >> - return -1; >> - } >> + CL_ASSERT( p_cep->state != CEP_STATE_DESTROY && >> + p_cep->state != CEP_STATE_DREQ_DESTROY ); >> >> /* Cleanup the pending MAD list. */ >> while( p_cep->p_mad_head ) _______________________________________________ ofw mailing list [email protected] http://lists.openfabrics.org/cgi-bin/mailman/listinfo/ofw
