Re: [Open-FCoE] [RFC PATCH] libfc: Fix remote port restart problem

Bhanu Gollapudi Mon, 17 May 2010 23:54:44 -0700

Joe,

Changes look good to me, however I have a comment.  Please read inline.


Thanks,
Bhanu

On Mon, 2010-05-17 at 16:38 -0700, Joe Eykholt wrote:
> This patch somewhat combines two fixes to remote port handing in libfc.
> 
> The first problem was that rport work could be queued on a deleted
> and freed rport.  This is handled by not resetting rdata->event
> ton NONE if the rdata is about to be deleted.
> 
> However, that fix led to the second problem, described by
> Bhanu Gollapudi, as follows:
> > Here is the sequence of events. T1 is first LOGO receive thread, T2 is
> > fc_rport_work() scheduled by T1 and T3 is second LOGO receive thread and
> > T4 is fc_rport_work scheduled by T3.
> >
> > 1. (T1)Received 1st LOGO in state Ready
> > 2. (T1)Delete port & enter to RESTART state.
> > 3. (T1)schdule event_work, since event is RPORT_EV_NONE.
> > 4. (T1)set event = RPORT_EV_LOGO
> > 5. (T1)Enter RESTART state as disc_id is set.
> > 6. (T2)remember to PLOGI, and set event = RPORT_EV_NONE
> > 6. (T3)Received 2nd LOGO
> > 7. (T3)Delete Port & enter to RESTART state.
> > 8. (T3)schedule event_work, since event is RPORT_EV_NONE.
> > 9. (T3)Enter RESTART state as disc_id is set.
> > 9. (T3)set event = RPORT_EV_LOGO
> > 10.(T2)work restart, enter PLOGI state and issues PLOGI
> > 11.(T4)Since state is not RESTART anymore, restart is not set, and the
> > event is not reset to RPORT_EV_NONE. (current event is RPORT_EV_LOGO).
> > 12. Now, PLOGI succeeds and fc_rport_enter_ready() will not schedule
> > event_work, and hence the rport will never be created, eventually losing
> > the target after dev_loss_tmo.
> 
> So, the problem here is that we were tracking the desire for
> the rport be restarted by state RESTART, which was otherwise
> equivalent to DELETE.  A contributing factor is that we dropped
> the lock between steps 6 and 10 in thread T2, which allows the
> state to change, and we didn't completely re-evaluate then.
> 
> This is hopefully corrected by the following minor redesign:
> 
> Simplify the rport restart logic by making the decision to
> restart after deleting the transport rport.  That decision
> is based on a new STARTED flag that indicates fc_rport_login()
> has been called and fc_rport_logoff() has not been called
> since then.  This replaces the need for the RESTART state.
> 
> Only restart if the rdata is still in DELETED state
> and only if it still has the STARTED flag set.
> 
> Also now, since we clear the event code much later in the
> work thread, allow for the possibility that the rport may
> have become READY again via incoming PLOGI, and if so,
> queue another event to handle that.
> 
> In the problem scenario, the second LOGO received will
> cause the LOGO event to occur again.
> 
> Reported-by: Bhanu Gollapudi <[email protected]>
> Signed-off-by: Joe Eykholt <[email protected]>
> ---
>  drivers/scsi/libfc/fc_rport.c |   80 
> +++++++++++++++++------------------------
>  include/scsi/libfc.h          |    5 +--
>  2 files changed, 35 insertions(+), 50 deletions(-)
> 
> 
> diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c
> index 39e440f..5a00a1a 100644
> --- a/drivers/scsi/libfc/fc_rport.c
> +++ b/drivers/scsi/libfc/fc_rport.c
> @@ -89,7 +89,6 @@ static const char *fc_rport_state_names[] = {
>       [RPORT_ST_LOGO] = "LOGO",
>       [RPORT_ST_ADISC] = "ADISC",
>       [RPORT_ST_DELETE] = "Delete",
> -     [RPORT_ST_RESTART] = "Restart",
>  };
>  
>  /**
> @@ -246,7 +245,6 @@ static void fc_rport_work(struct work_struct *work)
>       struct fc_rport_operations *rport_ops;
>       struct fc_rport_identifiers ids;
>       struct fc_rport *rport;
> -     int restart = 0;
>  
>       mutex_lock(&rdata->rp_mutex);
>       event = rdata->event;
> @@ -296,26 +294,9 @@ static void fc_rport_work(struct work_struct *work)
>       case RPORT_EV_LOGO:
>       case RPORT_EV_STOP:
>               port_id = rdata->ids.port_id;
> +             rdata->rport = NULL;

Can you please have this assignment after the callback? That way, it
will help the LLDD specific callback routine to access the driver
specific data structures from fc_rport_libfc_priv structure from
rdata->rport.

>               mutex_unlock(&rdata->rp_mutex);
>  
> -             if (port_id != FC_FID_DIR_SERV) {
> -                     /*
> -                      * We must drop rp_mutex before taking disc_mutex.
> -                      * Re-evaluate state to allow for restart.
> -                      * A transition to RESTART state must only happen
> -                      * while disc_mutex is held and rdata is on the list.
> -                      */
> -                     mutex_lock(&lport->disc.disc_mutex);
> -                     mutex_lock(&rdata->rp_mutex);
> -                     if (rdata->rp_state == RPORT_ST_RESTART)
> -                             restart = 1;
> -                     else
> -                             list_del(&rdata->peers);
> -                     rdata->event = RPORT_EV_NONE;
> -                     mutex_unlock(&rdata->rp_mutex);
> -                     mutex_unlock(&lport->disc.disc_mutex);
> -             }
> -
>               if (rport_ops && rport_ops->event_callback) {
>                       FC_RPORT_DBG(rdata, "callback ev %d\n", event);
>                       rport_ops->event_callback(lport, rdata, event);
> @@ -331,18 +312,36 @@ static void fc_rport_work(struct work_struct *work)
>               if (rport) {

I propose to have "rdata->rport = NULL" here, which ofcourse should be
called with rp_mutex held.

>                       rpriv = rport->dd_data;
>                       rpriv->rp_state = RPORT_ST_DELETE;
> -                     mutex_lock(&rdata->rp_mutex);
> -                     rdata->rport = NULL;
> -                     mutex_unlock(&rdata->rp_mutex);
>                       fc_remote_port_delete(rport);
>               }
> -             if (restart) {
> -                     mutex_lock(&rdata->rp_mutex);
> -                     FC_RPORT_DBG(rdata, "work restart\n");
> -                     fc_rport_enter_plogi(rdata);
> +
> +             mutex_lock(&lport->disc.disc_mutex);
> +             mutex_lock(&rdata->rp_mutex);
> +             if (rdata->rp_state == RPORT_ST_DELETE) {
> +                     if (port_id == FC_FID_DIR_SERV) {
> +                             rdata->event = RPORT_EV_NONE;
> +                             mutex_unlock(&rdata->rp_mutex);
> +                     } else if (rdata->flags & FC_RP_STARTED) {
> +                             rdata->event = RPORT_EV_NONE;
> +                             FC_RPORT_DBG(rdata, "work restart\n");
> +                             fc_rport_enter_plogi(rdata);
> +                             mutex_unlock(&rdata->rp_mutex);
> +                     } else {
> +                             FC_RPORT_DBG(rdata, "work delete\n");
> +                             list_del(&rdata->peers);
> +                             mutex_unlock(&rdata->rp_mutex);
> +                             kref_put(&rdata->kref, lport->tt.rport_destroy);
> +                     }
> +             } else {
> +                     /*
> +                      * Re-open for events.  Reissue READY event if ready.
> +                      */
> +                     rdata->event = RPORT_EV_NONE;
> +                     if (rdata->rp_state == RPORT_ST_READY)
> +                             fc_rport_enter_ready(rdata);
>                       mutex_unlock(&rdata->rp_mutex);
> -             } else
> -                     kref_put(&rdata->kref, lport->tt.rport_destroy);
> +             }
> +             mutex_unlock(&lport->disc.disc_mutex);
>               break;
>  
>       default:
> @@ -367,16 +366,14 @@ int fc_rport_login(struct fc_rport_priv *rdata)
>  {
>       mutex_lock(&rdata->rp_mutex);
>  
> +     rdata->flags |= FC_RP_STARTED;
>       switch (rdata->rp_state) {
>       case RPORT_ST_READY:
>               FC_RPORT_DBG(rdata, "ADISC port\n");
>               fc_rport_enter_adisc(rdata);
>               break;
> -     case RPORT_ST_RESTART:
> -             break;
>       case RPORT_ST_DELETE:
>               FC_RPORT_DBG(rdata, "Restart deleted port\n");
> -             fc_rport_state_enter(rdata, RPORT_ST_RESTART);
>               break;
>       default:
>               FC_RPORT_DBG(rdata, "Login to port\n");
> @@ -431,15 +428,12 @@ int fc_rport_logoff(struct fc_rport_priv *rdata)
>  
>       FC_RPORT_DBG(rdata, "Remove port\n");
>  
> +     rdata->flags &= ~FC_RP_STARTED;
>       if (rdata->rp_state == RPORT_ST_DELETE) {
>               FC_RPORT_DBG(rdata, "Port in Delete state, not removing\n");
>               goto out;
>       }
> -
> -     if (rdata->rp_state == RPORT_ST_RESTART)
> -             FC_RPORT_DBG(rdata, "Port in Restart state, deleting\n");
> -     else
> -             fc_rport_enter_logo(rdata);
> +     fc_rport_enter_logo(rdata);
>  
>       /*
>        * Change the state to Delete so that we discard
> @@ -503,7 +497,6 @@ static void fc_rport_timeout(struct work_struct *work)
>       case RPORT_ST_READY:
>       case RPORT_ST_INIT:
>       case RPORT_ST_DELETE:
> -     case RPORT_ST_RESTART:
>               break;
>       }
>  
> @@ -527,6 +520,7 @@ static void fc_rport_error(struct fc_rport_priv *rdata, 
> struct fc_frame *fp)
>       switch (rdata->rp_state) {
>       case RPORT_ST_PLOGI:
>       case RPORT_ST_LOGO:
> +             rdata->flags &= ~FC_RP_STARTED;
>               fc_rport_enter_delete(rdata, RPORT_EV_FAILED);
>               break;
>       case RPORT_ST_RTV:
> @@ -537,7 +531,6 @@ static void fc_rport_error(struct fc_rport_priv *rdata, 
> struct fc_frame *fp)
>               fc_rport_enter_logo(rdata);
>               break;
>       case RPORT_ST_DELETE:
> -     case RPORT_ST_RESTART:
>       case RPORT_ST_READY:
>       case RPORT_ST_INIT:
>               break;
> @@ -739,6 +732,7 @@ static void fc_rport_prli_resp(struct fc_seq *sp, struct 
> fc_frame *fp,
>  
>       } else {
>               FC_RPORT_DBG(rdata, "Bad ELS response for PRLI command\n");
> +             rdata->flags &= ~FC_RP_STARTED;
>               fc_rport_enter_delete(rdata, RPORT_EV_FAILED);
>       }
>  
> @@ -1377,7 +1371,6 @@ static void fc_rport_recv_plogi_req(struct fc_lport 
> *lport,
>               break;
>       case RPORT_ST_DELETE:
>       case RPORT_ST_LOGO:
> -     case RPORT_ST_RESTART:
>               FC_RPORT_DBG(rdata, "Received PLOGI in state %s - send busy\n",
>                            fc_rport_state(rdata));
>               mutex_unlock(&rdata->rp_mutex);
> @@ -1614,13 +1607,6 @@ static void fc_rport_recv_logo_req(struct fc_lport 
> *lport,
>                            fc_rport_state(rdata));
>  
>               fc_rport_enter_delete(rdata, RPORT_EV_LOGO);
> -
> -             /*
> -              * If the remote port was created due to discovery, set state
> -              * to log back in.  It may have seen a stale RSCN about us.
> -              */
> -             if (rdata->disc_id)
> -                     fc_rport_state_enter(rdata, RPORT_ST_RESTART);
>               mutex_unlock(&rdata->rp_mutex);
>       } else
>               FC_RPORT_ID_DBG(lport, sid,
> diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h
> index 7495c0b..db54c4a 100644
> --- a/include/scsi/libfc.h
> +++ b/include/scsi/libfc.h
> @@ -104,7 +104,6 @@ enum fc_disc_event {
>   * @RPORT_ST_LOGO:    Remote port logout (LOGO) sent
>   * @RPORT_ST_ADISC:   Discover Address sent
>   * @RPORT_ST_DELETE:  Remote port being deleted
> - * @RPORT_ST_RESTART: Remote port being deleted and will restart
>  */
>  enum fc_rport_state {
>       RPORT_ST_INIT,
> @@ -115,7 +114,6 @@ enum fc_rport_state {
>       RPORT_ST_LOGO,
>       RPORT_ST_ADISC,
>       RPORT_ST_DELETE,
> -     RPORT_ST_RESTART,
>  };
>  
>  /**
> @@ -173,6 +171,7 @@ struct fc_rport_libfc_priv {
>       u16                        flags;
>       #define FC_RP_FLAGS_REC_SUPPORTED       (1 << 0)
>       #define FC_RP_FLAGS_RETRY               (1 << 1)
> +     #define FC_RP_STARTED                   (1 << 2)
>       unsigned int               e_d_tov;
>       unsigned int               r_a_tov;
>  };
> @@ -185,7 +184,7 @@ struct fc_rport_libfc_priv {
>   * @rp_state:       Enumeration that tracks progress of PLOGI, PRLI,
>   *                  and RTV exchanges
>   * @ids:            The remote port identifiers and roles
> - * @flags:          REC and RETRY supported flags
> + * @flags:          STARTED, REC and RETRY_SUPPORTED flags
>   * @max_seq:        Maximum number of concurrent sequences
>   * @disc_id:        The discovery identifier
>   * @maxframe_size:  The maximum frame size
> 
> 
> _______________________________________________
> devel mailing list
> [email protected]
> http://www.open-fcoe.org/mailman/listinfo/devel
> 



_______________________________________________
devel mailing list
[email protected]
http://www.open-fcoe.org/mailman/listinfo/devel

Re: [Open-FCoE] [RFC PATCH] libfc: Fix remote port restart problem

Reply via email to