On Sat, 2024-07-13 at 02:04 -0400, Benjamin Marzinski wrote:
> Move the code to sync the mpp device state into a helper function and
> add a counter to make sure that the device is synced at least once
> every max_checkint secs. This makes sure that multipath devices with
> no
> paths will still get synced with the kernel.  Also, if multiple paths
> are checked in the same loop, the multipath device will only be
> synced
> with the kernel once, since every time the mpp is synced in any code
> path, mpp->sync_tick is reset.
> 
> The code still syncs the mpp before updating the path state for two
> main reasons.
> 
> 1. Sometimes multipathd leaves the mpp with a garbage state. Future
>    patches will fix most of these cases, but the code intentionally
>    does not remove the mpp if resyncing fails while checking paths.
>    But this does leave the mpp with a garbage state.
> 
> 2. The kernel changes the multipath state independently of multipathd.
> If
>    the kernel fails a path, a uevent will arrive shortly. But the
> kernel
>    doesn't provide any notification when it switches the active
>    path group or if it ends up picking a different one than
> multipathd
>    selected. Multipathd needs to know the actual current pathgroup to
>    know when it should be switching them.
> 
> Signed-off-by: Benjamin Marzinski <[email protected]>
> ---
>  libmultipath/configure.c   |  1 +
>  libmultipath/structs.h     |  2 ++
>  libmultipath/structs_vec.c |  5 +++
>  multipathd/main.c          | 64 +++++++++++++++++++++++++-----------
> --
>  4 files changed, 50 insertions(+), 22 deletions(-)
> 
> 
> diff --git a/multipathd/main.c b/multipathd/main.c
> index fbd253ca..179fec24 100644
> --- a/multipathd/main.c
> +++ b/multipathd/main.c
> @@ -2342,6 +2342,37 @@ check_path_state(struct path *pp)
>       return newstate;
>  }
>  
> +static void
> +do_check_mpp(struct vectors * vecs, struct multipath *mpp)
> +{
> +     int i, ret;
> +     struct path *pp;
> +
> +     mpp->is_checked = true;
> +     ret = update_multipath_strings(mpp, vecs->pathvec);
> +     if (ret != DMP_OK) {
> +             condlog(1, "%s: %s", mpp->alias, ret ==
> DMP_NOT_FOUND ?
> +                     "device not found" :
> +                     "couldn't synchronize with kernel state");
> +             vector_foreach_slot (mpp->paths, pp, i)
> +                     pp->dmstate = PSTATE_UNDEF;
> +             return;
> +     }
> +     set_no_path_retry(mpp);
> +}
> +
> +static void
> +check_mpp(struct vectors * vecs, struct multipath *mpp, unsigned int
> ticks)
> +{
> +     if (mpp->sync_tick)
> +             mpp->sync_tick -= (mpp->sync_tick > ticks) ? ticks :
> +                               mpp->sync_tick;
> +     if (mpp->sync_tick)
> +             return;
> +
> +     do_check_mpp(vecs, mpp);
> +}
> +
>  /*
>   * Returns '1' if the path has been checked and '0' otherwise
>   */
> @@ -2356,7 +2387,6 @@ check_path (struct vectors * vecs, struct path
> * pp, unsigned int ticks)
>       unsigned int checkint, max_checkint;
>       struct config *conf;
>       int marginal_pathgroups, marginal_changed = 0;
> -     int ret;
>       bool need_reload;
>  
>       if (pp->initialized == INIT_REMOVED)
> @@ -2395,26 +2425,6 @@ check_path (struct vectors * vecs, struct path
> * pp, unsigned int ticks)
>               pp->tick = 1;
>               return 0;
>       }
> -     /*
> -      * Synchronize with kernel state
> -      */
> -     ret = update_multipath_strings(pp->mpp, vecs->pathvec);
> -     if (ret != DMP_OK) {
> -             if (ret == DMP_NOT_FOUND) {
> -                     /* multipath device missing. Likely removed
> */
> -                     condlog(1, "%s: multipath device '%s' not
> found",
> -                             pp->dev, pp->mpp ? pp->mpp->alias :
> "");
> -                     return 0;
> -             } else
> -                     condlog(1, "%s: Couldn't synchronize with
> kernel state",
> -                             pp->dev);
> -             pp->dmstate = PSTATE_UNDEF;
> -     }
> -     /* if update_multipath_strings orphaned the path, quit early
> */
> -     if (!pp->mpp)
> -             return 0;
> -     set_no_path_retry(pp->mpp);
> -
>       if (pp->recheck_wwid == RECHECK_WWID_ON &&
>           (newstate == PATH_UP || newstate == PATH_GHOST) &&
>           ((pp->state != PATH_UP && pp->state != PATH_GHOST) ||
> @@ -2424,7 +2434,12 @@ check_path (struct vectors * vecs, struct path
> * pp, unsigned int ticks)
>               handle_path_wwid_change(pp, vecs);
>               return 0;
>       }
> -
> +     if (!pp->mpp->is_checked) {
> +             do_check_mpp(vecs, pp->mpp);
> +             /* if update_multipath_strings orphaned the path,
> quit early */
> +             if (!pp->mpp)
> +                     return 0;
> +     }
>       if ((newstate != PATH_UP && newstate != PATH_GHOST &&
>            newstate != PATH_PENDING) && (pp->state ==
> PATH_DELAYED)) {
>               /* If path state become failed again cancel path
> delay state */
> @@ -2752,12 +2767,17 @@ checkerloop (void *ap)
>               while (checker_state != CHECKER_FINISHED) {
>                       unsigned int paths_checked = 0, i;
>                       struct timespec chk_start_time;
> +                     struct multipath *mpp;
>  
>                       pthread_cleanup_push(cleanup_lock, &vecs-
> >lock);
>                       lock(&vecs->lock);
>                       pthread_testcancel();
> +                     vector_foreach_slot(vecs->mpvec, mpp, i)
> +                             mpp->is_checked = false;

Why is this not done inside the "if (checker_state == CHECKER_STARTING)"
code path?

Martin


>                       get_monotonic_time(&chk_start_time);
>                       if (checker_state == CHECKER_STARTING) {
> +                             vector_foreach_slot(vecs->mpvec,
> mpp, i)
> +                                     check_mpp(vecs, mpp, ticks);
>                               vector_foreach_slot(vecs->pathvec,
> pp, i)
>                                       pp->is_checked = false;
>                               checker_state = CHECKER_RUNNING;


Reply via email to