Hi Ben, I like this approach, but I have a few questions.
On Thu, 2025-07-10 at 14:10 -0400, Benjamin Marzinski wrote: > The workaround for releasing a reservation held by a failed path > (clearing all persistent reservation data and then reregistering all > the > keys) has several problems. It requires devices that support the READ > FULL STATUS command and are capable of specifying TransportIDs with > REGISTRATION commands (SIP_C), neither of which are mandatory to > support > SCSI Persistent Reservations. While I have made similar experiences, I wonder where the SCSI specs state which service actions and which parameters are mandatory for Persistent Reservation commands. I've tried to figure it out from SPC-6, but I couldn't. > Non SIP_C devices will be left with no > registered keys. Also, not all cleared keys are registered, just the > ones going to the Target Port receiving the REGISTRATION command. To > reregister all the keys, the code would have to also use the "All > Target > Ports" flag to register keys on different Target Ports, but this > could > end up registering keys on paths they aren't supposed to be on (for > instance if one of the registered keys was only there because the > path > was down and it couldn't be released). > > The redesign avoids these issues by only using mandatory Persistent > Reservation commands, without extra optional parameters or flags, and > only effects the keys of the multipath device it is being issued on. > The new workaround is: > 1. Suspend the multipath device to prevent I/O > 2. Preempt the key to move the reservation to an available path. This > also removes the registered keys from every path except the path > issuing the PREEMPT command. Since the device is suspended, not > I/O > can go to these unregisted paths and fail. > 3. Release the reservation on the path that now holds it. Why that? Can't the reservation just be kept? > 4. Resume the device (since it no longer matters that most of the > paths > no longer have a registered key) > 5. Reregister the keys on all the paths. Why not re-register before resuming? > > If steps 3 or 4 fail, the code will attempt to reregister the keys, > and > then attempt (or possibly re-attempt) the resume. > > Signed-off-by: Benjamin Marzinski <bmarz...@redhat.com> > --- > libmpathpersist/mpath_persist_int.c | 173 ++++++++++++-------------- > -- > libmultipath/libmultipath.version | 1 + > 2 files changed, 76 insertions(+), 98 deletions(-) > > diff --git a/libmpathpersist/mpath_persist_int.c > b/libmpathpersist/mpath_persist_int.c > index 13829742..7fb08b2e 100644 > --- a/libmpathpersist/mpath_persist_int.c > +++ b/libmpathpersist/mpath_persist_int.c > @@ -366,7 +366,8 @@ static int mpath_prout_reg(struct multipath > *mpp,int rq_servact, int rq_scope, > > static int mpath_prout_common(struct multipath *mpp,int rq_servact, > int rq_scope, > unsigned int rq_type, > - struct prout_param_descriptor* paramp, > int noisy) > + struct prout_param_descriptor* paramp, > int noisy, > + struct path **pptr) > { > int i,j, ret; > struct pathgroup *pgp = NULL; > @@ -385,6 +386,8 @@ static int mpath_prout_common(struct multipath > *mpp,int rq_servact, int rq_scope > found = true; > ret = prout_do_scsi_ioctl(pp->dev, > rq_servact, rq_scope, > rq_type, paramp, > noisy); > + if (ret == MPATH_PR_SUCCESS && pptr) > + *pptr = pp; > if (ret != MPATH_PR_RETRYABLE_ERROR) > return ret; > } > @@ -405,14 +408,12 @@ static int mpath_prout_rel(struct multipath > *mpp,int rq_servact, int rq_scope, > struct path *pp = NULL; > int active_pathcount = 0; > pthread_attr_t attr; > - int rc, found = 0; > + int rc; > int count = 0; > int status = MPATH_PR_SUCCESS; > - struct prin_resp resp; > - struct prout_param_descriptor *pamp = NULL; > - struct prin_resp *pr_buff; > - int length; > - struct transportid *pptr = NULL; > + struct prin_resp resp = {0}; > + uint16_t udev_flags = (mpp->skip_kpartx) ? > MPATH_UDEV_NO_KPARTX_FLAG : 0; > + bool did_resume = false; > > if (!mpp) > return MPATH_PR_DMMP_ERROR; > @@ -502,104 +503,78 @@ static int mpath_prout_rel(struct multipath > *mpp,int rq_servact, int rq_scope, > } > > condlog (2, "%s: Path holding reservation is not > available.", mpp->wwid); > - > - pr_buff = mpath_alloc_prin_response(MPATH_PRIN_RFSTAT_SA); > - if (!pr_buff){ > - condlog (0, "%s: failed to alloc pr in response > buffer.", mpp->wwid); > + /* > + * Cannot free the reservation because the path that is > holding it > + * is not usable. Workaround this by: > + * 1. Suspending the device > + * 2. Preempting the reservation to move it to a usable path > + * (this removes the registered keys on all paths except > the > + * preempting one. Since the device is suspended, no IO > can > + * go to these unregistered paths and fail). > + * 3. Releasing the reservation on the path that now holds > it. > + * 4. Resuming the device (since it no longer matters that > most of > + * that paths no longer have a registered key) > + * 5. Reregistering keys on all the paths > + */ > + > + if (!dm_simplecmd_noflush(DM_DEVICE_SUSPEND, mpp->alias, 0)) > { > + condlog(0, "%s: release: failed to suspend dm > device.", Why do you use dm_simplecmd_noflush() here? Shouldn't queued IO be flushed from the dm device to avoid it being sent to paths that are going to be unregistered? > mpp->wwid); > return MPATH_PR_OTHER; > } > > - status = mpath_prin_activepath (mpp, MPATH_PRIN_RFSTAT_SA, > pr_buff, noisy); > - > - if (status != MPATH_PR_SUCCESS){ > - condlog (0, "%s: pr in read full status command > failed.", mpp->wwid); > - goto out; > - } > - > - num = pr_buff- > >prin_descriptor.prin_readfd.number_of_descriptor; > - if (0 == num){ > - goto out; > - } > - length = sizeof (struct prout_param_descriptor) + (sizeof > (struct transportid *)); > - > - pamp = (struct prout_param_descriptor *)malloc (length); > - if (!pamp){ > - condlog (0, "%s: failed to alloc pr out parameter.", > mpp->wwid); > - goto out; > - } > - > - memset(pamp, 0, length); > - > - pamp->trnptid_list[0] = (struct transportid *) malloc > (sizeof (struct transportid)); > - if (!pamp->trnptid_list[0]){ > - condlog (0, "%s: failed to alloc pr out > transportid.", mpp->wwid); > - goto out1; > - } > - pptr = pamp->trnptid_list[0]; > - > - if (get_be64(mpp->reservation_key)){ > - memcpy (pamp->key, &mpp->reservation_key, 8); > - condlog (3, "%s: reservation key set.", mpp->wwid); > + memset(paramp, 0, sizeof(*paramp)); > + memcpy(paramp->key, &mpp->reservation_key, 8); > + memcpy(paramp->sa_key, &mpp->reservation_key, 8); > + status = mpath_prout_common(mpp, MPATH_PROUT_PREE_SA, > rq_scope, rq_type, > + paramp, noisy, &pp); > + if (status != MPATH_PR_SUCCESS) { > + condlog(0, "%s: release: pr preempt command > failed.", mpp->wwid); > + goto fail_resume; > } > > - status = mpath_prout_common (mpp, MPATH_PROUT_CLEAR_SA, > - rq_scope, rq_type, pamp, > noisy); > - > - if (status) { > - condlog(0, "%s: failed to send CLEAR_SA", mpp- > >wwid); > - goto out2; > + memset(paramp, 0, sizeof(*paramp)); > + memcpy(paramp->key, &mpp->reservation_key, 8); > + status = prout_do_scsi_ioctl(pp->dev, MPATH_PROUT_REL_SA, > rq_scope, > + rq_type, paramp, noisy); > + if (status != MPATH_PR_SUCCESS) { > + condlog(0, "%s: release on alternate path failed.", > mpp->wwid); > + goto out_reregister; > } > > - pamp->num_transportid = 1; > - > - for (i = 0; i < num; i++){ > - if (get_be64(mpp->reservation_key) && > - memcmp(pr_buff- > >prin_descriptor.prin_readfd.descriptors[i]->key, > - &mpp->reservation_key, 8)){ > - /*register with transport id*/ > - memset(pamp, 0, length); > - pamp->trnptid_list[0] = pptr; > - memset (pamp->trnptid_list[0], 0, sizeof > (struct transportid)); > - memcpy (pamp->sa_key, > - pr_buff- > >prin_descriptor.prin_readfd.descriptors[i]->key, 8); > - pamp->sa_flags = MPATH_F_SPEC_I_PT_MASK; > - pamp->num_transportid = 1; > - > - memcpy (pamp->trnptid_list[0], > - &pr_buff- > >prin_descriptor.prin_readfd.descriptors[i]->trnptid, > - sizeof (struct > transportid)); > - status = mpath_prout_common (mpp, > MPATH_PROUT_REG_SA, 0, rq_type, > - pamp, noisy); > - > - pamp->sa_flags = 0; > - memcpy (pamp->key, pr_buff- > >prin_descriptor.prin_readfd.descriptors[i]->key, 8); > - memset (pamp->sa_key, 0, 8); > - pamp->num_transportid = 0; > - status = mpath_prout_common (mpp, > MPATH_PROUT_REG_SA, 0, rq_type, > - pamp, noisy); > - } > - else > - { > - if (get_be64(mpp->reservation_key)) > - found = 1; > - } > - > - > - } > - > - if (found){ > - memset (pamp, 0, length); > - memcpy (pamp->sa_key, &mpp->reservation_key, 8); > - memset (pamp->key, 0, 8); > - status = mpath_prout_reg(mpp, MPATH_PROUT_REG_SA, > rq_scope, rq_type, pamp, noisy); > + if (!dm_simplecmd_noflush(DM_DEVICE_RESUME, mpp->alias, > udev_flags)) { > + condlog(0, "%s release: failed to resume dm > device.", mpp->wwid); > + /* > + * leave status set to MPATH_PR_SUCCESS, we will > have another > + * chance to resume the device. > + */ > + goto out_reregister; > + } > + did_resume = true; > + > +out_reregister: > + memset(paramp, 0, sizeof(*paramp)); > + memcpy(paramp->sa_key, &mpp->reservation_key, 8); > + rc = mpath_prout_reg(mpp, MPATH_PROUT_REG_IGN_SA, rq_scope, > rq_type, > + paramp, noisy); > + if (rc != MPATH_PR_SUCCESS) > + condlog(0, "%s: release: failed to reregister > paths.", mpp->wwid); > + > + /* > + * If we failed releasing the reservation or resuming > earlier > + * try resuming now. Otherwise, return with the > reregistering status > + * This means we will report failure, even though the > resevation > + * has been released, since the keys were not reregistered. > + */ > + if (did_resume) > + return rc; > + else if (status == MPATH_PR_SUCCESS) > + status = rc; > +fail_resume: > + if (!dm_simplecmd_noflush(DM_DEVICE_RESUME, mpp->alias, > udev_flags)) { > + condlog(0, "%s: release: failed to resume dm > device.", mpp->wwid); > + if (status == MPATH_PR_SUCCESS) > + status = MPATH_PR_OTHER; > } > - > -out2: > - free(pptr); > -out1: > - free (pamp); > -out: > - free (pr_buff); > return (status == MPATH_PR_RETRYABLE_ERROR) ? MPATH_PR_OTHER > : status; > } > > @@ -620,6 +595,7 @@ int do_mpath_persistent_reserve_out(vector curmp, > vector pathvec, int fd, > mpp->mpe = find_mpe(conf->mptable, mpp->wwid); > select_reservation_key(conf, mpp); > select_all_tg_pt(conf, mpp); > + select_skip_kpartx(conf, mpp); I understand that this is because of the DM_DEVICE_RESUME, but it deserves a comment here as it seems a bit out of place in this code that deals only with persistent reservations. Actually, we may want to move this logic to dm_simplecmd(). Thanks, Martin > put_multipath_config(conf); > > memcpy(&prkey, paramp->sa_key, 8); > @@ -661,7 +637,8 @@ int do_mpath_persistent_reserve_out(vector curmp, > vector pathvec, int fd, > case MPATH_PROUT_PREE_SA : > case MPATH_PROUT_PREE_AB_SA : > case MPATH_PROUT_CLEAR_SA: > - ret = mpath_prout_common(mpp, rq_servact, rq_scope, > rq_type, paramp, noisy); > + ret = mpath_prout_common(mpp, rq_servact, rq_scope, > rq_type, > + paramp, noisy, NULL); > break; > case MPATH_PROUT_REL_SA: > ret = mpath_prout_rel(mpp, rq_servact, rq_scope, > rq_type, paramp, noisy); > diff --git a/libmultipath/libmultipath.version > b/libmultipath/libmultipath.version > index a6718355..2f47b3ad 100644 > --- a/libmultipath/libmultipath.version > +++ b/libmultipath/libmultipath.version > @@ -249,4 +249,5 @@ local: > LIBMULTIPATH_29.1.0 { > global: > can_recheck_wwid; > + select_skip_kpartx; > } LIBMULTIPATH_29.0.0;