lru/failover path scheduling in mpath(4)

2011-06-15 Thread David Gwynne
the subject line says it all, but happy to explain further if
required.

Index: mpath.c
===
RCS file: /cvs/src/sys/scsi/mpath.c,v
retrieving revision 1.21
diff -u -p -r1.21 mpath.c
--- mpath.c 27 Apr 2011 05:22:24 -  1.21
+++ mpath.c 15 Jun 2011 08:03:06 -
@@ -58,6 +58,7 @@ struct mpath_dev {
 
u_int d_path_count;
 
+   const struct mpath_ops  *d_ops;
struct devid *d_id;
 };
 
@@ -89,7 +90,7 @@ void  mpath_cmd(struct scsi_xfer *);
 void   mpath_minphys(struct buf *, struct scsi_link *);
 int mpath_probe(struct scsi_link *);
 
-struct mpath_path *mpath_next_path(struct mpath_dev *);
+struct mpath_path *mpath_next_path(struct mpath_dev *, int);
 void   mpath_done(struct scsi_xfer *);
 
 struct scsi_adapter mpath_switch = {
@@ -161,7 +162,7 @@ mpath_probe(struct scsi_link *link)
 }
 
 struct mpath_path *
-mpath_next_path(struct mpath_dev *d)
+mpath_next_path(struct mpath_dev *d, int next)
 {
struct mpath_path *p;
 
@@ -169,7 +170,7 @@ mpath_next_path(struct mpath_dev *d)
panic("%s: d is NULL", __func__);
 
p = d->d_next_path;
-   if (p != NULL) {
+   if (p != NULL && next == MPATH_NEXT) {
d->d_next_path = TAILQ_NEXT(p, p_entry);
if (d->d_next_path == NULL)
d->d_next_path = TAILQ_FIRST(&d->d_paths);
@@ -194,7 +195,7 @@ mpath_cmd(struct scsi_xfer *xs)
 
if (ISSET(xs->flags, SCSI_POLL)) {
mtx_enter(&d->d_mtx);
-   p = mpath_next_path(d);
+   p = mpath_next_path(d, d->d_ops->op_schedule);
mtx_leave(&d->d_mtx);
if (p == NULL) {
mpath_xs_stuffup(xs);
@@ -232,7 +233,7 @@ mpath_cmd(struct scsi_xfer *xs)
 
mtx_enter(&d->d_mtx);
SIMPLEQ_INSERT_TAIL(&d->d_ccbs, ccb, c_entry);
-   p = mpath_next_path(d);
+   p = mpath_next_path(d, d->d_ops->op_schedule);
mtx_leave(&d->d_mtx);
 
if (p != NULL)
@@ -294,11 +295,15 @@ mpath_done(struct scsi_xfer *mxs)
struct mpath_ccb *ccb = xs->io;
struct mpath_dev *d = mpath_devs[link->target];
struct mpath_path *p;
+   int next = d->d_ops->op_schedule;
 
-   if (mxs->error == XS_RESET || mxs->error == XS_SELTIMEOUT) {
+   switch (mxs->error) {
+   case XS_SELTIMEOUT: /* physical path is gone, try the next */
+   next = MPATH_NEXT;
+   case XS_RESET:
mtx_enter(&d->d_mtx);
SIMPLEQ_INSERT_HEAD(&d->d_ccbs, ccb, c_entry);
-   p = mpath_next_path(d);
+   p = mpath_next_path(d, next);
mtx_leave(&d->d_mtx);
 
scsi_xs_put(mxs);
@@ -363,7 +368,7 @@ mpath_path_probe(struct scsi_link *link)
 }
 
 int
-mpath_path_attach(struct mpath_path *p)
+mpath_path_attach(struct mpath_path *p, const struct mpath_ops *ops)
 {
struct scsi_link *link = p->p_link;
struct mpath_dev *d = NULL;
@@ -381,7 +386,7 @@ mpath_path_attach(struct mpath_path *p)
if ((d = mpath_devs[target]) == NULL)
continue;
 
-   if (DEVID_CMP(d->d_id, link->id))
+   if (DEVID_CMP(d->d_id, link->id) && d->d_ops == ops)
break;
 
d = NULL;
@@ -403,6 +408,7 @@ mpath_path_attach(struct mpath_path *p)
TAILQ_INIT(&d->d_paths);
SIMPLEQ_INIT(&d->d_ccbs);
d->d_id = devid_copy(link->id);
+   d->d_ops = ops;
 
mpath_devs[target] = d;
newdev = 1;
Index: mpath_emc.c
===
RCS file: /cvs/src/sys/scsi/mpath_emc.c,v
retrieving revision 1.5
diff -u -p -r1.5 mpath_emc.c
--- mpath_emc.c 15 Jun 2011 01:10:50 -  1.5
+++ mpath_emc.c 15 Jun 2011 08:03:06 -
@@ -94,11 +94,12 @@ int emc_mpath_checksense(struct scsi_xf
 int emc_mpath_online(struct scsi_link *);
 int emc_mpath_offline(struct scsi_link *);
 
-struct mpath_ops emc_mpath_ops = {
+const struct mpath_ops emc_mpath_ops = {
"emc",
emc_mpath_checksense,
emc_mpath_online,
emc_mpath_offline,
+   MPATH_ROUNDROBIN
 };
 
 struct emc_device {
@@ -156,7 +157,6 @@ emc_attach(struct device *parent, struct
/* init path */
scsi_xsh_set(&sc->sc_path.p_xsh, link, emc_mpath_start);
sc->sc_path.p_link = link;
-   sc->sc_path.p_ops = &emc_mpath_ops;
 
if (emc_sp_info(sc)) {
printf("%s: unable to get sp info\n", DEVNAME(sc));
@@ -172,7 +172,7 @@ emc_attach(struct device *parent, struct
sc->sc_sp + 'A', sc->sc_port);
 
if (sc->sc_lun_state == EMC_SP_INFO_LUN_STATE_OWNED) {
-   if (mpath_path_attach(&sc->sc_path) != 0)
+   if (mpath_path_attach(&sc->sc_path, &emc_mpath_ops) != 0)
printf(%s: 

Re: lru/failover path scheduling in mpath(4)

2011-06-15 Thread David Gwynne
On 15/06/2011, at 8:16 PM, Mark Kettenis wrote:

>> Date: Wed, 15 Jun 2011 18:04:24 +1000
>> From: David Gwynne <l...@animata.net>

>> the subject line says it all, but happy to explain further if
>> required.

> Hmm, I'm somewhat confused:

>> +#define MPATH_ROUNDROBIN 0
>> +#define MPATH_NEXT  MPATH_ROUNDROBIN
>> +#define MPATH_LRU   1

ah crap, i mean MRU for most recently used. i don't know why LRU keeps coming
out of my fingers.

> What does MPATH_NEXT mean?  Is that the strategy you fall back on if
> the path you're using fails?  What if you have more than 4 paths to a
> disk with a device that only allows a single active path and one of
> them fails?  Are you suddenly going to try roundrobin scheduling of
> IOs?

if you have a group of four paths and you're doing MRU scheduling, only one of
those will be used. if that path goes away (ie, you get XS_SELTIMEOUT from the
physical adapter), MPATH_NEXT moves us to the next available path in the
group.

> Also, I don't quite understand what MPATH_LRU would mean.  Least
> Recently Used?  How is that different from Round Robin?  Don't you
> need a Last Used policy for devices that can only have a single active
> path or devices where there is a significant overhead for switching to
> a different path?

round robin uses all available paths, L^HMRU uses only one of the available
paths until it fails or goes away.

dlg