Here are some thoughts on the interaction between softraid(4) and WAPBL,
and how to improve the current situation. All of the code references in
this mail concern the master branch with HEAD = 3ca4b444.
WAPBL saw the introduction of the DIOCCACHESYNC ioctl, whose purpose, as
the name implies, is to flush an I/O controller's cache.
WAPBL periodically issues the DIOCCACHESYNC ioctl to ensure that journal
metadata is safely on disk. This use of DIOCCACHESYNC happens at the
file system level, which means that a disk device handling multiple
partitions, each a fully independent file system, may be requested to
perform multiple and possibly concurrent DIOCCACHESYNC operations.
For softraid(4), DIOCCACHESYNC gets translated to sr_raid_sync(). What
sr_raid_sync() implements is not quite a sync operation per se, but
rather a drain: it loops until all pending I/O operations have finished.
This rationale assumes that no further I/O requests can get posted to
softraid(4) after a sync has started, which is the case of the scenarios
for which sr_raid_sync() was originally envisaged (look at the code
paths that lead to sd_flush() other than DIOCCACHESYNC: activate,
deactive, close).
However, that assumption does not hold true for a softraid(4) disk
backing multiple WAPBL file systems. In this situation, new I/O requests
may and very likely will be posted to softraid(4) when a sync is in
progress. This may have the effect of slowing down the sync operation,
as sr_raid_sync() loops waiting for the number of outstanding I/O
operations to reach zero, ultimately leading to a timeout after 15
seconds. This worst-case scenario will certainly happen should one of
these parallel operations be another sync, as one sync will cause the
other to timeout.
Oga, this is probably the slowdown you are seeing. I'm sorry that it
took so long for the problem to be diagnosed.
Below is nothing but a sketch of a fix. I plan to work on it as I find
the time.
-p.
diff --git sys/dev/softraid.c sys/dev/softraid.c
index a9e9ff5..4a2657c 100644
--- sys/dev/softraid.c
+++ sys/dev/softraid.c
@@ -2121,6 +2121,8 @@ sr_wu_free(struct sr_discipline *sd)
DNPRINTF(SR_D_WU, "%s: sr_wu_free %p\n", DEVNAME(sd->sd_sc), sd);
+ /* XXX mutex? */
+
while ((wu = TAILQ_FIRST(&sd->sd_wu_freeq)) != NULL)
TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
while ((wu = TAILQ_FIRST(&sd->sd_wu_pendq)) != NULL)
@@ -2144,6 +2146,7 @@ sr_wu_get(void *xsd)
wu = TAILQ_FIRST(&sd->sd_wu_freeq);
if (wu) {
TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
+ TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
sd->sd_wu_pending++;
}
mtx_leave(&sd->sd_wu_mtx);
@@ -2266,6 +2269,8 @@ sr_wu_done_callback(void *xwu)
goto done;
}
+ /* XXX mutex? */
+
/* Remove work unit from pending queue. */
TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
if (wup == wu)
@@ -4078,6 +4083,7 @@ sr_raid_start_stop(struct sr_workunit *wu)
int
sr_raid_sync(struct sr_workunit *wu)
{
+ struct sr_workunit *wup;
struct sr_discipline *sd = wu->swu_dis;
int ios = 0, rv = 0, retries = 15;
@@ -4099,35 +4105,36 @@ sr_raid_sync(struct sr_workunit *wu)
/* diagnostic */
if (sd->sd_sync != 0) {
- printf("sd->sd_sync != 0, called reentrant\n");
+ /* wait for ongoing syncs to finish */
}
sd->sd_sync = 1;
- for (;;) {
- if (sd->sd_wu_pending <= ios) {
- break;
- }
- /* wait a bit for io to drain */
- if (msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz) ==
- EWOULDBLOCK) {
- if (retries-- > 0) {
- continue;
- }
+ TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
+ KASSERT((wup->swu_flags & SR_WUF_SEEN) == 0);
+ wup->swu_flags |= SR_WUF_SEEN;
+ }
- /* print timeout for now */
- printf("%s: sr_raid_sync timeout\n",
- DEVNAME(sd->sd_sc));
+ while (sd->sd_wu_pending > ios && retries-- > 0) {
+ msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz);
+ TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
+ if (wup->swu_flags & SR_WUF_SEEN)
+ continue;
+ rv = 0;
+ break;
+ }
- DNPRINTF(SR_D_DIS, "%s: sr_raid_sync timeout\n",
- DEVNAME(sd->sd_sc));
- rv = 1;
- break;
- }
+ if (rv != 0)
+ TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
+ wup->swu_flags &= ~SR_WUF_SEEN;
}
- sd->sd_sync = 0;
+
mtx_leave(&sd->sd_wu_mtx);
+ sd->sd_sync = 0;
+
+ /* XXX should issue DIOCCACHESYNC on the underlying disk(s)! */
+
wakeup(&sd->sd_sync);
return (rv);
@@ -4183,6 +4190,8 @@ sr_schedule_wu(struct sr_workunit *wu)
panic("sr_schedule_wu: work unit not in progress (state %i)\n",
wu->swu_state);
+ /* XXX mutex? */
+
/* Walk queue backwards and fill in collider if we have one. */
TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
if (wu->swu_blk_end < wup->swu_blk_start ||
@@ -4223,9 +4232,6 @@ sr_raid_startwu(struct sr_workunit *wu)
wu->swu_state = SR_WU_INPROGRESS;
}
- if (wu->swu_state != SR_WU_RESTART)
- TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
-
/* Start all of the individual I/Os. */
if (wu->swu_cb_active == 1)
panic("%s: sr_startwu_callback", DEVNAME(sd->sd_sc));
diff --git sys/dev/softraidvar.h sys/dev/softraidvar.h
index 462a4f2..d386fa0 100644
--- sys/dev/softraidvar.h
+++ sys/dev/softraidvar.h
@@ -387,6 +387,7 @@ struct sr_workunit {
#define SR_WUF_WAKEUP (1<<4) /* Wakeup on I/O completion.
*/
#define SR_WUF_DISCIPLINE (1<<5) /* Discipline specific I/O. */
#define SR_WUF_FAKE (1<<6) /* Faked workunit. */
+#define SR_WUF_SEEN (1<<7) /* Visited during sync. */
/* workunit io range */
daddr_t swu_blk_start;