I've pushed pedro_softraid_flush, which addresses the problem
differently, and hopefully in a much simpler manner. My x61s seems happy
with it + softraid(4) FDE.

-p.

Let me know if it seems ok and I'll update the affected laptop. I've let it
get rather behind.

On Tue Feb 24 2015 at 1:08:49 PM Pedro Martelletto <[email protected]>
wrote:

Hi Oga,

Thanks for the feedback. I've pushed a branch, pedro_softraid, which I
plan to test later today. Except for the missing DIOCCACHESYNC ioctl, I
expect it to work.

-p.

>Amusingly, this is actually the theory that I had about it. I thought I
>had shared it with you but I may well misremember. Had no time to hack it
>up though.
>
>I concur though. Sounds very plausible.
>
>On Tue Feb 24 2015 at 7:20:55 AM Pedro Martelletto <[email protected]>
>wrote:
>
>> Here are some thoughts on the interaction between softraid(4) and WAPBL,
>> and how to improve the current situation. All of the code references in
>> this mail concern the master branch with HEAD = 3ca4b444.
>>
>> With WAPBL came the DIOCCACHESYNC ioctl, whose purpose, as the name
>> implies, is to flush an I/O controller's write cache.
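>>
>> For illustration, the flush can also be triggered from userland by
>> issuing the ioctl on the raw device. The sketch below is not from this
>> tree; it assumes DIOCCACHESYNC is declared in <sys/dkio.h> and takes an
>> int "force" argument, as in NetBSD, and /dev/rsd0c is just a placeholder
>> device name:
>>
>>     #include <sys/types.h>
>>     #include <sys/ioctl.h>
>>     #include <sys/dkio.h>
>>     #include <fcntl.h>
>>     #include <err.h>
>>
>>     int
>>     main(void)
>>     {
>>             int fd, force = 1;
>>
>>             /* Open the raw device backing the file systems. */
>>             if ((fd = open("/dev/rsd0c", O_RDWR)) == -1)
>>                     err(1, "open");
>>             /* Ask the controller (or softraid) to flush its cache. */
>>             if (ioctl(fd, DIOCCACHESYNC, &force) == -1)
>>                     err(1, "DIOCCACHESYNC");
>>             return 0;
>>     }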
>>
>> WAPBL periodically issues the DIOCCACHESYNC ioctl to ensure that journal
>> metadata is safely on disk. This use of DIOCCACHESYNC happens at the
>> file system level, which means that a disk device handling multiple
>> partitions, each a fully independent file system, may be requested to
>> perform multiple and possibly concurrent DIOCCACHESYNC operations.
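>>
>> For reference, the file-system-level flush boils down to a call of
>> roughly the following shape (modeled on NetBSD's wapbl_cache_sync();
>> the WAPBL port in this tree may spell it differently):
>>
>>     int error, force = 1;
>>
>>     /* One of these is issued per journal flush, per file system. */
>>     error = VOP_IOCTL(wl->wl_devvp, DIOCCACHESYNC, &force, FWRITE, FSCRED);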
>>
>> For softraid(4), DIOCCACHESYNC gets translated to sr_raid_sync(). What
>> sr_raid_sync() implements is not quite a sync operation per se, but
>> rather a drain: it loops until all pending I/O operations have finished.
>> This approach assumes that no further I/O requests can be posted to
>> softraid(4) after a sync has started, which holds in the scenarios for
>> which sr_raid_sync() was originally envisaged (look at the code paths
>> that lead to sd_flush() other than DIOCCACHESYNC: activate, deactivate,
>> close).
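>>
>> In other words, the current logic amounts to the loop below, simplified
>> from the code that the diff at the end of this mail removes:
>>
>>     sd->sd_sync = 1;
>>     while (sd->sd_wu_pending > ios) {
>>             /* Sleep up to one second waiting for outstanding I/O. */
>>             if (msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz) ==
>>                 EWOULDBLOCK && retries-- <= 0) {
>>                     rv = 1;         /* give up after ~15 seconds */
>>                     break;
>>             }
>>     }
>>     sd->sd_sync = 0;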
>>
>> However, that assumption does not hold true for a softraid(4) disk
>> backing multiple WAPBL file systems. In this situation, new I/O requests
>> may, and very likely will, be posted to softraid(4) while a sync is in
>> progress. This can slow down the sync operation, as sr_raid_sync()
>> loops waiting for the number of outstanding I/O operations to reach
>> zero, ultimately leading to a timeout after 15 seconds. This worst-case
>> scenario will certainly happen should one of these parallel operations
>> be another sync, as one sync will cause the other to time out.
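>>
>> To make the failure mode concrete, picture two WAPBL file systems, A and
>> B, on the same softraid(4) volume: A issues DIOCCACHESYNC and
>> sr_raid_sync() starts draining; B keeps posting I/O, so the pending
>> count never drops far enough; B then issues its own DIOCCACHESYNC, and
>> with two syncs in flight one inevitably makes the other time out.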
>>
>> Oga, this is probably the slowdown you are seeing. I'm sorry that it
>> took so long for the problem to be diagnosed.
>>
>> Below is nothing but a sketch of a fix. I plan to work on it as I find
>> the time.
>>
>> -p.
>>
>> diff --git sys/dev/softraid.c sys/dev/softraid.c
>> index a9e9ff5..4a2657c 100644
>> --- sys/dev/softraid.c
>> +++ sys/dev/softraid.c
>> @@ -2121,6 +2121,8 @@ sr_wu_free(struct sr_discipline *sd)
>>
>>         DNPRINTF(SR_D_WU, "%s: sr_wu_free %p\n", DEVNAME(sd->sd_sc), sd);
>>
>> +       /* XXX mutex? */
>> +
>>         while ((wu = TAILQ_FIRST(&sd->sd_wu_freeq)) != NULL)
>>                 TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
>>         while ((wu = TAILQ_FIRST(&sd->sd_wu_pendq)) != NULL)
>> @@ -2144,6 +2146,7 @@ sr_wu_get(void *xsd)
>>         wu = TAILQ_FIRST(&sd->sd_wu_freeq);
>>         if (wu) {
>>                 TAILQ_REMOVE(&sd->sd_wu_freeq, wu, swu_link);
>> +               TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
>>                 sd->sd_wu_pending++;
>>         }
>>         mtx_leave(&sd->sd_wu_mtx);
>> @@ -2266,6 +2269,8 @@ sr_wu_done_callback(void *xwu)
>>                         goto done;
>>         }
>>
>> +       /* XXX mutex? */
>> +
>>         /* Remove work unit from pending queue. */
>>         TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
>>                 if (wup == wu)
>> @@ -4078,6 +4083,7 @@ sr_raid_start_stop(struct sr_workunit *wu)
>>  int
>>  sr_raid_sync(struct sr_workunit *wu)
>>  {
>> +       struct sr_workunit      *wup;
>>         struct sr_discipline    *sd = wu->swu_dis;
>>         int                     ios = 0, rv = 0, retries = 15;
>>
>> @@ -4099,35 +4105,36 @@ sr_raid_sync(struct sr_workunit *wu)
>>
>>         /* diagnostic */
>>         if (sd->sd_sync != 0) {
>> -               printf("sd->sd_sync != 0, called reentrant\n");
>> +               /* wait for ongoing syncs to finish */
>>         }
>>
>>         sd->sd_sync = 1;
>> -       for (;;) {
>> -               if (sd->sd_wu_pending <= ios) {
>> -                       break;
>> -               }
>>
>> -               /* wait a bit for io to drain */
>> -               if (msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz) ==
>> -                   EWOULDBLOCK) {
>> -                       if (retries-- > 0) {
>> -                               continue;
>> -                       }
>> +       TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
>> +               KASSERT((wup->swu_flags & SR_WUF_SEEN) == 0);
>> +               wup->swu_flags |= SR_WUF_SEEN;
>> +       }
>>
>> -                       /* print timeout for now */
>> -                       printf("%s: sr_raid_sync timeout\n",
>> -                           DEVNAME(sd->sd_sc));
>> +       rv = 1;
>> +       while (retries-- > 0) {
>> +               /* Are any work units marked at sync start still pending? */
>> +               TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link)
>> +                       if (wup->swu_flags & SR_WUF_SEEN)
>> +                               break;
>> +               if (wup == NULL) {
>> +                       rv = 0;
>> +                       break;
>> +               }
>> +               /* Wait a bit for the marked work units to drain. */
>> +               msleep(sd, &sd->sd_wu_mtx, PWAIT, "sr_sync", 1 * hz);
>> +       }
>>
>> -                       DNPRINTF(SR_D_DIS, "%s: sr_raid_sync timeout\n",
>> -                           DEVNAME(sd->sd_sc));
>> -                       rv = 1;
>> -                       break;
>> -               }
>> +       if (rv != 0)
>> +               TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
>> +                       wup->swu_flags &= ~SR_WUF_SEEN;
>>         }
>> -       sd->sd_sync = 0;
>> +
>>         mtx_leave(&sd->sd_wu_mtx);
>>
>> +       sd->sd_sync = 0;
>> +
>> +       /* XXX should issue DIOCCACHESYNC on the underlying disk(s)! */
>> +
>>         wakeup(&sd->sd_sync);
>>
>>         return (rv);
>> @@ -4183,6 +4190,8 @@ sr_schedule_wu(struct sr_workunit *wu)
>>                 panic("sr_schedule_wu: work unit not in progress (state %i)\n",
>>                     wu->swu_state);
>>
>> +       /* XXX mutex? */
>> +
>>         /* Walk queue backwards and fill in collider if we have one. */
>>         TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
>>                 if (wu->swu_blk_end < wup->swu_blk_start ||
>> @@ -4223,9 +4232,6 @@ sr_raid_startwu(struct sr_workunit *wu)
>>                 wu->swu_state = SR_WU_INPROGRESS;
>>         }
>>
>> -       if (wu->swu_state != SR_WU_RESTART)
>> -               TAILQ_INSERT_TAIL(&sd->sd_wu_pendq, wu, swu_link);
>> -
>>         /* Start all of the individual I/Os. */
>>         if (wu->swu_cb_active == 1)
>>                 panic("%s: sr_startwu_callback", DEVNAME(sd->sd_sc));
>> diff --git sys/dev/softraidvar.h sys/dev/softraidvar.h
>> index 462a4f2..d386fa0 100644
>> --- sys/dev/softraidvar.h
>> +++ sys/dev/softraidvar.h
>> @@ -387,6 +387,7 @@ struct sr_workunit {
>>  #define SR_WUF_WAKEUP          (1<<4)          /* Wakeup on I/O completion. */
>>  #define SR_WUF_DISCIPLINE      (1<<5)          /* Discipline specific I/O. */
>>  #define SR_WUF_FAKE            (1<<6)          /* Faked workunit. */
>> +#define SR_WUF_SEEN            (1<<7)          /* Visited during sync. */
>>
>>         /* workunit io range */
>>         daddr_t                 swu_blk_start;
>>
>>


