Hello,
attached is my work in progress on checksumming support for softraid
RAID1. Currently it does:
- computation of checksums (crc32)
- verification of checksums
- fail-over to another chunk (restart wu) in case of checksum error
- properly handle errors happening on all chunks
- "self-healing" of bad sector
- supports rebuild
- signal bad checksum to console and to sensors
E.g.:
$ sysctl hw.sensors.softraid0
hw.sensors.softraid0.raw0=0 (sd0f), OK
hw.sensors.softraid0.raw1=0 (sd0g), OK
hw.sensors.softraid0.drive0=
online (sd1), OK
Next TODO items:
- fix openings, currently it suggests max 2 wus are used per I/O, but
this changed with healing where the max number is 3 wus for case of
failed read (read, healing read, healing write)
- I really do not like current state (either in this patch or in
OpenBSD current) of handling of dependent wus. I will probably add
something like:
sr_schedule_depwus(struct sr_workunit *wu, struct sr_workunit *depwu);
which will hopefully be clearer and will unify the few different ways
dependent wus are currently handled in the SR RAID code.
- fix RAID5/6 wus dependency handling (see above item)
With the above tasks done I hope the patch will be complete. Another
big TODO, but not for this patch (series) is to add proper scrub
support since RAID1C will benefit from it a lot.
But this is task really for the future as this patch is already quite
big and I'd like to have it merged first (if possible in one form or
another).
Note: checksums are computed on a per-sector basis and saved in an area
allocated at the end of the drive. Due to this design,
LBA collision detection in softraid.c was enhanced/fixed to support
this usage as well,
and currently it may not be compatible with RAID5/6 usage.
Any comments welcome!
Thanks!
Karel
PS: sorry for not inlining the patch, gmail/firefox has frozen 3 times
on it so I've given up this time and just attached that.
? .cvsignore
Index: sbin/bioctl/bioctl.8
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.8,v
retrieving revision 1.96
diff -u -p -u -r1.96 bioctl.8
--- sbin/bioctl/bioctl.8 29 May 2015 00:33:37 -0000 1.96
+++ sbin/bioctl/bioctl.8 13 Sep 2015 20:40:28 -0000
@@ -199,6 +199,11 @@ for example, force the creation of volum
with unclean data in the metadata areas.
.It Ar noauto
Do not automatically assemble this volume at boot time.
+.It Ar chksum
+Enforce usage of checksums on the device blocks.
+The checksum area is located at the end of the device data area;
+since it occupies some space, the usable device size becomes smaller.
+Exactly 8 bytes of checksum are required per device data block.
.El
.It Fl c Ar raidlevel
Create a
Index: sbin/bioctl/bioctl.c
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.c,v
retrieving revision 1.129
diff -u -p -u -r1.129 bioctl.c
--- sbin/bioctl/bioctl.c 18 Jul 2015 23:23:20 -0000 1.129
+++ sbin/bioctl/bioctl.c 13 Sep 2015 20:40:29 -0000
@@ -1053,6 +1053,9 @@ bio_createflags(char *lst)
case 'n':
flags |= BIOC_SCNOAUTOASSEMBLE;
break;
+ case 'c':
+ flags |= BIOC_SCCHKSUM;
+ break;
default:
strlcpy(fs, s, sz + 1);
errx(1, "invalid flag %s", fs);
Index: sys/dev/biovar.h
===================================================================
RCS file: /cvs/src/sys/dev/biovar.h,v
retrieving revision 1.44
diff -u -p -u -r1.44 biovar.h
--- sys/dev/biovar.h 29 May 2015 00:33:37 -0000 1.44
+++ sys/dev/biovar.h 13 Sep 2015 20:40:29 -0000
@@ -213,6 +213,7 @@ struct bioc_createraid {
#define BIOC_SCDEVT 0x02 /* dev_t array or string in dev_list */
#define BIOC_SCNOAUTOASSEMBLE 0x04 /* do not assemble during autoconf */
#define BIOC_SCBOOTABLE 0x08 /* device is bootable */
+#define BIOC_SCCHKSUM 0x10 /* device provides chksum capability */
u_int32_t bc_opaque_size;
u_int32_t bc_opaque_flags;
#define BIOC_SOINVALID 0x00 /* no opaque pointer */
Index: sys/dev/softraid.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid.c,v
retrieving revision 1.364
diff -u -p -u -r1.364 softraid.c
--- sys/dev/softraid.c 19 Aug 2015 19:05:24 -0000 1.364
+++ sys/dev/softraid.c 13 Sep 2015 20:40:30 -0000
@@ -71,6 +71,7 @@ uint32_t sr_debug = 0
/* | SR_D_DIS */
/* | SR_D_STATE */
/* | SR_D_REBUILD */
+ /* | SR_D_CHKSUM */
;
#endif
@@ -144,6 +145,8 @@ int sr_chunk_in_use(struct sr_softc *,
int sr_rw(struct sr_softc *, dev_t, char *, size_t,
daddr_t, long);
void sr_wu_done_callback(void *);
+int sr_wu_collision(struct sr_workunit *,
+ struct sr_workunit *);
/* don't include these on RAMDISK */
#ifndef SMALL_KERNEL
@@ -2264,6 +2267,9 @@ sr_wu_done_callback(void *xwu)
s = splbio();
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done: %p\n",
+ DEVNAME(sd->sd_sc), wu);
+
if (xs != NULL) {
if (wu->swu_ios_failed)
xs->error = XS_DRIVER_STUFFUP;
@@ -2286,11 +2292,54 @@ sr_wu_done_callback(void *xwu)
TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
if (wu->swu_collider) {
- if (wu->swu_ios_failed)
- sr_raid_recreate_wu(wu->swu_collider);
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider:
%p\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ if (wu->swu_ios_failed) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, recreate collider?: %p
WHY???\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ sr_raid_recreate_wu(wu->swu_collider);
+ }
+ /*
+ * We're searching for a wu which has the same collider
+ * as the current wu. If we find such a wu we can continue
+ * without starting the collider. If we do not find such a wu
+ * then we need to start the collider, as the current wu is
+ * the last wu the collider collides with.
+ */
+ int found = 0;
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider:"
+ " %p\n", DEVNAME(sd->sd_sc), wu->swu_collider);
+ TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
+ if (wu->swu_collider == wup->swu_collider) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+ " collider in wu: %p\n",
+ DEVNAME(sd->sd_sc), wup);
+ found++;
+ break;
+ }
+ }
+ TAILQ_FOREACH(wup, &sd->sd_wu_defq, swu_link) {
+ if (wu->swu_collider == wup->swu_collider) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+ " collider in def wu: %p\n",
+ DEVNAME(sd->sd_sc), wup);
+ found++;
+ break;
+ }
+ }
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, collider found: %d\n",
+ DEVNAME(sd->sd_sc), found);
- /* XXX Should the collider be failed if this xs failed? */
- sr_raid_startwu(wu->swu_collider);
+ if (found == 0) {
+ /* The current wu is the last wu colliding
+ with the collider. */
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, starting the
collider: %p\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ sr_raid_startwu(wu->swu_collider);
+ }
+ else {
+ wu->swu_collider = NULL;
+ }
}
/*
@@ -3967,6 +4016,7 @@ sr_discipline_init(struct sr_discipline
sd->sd_set_chunk_state = sr_set_chunk_state;
sd->sd_set_vol_state = sr_set_vol_state;
sd->sd_start_discipline = NULL;
+ sd->sd_wu_collision_detection = NULL;
task_set(&sd->sd_meta_save_task, sr_meta_save_callback, sd);
task_set(&sd->sd_hotspare_rebuild_task, sr_hotspare_rebuild_callback,
@@ -4181,11 +4231,30 @@ sr_raid_intr(struct buf *bp)
splx(s);
}
+int
+sr_wu_collision(struct sr_workunit *wu1, struct sr_workunit *wu2)
+{
+ struct sr_discipline *sd = wu1->swu_dis;
+
+ if (sd->sd_wu_collision_detection) {
+ return sd->sd_wu_collision_detection(wu1, wu2);
+ }
+ else if (wu1->swu_blk_end < wu2->swu_blk_start ||
+ wu2->swu_blk_end < wu1->swu_blk_start) {
+ return 0;
+ }
+ else {
+ return 1;
+ }
+}
+
void
sr_schedule_wu(struct sr_workunit *wu)
{
struct sr_discipline *sd = wu->swu_dis;
struct sr_workunit *wup;
+ struct sr_workunit *twup;
+
int s;
DNPRINTF(SR_D_WU, "sr_schedule_wu: schedule wu %p state %i "
@@ -4210,20 +4279,53 @@ sr_schedule_wu(struct sr_workunit *wu)
if (wu->swu_state != SR_WU_INPROGRESS)
panic("sr_schedule_wu: work unit not in progress (state %i)\n",
wu->swu_state);
+ /*
+ * Walk both the pending and deferred queues and find colliding wus.
+ * If we find a collision we set the colliding wu's collider to the
+ * current wu and push the current wu onto the deferred queue.
+ */
+ int colliding = 0;
- /* Walk queue backwards and fill in collider if we have one. */
- TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
- if (wu->swu_blk_end < wup->swu_blk_start ||
- wup->swu_blk_end < wu->swu_blk_start)
+ TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_pendq, swu_link, twup) {
+ if (!sr_wu_collision(wu, wup))
continue;
+ colliding = 1;
/* Defer work unit due to LBA collision. */
- DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p\n",
- wu);
- wu->swu_state = SR_WU_DEFERRED;
+ DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+ " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+ " %lld\n", wu, wup,
+ ((wup->swu_xs != NULL) ?
+ (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+ : "NULL"),
+ wup->swu_blk_start, wup->swu_blk_end);
+ while (wup->swu_collider)
+ wup = wup->swu_collider;
+
+ if (wup != wu)
+ wup->swu_collider = wu;
+ }
+ TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_defq, swu_link, twup) {
+ if (!sr_wu_collision(wu, wup))
+ continue;
+
+ colliding = 1;
+ DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+ " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+ " %lld\n", wu, wup,
+ ((wup->swu_xs != NULL) ?
+ (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+ : "NULL"),
+ wup->swu_blk_start, wup->swu_blk_end);
+
while (wup->swu_collider)
wup = wup->swu_collider;
- wup->swu_collider = wu;
+
+ if (wup != wu)
+ wup->swu_collider = wu;
+ }
+ if (colliding == 1) {
+ wu->swu_state = SR_WU_DEFERRED;
TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
sd->sd_wu_collisions++;
goto queued;
@@ -4651,7 +4753,7 @@ sr_rebuild(struct sr_discipline *sd)
struct sr_workunit *wu_r, *wu_w;
struct scsi_xfer xs_r, xs_w;
struct scsi_rw_16 *cr, *cw;
- int c, s, slept, percent = 0, old_percent = -1;
+ int c, slept, percent = 0, old_percent = -1;
u_int8_t *buf;
whole_blk = sd->sd_meta->ssdi.ssd_size / SR_REBUILD_IO_SIZE;
@@ -4713,7 +4815,7 @@ sr_rebuild(struct sr_discipline *sd)
cr->opcode = READ_16;
_lto4b(sz, cr->length);
_lto8b(lba, cr->addr);
- wu_r->swu_state = SR_WU_CONSTRUCT;
+ wu_r->swu_state = SR_WU_INPROGRESS;
wu_r->swu_flags |= SR_WUF_REBUILD;
wu_r->swu_xs = &xs_r;
if (sd->sd_scsi_rw(wu_r)) {
@@ -4734,31 +4836,20 @@ sr_rebuild(struct sr_discipline *sd)
cw->opcode = WRITE_16;
_lto4b(sz, cw->length);
_lto8b(lba, cw->addr);
- wu_w->swu_state = SR_WU_CONSTRUCT;
+ wu_w->swu_state = SR_WU_INPROGRESS;
wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
wu_w->swu_xs = &xs_w;
+ /*
+ * collide with the read io so that we get automatically
+ * started when the read is done
+ */
+ wu_w->swu_blk_start = wu_r->swu_blk_start;
+ wu_w->swu_blk_end = wu_r->swu_blk_end;
if (sd->sd_scsi_rw(wu_w)) {
printf("%s: could not create write io\n",
DEVNAME(sc));
goto fail;
}
-
- /*
- * collide with the read io so that we get automatically
- * started when the read is done
- */
- wu_w->swu_state = SR_WU_DEFERRED;
- wu_r->swu_collider = wu_w;
- s = splbio();
- TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
- splx(s);
-
- DNPRINTF(SR_D_REBUILD, "%s: %s rebuild scheduling wu_r %p\n",
- DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, wu_r);
-
- wu_r->swu_state = SR_WU_INPROGRESS;
- sr_schedule_wu(wu_r);
-
/* wait for write completion */
slept = 0;
while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
@@ -4845,7 +4936,15 @@ void
sr_sensors_delete(struct sr_discipline *sd)
{
DNPRINTF(SR_D_STATE, "%s: sr_sensors_delete\n", DEVNAME(sd->sd_sc));
-
+ /* first run through chunk-specific sensors */
+ /* shall we enhance discipline API and add sensor delete function? */
+ int chdx;
+ for (chdx = 0; chdx < sd->sd_meta->ssdi.ssd_chunk_no; chdx++) {
+ if (sd->sd_vol.sv_chunks[chdx]->src_sensor_attached) {
+ sensor_detach(&sd->sd_sc->sc_sensordev,
+ &sd->sd_vol.sv_chunks[chdx]->src_sensor);
+ }
+ }
if (sd->sd_vol.sv_sensor_attached)
sensor_detach(&sd->sd_sc->sc_sensordev, &sd->sd_vol.sv_sensor);
}
@@ -4856,6 +4955,8 @@ sr_sensors_refresh(void *arg)
struct sr_softc *sc = arg;
struct sr_volume *sv;
struct sr_discipline *sd;
+ struct sr_chunk *chunk;
+ struct sr_chunk_head *cl;
DNPRINTF(SR_D_STATE, "%s: sr_sensors_refresh\n", DEVNAME(sc));
@@ -4882,6 +4983,18 @@ sr_sensors_refresh(void *arg)
default:
sv->sv_sensor.value = 0; /* unknown */
sv->sv_sensor.status = SENSOR_S_UNKNOWN;
+ }
+ /* shall we enhance discipline API and add
+ sensor refresh function? */
+ if (sd->sd_type == SR_MD_RAID1_CHKSUM
+ && sd->mds.mdd_raid1.sr1_use_chksum) {
+ /* refreshing chksum errors sensors */
+ cl = &sv->sv_chunk_list;
+ SLIST_FOREACH(chunk, cl, src_link)
+ if (chunk->src_errs > 0 && chunk->src_sensor_attached == 1)
{
+ chunk->src_sensor.value = chunk->src_errs;
+ chunk->src_sensor.status = SENSOR_S_WARN;
+ }
}
}
}
Index: sys/dev/softraid_raid1.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid_raid1.c,v
retrieving revision 1.63
diff -u -p -u -r1.63 softraid_raid1.c
--- sys/dev/softraid_raid1.c 21 Jul 2015 03:30:51 -0000 1.63
+++ sys/dev/softraid_raid1.c 13 Sep 2015 20:40:30 -0000
@@ -41,6 +41,8 @@
#include <dev/softraidvar.h>
+#include <lib/libz/zlib.h>
+
/* RAID 1 functions. */
int sr_raid1_create(struct sr_discipline *, struct bioc_createraid *,
int, int64_t);
@@ -48,15 +50,52 @@ int sr_raid1_assemble(struct sr_discipli
int, void *);
int sr_raid1_init(struct sr_discipline *sd);
int sr_raid1_rw(struct sr_workunit *);
+int sr_raid1_openings(struct sr_discipline *);
int sr_raid1_wu_done(struct sr_workunit *);
void sr_raid1_set_chunk_state(struct sr_discipline *, int, int);
void sr_raid1_set_vol_state(struct sr_discipline *);
+int sr_raid1_wu_collision_detection(struct sr_workunit *,
+ struct sr_workunit *);
+
+/* internal functions */
+int sr_raid1_addio(struct sr_workunit *, int, daddr_t, daddr_t, void *,
+ int, int, void *);
+void sr_raid1_intr(struct buf *);
+int sr_raid1_verify_chksum(void*, int, daddr_t, daddr_t, void*);
+uLong sr_raid1_update_chksum(void*, int, daddr_t, daddr_t, void*);
+int sr_raid1_sensor_create(struct sr_discipline *, int);
+daddr_t sr_raid1_chksum_blk_start(struct sr_workunit *);
+daddr_t sr_raid1_chksum_blk_end(struct sr_workunit *);
+size_t sr_raid1_chksum_data_len(struct sr_workunit *);
+void sr_raid1_attempt_to_heal(struct sr_workunit *,
+ struct sr_raid1_errrec *);
+int sr_raid1_next_chunk_to_try(struct sr_workunit *, int);
+
+
+#define CHKSUM_SIZE 8
+#define CHKSUM_IN_BLOCK (DEV_BSIZE / 8)
+
+
+struct sr_raid1c_opaque {
+ /* 0 == read, 1 == write */
+ int write;
+ void *data;
+ int len;
+ daddr_t blk_start;
+ daddr_t blk_end;
+ void *chksum_data;
+};
+
/* Discipline initialisation. */
void
sr_raid1_discipline_init(struct sr_discipline *sd)
{
/* Fill out discipline members. */
+ /*
+ * For now we assume a run without checksums; if this is not true
+ * we will correct the values in the _create or _assemble functions.
+ */
sd->sd_type = SR_MD_RAID1;
strlcpy(sd->sd_name, "RAID 1", sizeof(sd->sd_name));
sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
@@ -70,20 +109,49 @@ sr_raid1_discipline_init(struct sr_disci
sd->sd_scsi_wu_done = sr_raid1_wu_done;
sd->sd_set_chunk_state = sr_raid1_set_chunk_state;
sd->sd_set_vol_state = sr_raid1_set_vol_state;
+ sd->sd_scsi_intr = sr_raid1_intr;
}
int
sr_raid1_create(struct sr_discipline *sd, struct bioc_createraid *bc,
int no_chunk, int64_t coerced_size)
{
+ int ch;
if (no_chunk < 2) {
sr_error(sd->sd_sc, "%s requires two or more chunks",
sd->sd_name);
return EINVAL;
}
-
- sd->sd_meta->ssdi.ssd_size = coerced_size;
-
+ if (bc->bc_flags & BIOC_SCCHKSUM) {
+ int64_t chksum_area_size = coerced_size * CHKSUM_SIZE
+ / DEV_BSIZE;
+ if (((coerced_size * CHKSUM_SIZE) % DEV_BSIZE) != 0) {
+ chksum_area_size++;
+ }
+ DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld,"
+ " data size: %lld, chksum area size: %lld\n, ",
+ coerced_size, coerced_size - chksum_area_size,
+ chksum_area_size);
+ sd->sd_meta->ssdi.ssd_size = coerced_size - chksum_area_size;
+ sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 1;
+ TAILQ_INIT(&sd->mds.mdd_raid1.sr1_errors);
+ /* fixing discipline values for chksum support */
+ sd->sd_type = SR_MD_RAID1_CHKSUM;
+ strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+ sd->sd_openings = sr_raid1_openings;
+ sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+ for (ch = 0; ch < no_chunk; ch++) {
+ if (sr_raid1_sensor_create(sd, ch)) {
+ DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+ " created for chunk: %d\n", ch);
+ }
+ }
+ }
+ else {
+ sd->sd_meta->ssdi.ssd_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 0;
+ }
return sr_raid1_init(sd);
}
@@ -91,17 +159,56 @@ int
sr_raid1_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
int no_chunk, void *data)
{
+ int ch;
+ if (bc->bc_flags & BIOC_SCCHKSUM) {
+ int64_t coerced_size = sd->sd_vol.sv_chunks[0]
+ ->src_meta.scmi.scm_coerced_size;
+ DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld\n, ",
+ coerced_size);
+ sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 1;
+ TAILQ_INIT(&sd->mds.mdd_raid1.sr1_errors);
+ /* fixing discipline values for chksum support */
+ sd->sd_type = SR_MD_RAID1_CHKSUM;
+ strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+ sd->sd_openings = sr_raid1_openings;
+ sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+ for (ch = 0; ch < no_chunk; ch++) {
+ if (sr_raid1_sensor_create(sd, ch)) {
+ DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+ " created for chunk: %d\n", ch);
+ }
+ }
+ }
+ else {
+ sd->mds.mdd_raid1.sr1_use_chksum = 0;
+ }
return sr_raid1_init(sd);
}
int
sr_raid1_init(struct sr_discipline *sd)
{
- sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
-
+ if (sd->mds.mdd_raid1.sr1_use_chksum) {
+ /*
+ * In case of chksum support we use two ccbs per chunk
+ * for read and write.
+ */
+ sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no * 2;
+ }
+ else {
+ sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
+ }
return 0;
}
+int
+sr_raid1_openings(struct sr_discipline *sd)
+{
+ /* Max two work units per I/O (in case of write) */
+ return sd->sd_max_wu >> 1;
+}
+
void
sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
@@ -324,29 +431,49 @@ die:
int
sr_raid1_rw(struct sr_workunit *wu)
{
+ struct sr_workunit *wu_r_chksum = NULL;
struct sr_discipline *sd = wu->swu_dis;
struct scsi_xfer *xs = wu->swu_xs;
struct sr_ccb *ccb;
struct sr_chunk *scp;
int ios, chunk, i, rt;
daddr_t blkno;
+ int use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
/* blkno and scsi error will be handled by sr_validate_io */
if (sr_validate_io(wu, &blkno, "sr_raid1_rw"))
goto bad;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw: blkno: %lld, len: %d, %s,"
+ " swu_block_start %lld, swu_block_end: %lld\n", blkno, xs->datalen,
+ (xs->flags & SCSI_DATA_IN) ? "READ" : "WRITE", wu->swu_blk_start,
+ wu->swu_blk_end);
+
if (xs->flags & SCSI_DATA_IN)
ios = 1;
else
ios = sd->sd_meta->ssdi.ssd_chunk_no;
+ struct sr_raid1_errrec *error = NULL;
+ struct sr_raid1_errrec *terr;
+ TAILQ_FOREACH(terr, &sd->mds.mdd_raid1.sr1_errors, sr1_err_link) {
+ if (terr->wu == wu) {
+ error = terr;
+ break;
+ }
+ }
for (i = 0; i < ios; i++) {
if (xs->flags & SCSI_DATA_IN) {
rt = 0;
ragain:
/* interleave reads */
- chunk = sd->mds.mdd_raid1.sr1_counter++ %
- sd->sd_meta->ssdi.ssd_chunk_no;
+ if (error) {
+ chunk = error->next_chunk;
+ }
+ else {
+ chunk = sd->mds.mdd_raid1.sr1_counter++ %
+ sd->sd_meta->ssdi.ssd_chunk_no;
+ }
scp = sd->sd_vol.sv_chunks[chunk];
switch (scp->src_meta.scm_status) {
case BIOC_SDONLINE:
@@ -368,8 +495,8 @@ ragain:
}
} else {
/* writes go on all working disks */
- chunk = i;
- scp = sd->sd_vol.sv_chunks[chunk];
+ chunk = i;
+ scp = sd->sd_vol.sv_chunks[chunk];
switch (scp->src_meta.scm_status) {
case BIOC_SDONLINE:
case BIOC_SDSCRUB:
@@ -384,25 +511,163 @@ ragain:
goto bad;
}
}
-
- ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
- xs->flags, 0);
- if (!ccb) {
- /* should never happen but handle more gracefully */
- printf("%s: %s: too many ccbs queued\n",
- DEVNAME(sd->sd_sc),
- sd->sd_meta->ssd_devname);
- goto bad;
+ if (use_chksum) {
+ daddr_t chksum_blk = sr_raid1_chksum_blk_start(wu);
+ void* chksum_data = NULL;
+ size_t chksum_data_len = sr_raid1_chksum_data_len(wu);
+ if (xs->flags & SCSI_DATA_IN) {
+ /* read data */
+ if (sr_raid1_addio(wu, chunk, blkno,
+ xs->datalen, xs->data, xs->flags, 0, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ chksum_data = sr_block_get(sd, DEV_BSIZE
+ * chksum_data_len);
+ if (!chksum_data) {
+ printf("%s: %s: can't allocate chksum"
+ " data block"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ struct sr_raid1c_opaque *chksum_info = malloc(
+ sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (!chksum_info) {
+ panic("%s: %s: can't allocate"
+ " chksum_info structure\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ }
+ chksum_info->write = 0;
+ chksum_info->blk_start = wu->swu_blk_start;
+ chksum_info->blk_end = wu->swu_blk_end;
+ chksum_info->data = xs->data;
+ chksum_info->len = xs->datalen;
+ chksum_info->chksum_data = chksum_data;
+ /* read chksum */
+ if (sr_raid1_addio(wu, chunk, chksum_blk,
+ DEV_BSIZE * chksum_data_len, chksum_data,
+ SCSI_DATA_IN, 0, chksum_info)) {
+ sr_block_put(sd, chksum_data,
+ DEV_BSIZE * chksum_data_len);
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ }
+ else {
+ /* write with chksum */
+ struct sr_raid1c_opaque *chksum_info = malloc(
+ sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (!chksum_info) {
+ panic("%s: %s: can't allocate"
+ " chksum_info structure\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ }
+ if (!wu_r_chksum) {
+ if ((wu_r_chksum = sr_scsi_wu_get(sd,
+ SCSI_NOSLEEP)) == NULL) {
+ printf("%s: %s failed to get"
+ " read work unit\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ wu_r_chksum->swu_state
+ = SR_WU_INPROGRESS;
+ wu_r_chksum->swu_flags
+ |= SR_WUF_DISCIPLINE;
+ wu_r_chksum->swu_blk_start
+ = sr_raid1_chksum_blk_start(wu);
+ wu_r_chksum->swu_blk_end
+ = sr_raid1_chksum_blk_end(wu);
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw:"
+ " wu_r_chksum: %p\n", wu_r_chksum);
+ }
+ chksum_data = sr_block_get(sd,
+ DEV_BSIZE * chksum_data_len);
+ if (!chksum_data) {
+ printf("%s: %s: can't allocate chksum"
+ " data block"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ chksum_info->write = 1;
+ chksum_info->blk_start = wu->swu_blk_start;
+ chksum_info->blk_end = wu->swu_blk_end;
+ chksum_info->data = xs->data;
+ chksum_info->len = xs->datalen;
+ chksum_info->chksum_data = chksum_data;
+ DNPRINTF(SR_D_CHKSUM, "rw: chksum_info: %p\n",
+ chksum_info);
+ /* read chksum */
+ if (sr_raid1_addio(wu_r_chksum, chunk,
+ chksum_blk, DEV_BSIZE * chksum_data_len,
+ chksum_data, SCSI_DATA_IN, 0,
+ chksum_info)) {
+ sr_block_put(sd, chksum_data,
+ DEV_BSIZE * chksum_data_len);
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ /* write data */
+ if (sr_raid1_addio(wu, chunk, blkno,
+ xs->datalen, xs->data, xs->flags, 0, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ /* write chksum */
+ if (sr_raid1_addio(wu, chunk, chksum_blk,
+ DEV_BSIZE * chksum_data_len, chksum_data,
+ xs->flags, SR_CCBF_FREEBUF, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ }
+ }
+ else {
+ /* RAID 1 without chksum support */
+ ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
+ xs->flags, 0);
+ if (!ccb) {
+ /* should never happen but handle more gracefully */
+ printf("%s: %s: too many ccbs queued\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+
+ sr_wu_enqueue_ccb(wu, ccb);
}
- sr_wu_enqueue_ccb(wu, ccb);
}
-
+ if (wu_r_chksum) {
+ /* collide write request with chksum reads */
+ wu_r_chksum->swu_blk_start = wu->swu_blk_start;
+ wu_r_chksum->swu_blk_end = wu->swu_blk_end;
+ sr_schedule_wu(wu_r_chksum);
+ }
sr_schedule_wu(wu);
return (0);
bad:
/* wu is unwound by sr_wu_put */
+ if (wu_r_chksum)
+ sr_scsi_wu_put(sd, wu_r_chksum);
return (1);
}
@@ -411,9 +676,149 @@ sr_raid1_wu_done(struct sr_workunit *wu)
{
struct sr_discipline *sd = wu->swu_dis;
struct scsi_xfer *xs = wu->swu_xs;
+ struct sr_ccb *ccb = NULL;
+ struct sr_raid1_errrec *error = NULL;
+ struct sr_raid1_errrec *terr;
+
+ if (wu->swu_flags & SR_WUF_HEALING) {
+ /* this is healing wu */
+ ccb = TAILQ_FIRST(&wu->swu_ccb);
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done (healing):"
+ " %p, blk_s: %lld, blk_e: %lld\n", wu, wu->swu_blk_start,
+ wu->swu_blk_end);
+ printf("%s: chunk: %d: healing of %lld-%lld"
+ " block(s) done.\n",
+ sd->sd_meta->ssd_devname, ccb->ccb_target,
+ wu->swu_blk_start, wu->swu_blk_end);
+ return SR_WU_OK;
+ }
+ /* XXX - we have no way of propagating errors... */
+ if (wu->swu_flags & SR_WUF_DISCIPLINE) {
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done (read chksum):"
+ " %p, blk_s: %lld, blk_e: %lld\n", wu, wu->swu_blk_start,
+ wu->swu_blk_end);
+ /*
+ * This is read chksum wu for a data write wu, we need to
+ * free ccb->ccb_opaque which is checksum_info here since
+ * ccb_buf with chksum data is passed directly to write
+ * and we do not need chksum_info anymore.
+ */
+ TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+ if (ccb->ccb_opaque) {
+ struct sr_raid1c_opaque *chksum_info
+ = ccb->ccb_opaque;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done: free"
+ " chksum_info: %p\n", chksum_info);
+ free(chksum_info, M_DEVBUF, 0);
+ ccb->ccb_opaque = NULL;
+ }
+ }
+ return SR_WU_OK;
+ }
+
+ if (wu->swu_ios_complete != wu->swu_io_count)
+ return SR_WU_INPROGRESS;
- /* If at least one I/O succeeded, we are okay. */
- if (wu->swu_ios_succeeded > 0) {
+
+ /* search for error recovery item assigned to this wu */
+ TAILQ_FOREACH(terr, &sd->mds.mdd_raid1.sr1_errors, sr1_err_link) {
+ if (terr->wu == wu) {
+ error = terr;
+ break;
+ }
+ }
+
+ if (xs->flags & SCSI_DATA_IN) {
+ /* read: verify chksum */
+ TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+ if (ccb->ccb_opaque != NULL) {
+ /* ccb is chksum ccb */
+ struct sr_raid1c_opaque *chksum_info
+ = ccb->ccb_opaque;
+ if (sr_raid1_verify_chksum(chksum_info->data,
+ chksum_info->len, chksum_info->blk_start,
+ chksum_info->blk_end,
+ ccb->ccb_buf.b_data)) {
+ wu->swu_state = SR_WU_CHKSUMFAILED;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done"
+ ": verify failed on area %lld-%lld"
+ " with wu state: %d and flags: %d"
+ " on chunk: %d\n",
+ wu->swu_blk_start, wu->swu_blk_end,
+ wu->swu_state, wu->swu_flags,
+ ccb->ccb_target);
+ /* update chunk error value */
+ if (ccb->ccb_target != -1) {
+ sd->sd_vol.sv_chunks
+ [ccb->ccb_target]
+ ->src_errs++;
+ }
+ }
+ /* free chksum ccb buf */
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done:"
+ " sr_block_put:"
+ " %p, size: %ld\n", ccb->ccb_buf.b_data,
+ ccb->ccb_buf.b_bcount);
+ sr_block_put(sd, ccb->ccb_buf.b_data,
+ ccb->ccb_buf.b_bcount);
+ ccb->ccb_buf.b_data = NULL;
+ free(chksum_info, M_DEVBUF, 0);
+ ccb->ccb_opaque = NULL;
+ }
+ }
+ }
+
+ if (wu->swu_state == SR_WU_CHKSUMFAILED) {
+ ccb = TAILQ_FIRST(&wu->swu_ccb);
+ printf("%s: chunk: %d: verify chksum failed on %lld-%lld"
+ " block(s)\n",
+ sd->sd_meta->ssd_devname, ccb->ccb_target,
+ wu->swu_blk_start, wu->swu_blk_end);
+ if (error) {
+ error->next_chunk = sr_raid1_next_chunk_to_try
+ (wu, ccb->ccb_target);
+ if (error->next_chunk == error->err_chunk) {
+ /* all chunks failed */
+ printf("%s: all chunk fail on reading"
+ " %lld-%lld block(s).\n",
+ sd->sd_meta->ssd_devname, wu->swu_blk_start,
+ wu->swu_blk_end);
+ TAILQ_REMOVE(&sd->mds.mdd_raid1.sr1_errors,
+ error, sr1_err_link);
+ free(error, M_DEVBUF, 0);
+ wu->swu_state = SR_WU_FAILED;
+ xs->error = XS_DRIVER_STUFFUP;
+ return SR_WU_FAILED;
+ }
+ }
+ else {
+ error = malloc(sizeof(struct sr_raid1_errrec),
+ M_DEVBUF, M_ZERO | M_NOWAIT);
+ if (!error) {
+ /* can't recover, signal error to upper layer */
+ wu->swu_state = SR_WU_FAILED;
+ xs->error = XS_DRIVER_STUFFUP;
+ return SR_WU_FAILED;
+ }
+ error->wu = wu;
+ error->err_chunk = ccb->ccb_target;
+ error->next_chunk = sr_raid1_next_chunk_to_try
+ (wu, ccb->ccb_target);
+ TAILQ_INSERT_TAIL(&sd->mds.mdd_raid1.sr1_errors, error,
+ sr1_err_link);
+ }
+ }
+ /*
+ * If at least one I/O succeeded, we are okay
+ * if there is no chksum failure.
+ */
+ if (wu->swu_ios_succeeded > 0 && wu->swu_state != SR_WU_CHKSUMFAILED) {
+ if (error) {
+ sr_raid1_attempt_to_heal(wu, error);
+
+ TAILQ_REMOVE(&sd->mds.mdd_raid1.sr1_errors, error,
sr1_err_link);
+ free(error, M_DEVBUF, 0);
+ }
xs->error = XS_NOERROR;
return SR_WU_OK;
}
@@ -438,4 +843,373 @@ sr_raid1_wu_done(struct sr_workunit *wu)
xs->error = XS_DRIVER_STUFFUP;
return SR_WU_FAILED;
+}
+
+int
+sr_raid1_wu_collision_detection(struct sr_workunit *wu1,
+    struct sr_workunit *wu2)
+{
+	daddr_t cs1, ce1, cs2, ce2;
+
+	if (wu1 == NULL || wu2 == NULL)
+		return 0;
+
+	cs1 = sr_raid1_chksum_blk_start(wu1);
+	ce1 = sr_raid1_chksum_blk_end(wu1);
+	cs2 = sr_raid1_chksum_blk_start(wu2);
+	ce2 = sr_raid1_chksum_blk_end(wu2);
+
+	/*
+	 * The two work units are free of collision only when both
+	 * their data block ranges and their checksum block ranges
+	 * are fully disjoint.
+	 */
+	if ((wu1->swu_blk_end < wu2->swu_blk_start ||
+	    wu2->swu_blk_end < wu1->swu_blk_start) &&
+	    (ce1 < cs2 || ce2 < cs1))
+		return 0;
+
+	DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_collision_detection: collision"
+	    " found! wu1: %p, blk_s: %lld, blk_e: %lld,"
+	    " chksum_blk_s: %lld, chksum_blk_e: %lld, wu2: %p,"
+	    " blk_s: %lld, blk_e: %lld, chksum_blk_s: %lld, chksum_blk_e:"
+	    " %lld\n", wu1, wu1->swu_blk_start, wu1->swu_blk_end,
+	    cs1, ce1, wu2, wu2->swu_blk_start,
+	    wu2->swu_blk_end, cs2, ce2);
+	return 1;
+}
+
+int
+sr_raid1_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
+    daddr_t len, void *data, int xsflags, int ccbflags, void *chksumbuf)
+{
+	struct sr_discipline	*sd = wu->swu_dis;
+	struct sr_ccb		*ccb;
+
+	DNPRINTF(SR_D_CHKSUM, "sr_raid1_addio: %s chunk %d block %lld "
+	    "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
+	    chunk, (long long)blkno, (long long)len,
+	    chksumbuf ? "CHKSUM" : "-");
+
+	/* No caller buffer: allocate a temporary one the ccb will own. */
+	if (data == NULL) {
+		if ((data = sr_block_get(sd, len)) == NULL)
+			return (-1);
+		ccbflags |= SR_CCBF_FREEBUF;
+	}
+
+	if ((ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags,
+	    ccbflags)) == NULL) {
+		/* Give the temporary buffer back; nothing was queued. */
+		if (ccbflags & SR_CCBF_FREEBUF)
+			sr_block_put(sd, data, len);
+		return (-1);
+	}
+
+	/* Stash the checksum descriptor for the completion handler. */
+	ccb->ccb_opaque = chksumbuf;
+	sr_wu_enqueue_ccb(wu, ccb);
+
+	return (0);
+}
+
+/*
+ * Per-ccb biodone handler.  Finishes the ccb, optionally refreshes the
+ * in-memory checksum block, releases temporary buffers and completes
+ * the owning work unit.  Runs at interrupt time; the ccb/wu state is
+ * manipulated under splbio().
+ */
+void
+sr_raid1_intr(struct buf *bp)
+{
+	struct sr_ccb *ccb = (struct sr_ccb *)bp;
+	struct sr_workunit *wu = ccb->ccb_wu;
+	struct sr_discipline *sd = wu->swu_dis;
+	int s;
+	int use_chksum;
+
+	use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
+
+	DNPRINTF(SR_D_INTR, "%s: sr_raid1_intr bp %p xs %p\n",
+	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);
+
+	s = splbio();
+	sr_ccb_done(ccb);
+
+	/*
+	 * ccb_opaque carries a struct sr_raid1c_opaque when this ccb
+	 * read a checksum block.  When the descriptor is flagged for
+	 * write (chksum_info->write == 1), the checksum block just read
+	 * into ccb_buf.b_data is updated with CRCs of the data in
+	 * chksum_info->data -- presumably so a dependent write can push
+	 * the refreshed checksums out; the write path that sets
+	 * write == 1 is not visible here, confirm against it.
+	 */
+	if (use_chksum && ccb->ccb_state == SR_CCB_OK
+	    && ccb->ccb_opaque) {
+		struct sr_raid1c_opaque *chksum_info = ccb->ccb_opaque;
+		if (chksum_info->write == 1) {
+			/* let's update read chksum for provided data */
+			sr_raid1_update_chksum(chksum_info->data,
+			    chksum_info->len, chksum_info->blk_start,
+			    chksum_info->blk_end, ccb->ccb_buf.b_data);
+		}
+	}
+	/* Free allocated data buffer (ccb owned it via SR_CCBF_FREEBUF). */
+	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
+		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
+		ccb->ccb_buf.b_data = NULL;
+	}
+	sr_wu_done(wu);
+	splx(s);
+}
+
+/*
+ * Verify per-sector CRC32 checksums of `data' against the saved
+ * checksums in `chksum_buf'.  blk_start/blk_end select the data block
+ * range; the first checksum used is blk_start's slot within its
+ * checksum block.  `len' is currently unused (the range is fully
+ * determined by blk_start/blk_end).
+ * Returns 0 when every block matches, -1 on the first mismatch.
+ */
+int
+sr_raid1_verify_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+	int32_t chksum_n = blk_start % CHKSUM_IN_BLOCK;
+	int32_t chksum_count = blk_end - blk_start + 1;
+	uLong *chksum = chksum_buf;
+	Bytef *buf = data;
+	int32_t i;
+
+	for (i = 0; i < chksum_count; i++) {
+		uLong crc = crc32(0L, Z_NULL, 0);
+		crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+		if (crc != chksum[chksum_n]) {
+			/* %lu: uLong is unsigned long, %ld was mismatched. */
+			DNPRINTF(SR_D_CHKSUM, "verify failed on comparison"
+			    " block chksum(%lu) and saved chksum(%lu)[%d],"
+			    " called for start: %lld, end: %lld, n: %d, failed"
+			    " block: %lld\n", crc, chksum[chksum_n], chksum_n,
+			    blk_start, blk_end, chksum_n, blk_start + i);
+			return (-1);
+		}
+		chksum_n++;
+	}
+	return 0;
+}
+
+/*
+ * Recompute per-sector CRC32 checksums of `data' and store them into
+ * `chksum_buf', starting at blk_start's slot within its checksum
+ * block.  `len' is currently unused (the range is fully determined by
+ * blk_start/blk_end).  Always returns 0.
+ */
+uLong
+sr_raid1_update_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+	int32_t chksum_n = blk_start % CHKSUM_IN_BLOCK;
+	int32_t chksum_count = blk_end - blk_start + 1;
+	uLong *chksum = chksum_buf;
+	Bytef *buf = data;
+	int32_t i;
+
+	DNPRINTF(SR_D_CHKSUM, "update chksum: start: %lld, end: %lld, n: %d\n",
+	    blk_start, blk_end, chksum_n);
+	DNPRINTF(SR_D_CHKSUM, "blocks chksumed: ");
+	for (i = 0; i < chksum_count; i++) {
+		uLong crc = crc32(0L, Z_NULL, 0);
+		crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+		chksum[chksum_n] = crc;
+		/* %lu: uLong is unsigned long, %ld was mismatched. */
+		DNPRINTF(SR_D_CHKSUM, "%lld->(%lu)[%d], ", (blk_start + i),
+		    crc, chksum_n);
+		chksum_n++;
+	}
+	DNPRINTF(SR_D_CHKSUM, "\n");
+	return 0;
+}
+
+/*
+ * Attach a per-chunk integer sensor exposing the checksum error
+ * counter of chunk `chno'.  Always returns 0.
+ */
+int
+sr_raid1_sensor_create(struct sr_discipline* sd, int chno)
+{
+	struct sr_chunk *ch = sd->sd_vol.sv_chunks[chno];
+
+	ch->src_errs = 0;
+	ch->src_sensor.type = SENSOR_INTEGER;
+	ch->src_sensor.status = SENSOR_S_OK;
+	strlcpy(ch->src_sensor.desc, ch->src_devname,
+	    sizeof(ch->src_sensor.desc));
+	sensor_attach(&sd->sd_sc->sc_sensordev, &ch->src_sensor);
+	ch->src_sensor_attached = 1;
+
+	return 0;
+}
+
+/*
+ * First checksum block covering the wu's data range.  The checksum
+ * area starts right past the data area (ssd_size); one checksum block
+ * covers CHKSUM_IN_BLOCK data blocks.
+ */
+daddr_t
+sr_raid1_chksum_blk_start(struct sr_workunit *wu)
+{
+	struct sr_discipline *sd = wu->swu_dis;
+
+	return sd->sd_meta->ssdi.ssd_size +
+	    wu->swu_blk_start / CHKSUM_IN_BLOCK;
+}
+
+/* Last checksum block covering the wu's data range (inclusive). */
+daddr_t
+sr_raid1_chksum_blk_end(struct sr_workunit *wu)
+{
+	daddr_t first, nblocks;
+
+	first = sr_raid1_chksum_blk_start(wu);
+	nblocks = sr_raid1_chksum_data_len(wu);
+
+	return first + nblocks - 1;
+}
+
+/*
+ * Number of DEV_BSIZE blocks of checksum data needed to cover the
+ * wu's data range.  The first checksum of the range sits at byte
+ * offset chksum_offset within the first checksum block.
+ */
+size_t
+sr_raid1_chksum_data_len(struct sr_workunit* wu)
+{
+	size_t chksum_len;
+	size_t chksum_offset;
+
+	chksum_len = (wu->swu_blk_end - wu->swu_blk_start + 1) * CHKSUM_SIZE;
+	chksum_offset = (wu->swu_blk_start % CHKSUM_IN_BLOCK) * CHKSUM_SIZE;
+
+	/*
+	 * Round up properly: the former "(len + off) / DEV_BSIZE + 1"
+	 * transferred a spurious extra block whenever len + off was an
+	 * exact multiple of DEV_BSIZE, which could run past the end of
+	 * the checksum area.
+	 */
+	return (chksum_len + chksum_offset + DEV_BSIZE - 1) / DEV_BSIZE;
+}
+
+/*
+ * Try to repair a bad block: schedule a read of the data and its
+ * checksums from the known good chunk (error->next_chunk) plus a
+ * dependent write of both back to the failed chunk (error->err_chunk).
+ * Healing is best effort; on any setup failure all partially acquired
+ * resources are released and the function simply returns.
+ *
+ * Fixes over the previous version:
+ *  - healing_wu_r/healing_wu_w start as NULL, so early "goto bad"
+ *    paths no longer test/dereference uninitialized pointers;
+ *  - the success path returns before the "bad:" cleanup, which used
+ *    to free buffers and put back work units that were already
+ *    scheduled (use-after-free on in-flight I/O);
+ *  - chksum_info is freed on error paths instead of leaking;
+ *  - a recoverable M_NOWAIT allocation failure no longer panics.
+ */
+void
+sr_raid1_attempt_to_heal(struct sr_workunit* wu, struct sr_raid1_errrec *error)
+{
+	struct sr_discipline *sd = wu->swu_dis;
+	struct sr_workunit *healing_wu_r = NULL;
+	struct sr_workunit *healing_wu_w = NULL;
+	struct sr_raid1c_opaque *chksum_info = NULL;
+	struct scsi_xfer *xs = wu->swu_xs;
+	char *databuf = NULL;
+	daddr_t chksum_blk = sr_raid1_chksum_blk_start(wu);
+	void *chksum_data = NULL;
+	size_t chksum_data_len = sr_raid1_chksum_data_len(wu);
+
+	DNPRINTF(SR_D_CHKSUM, "attempt to heal with wu %p, error %p\n", wu,
+	    error);
+	databuf = sr_block_get(sd, xs->datalen);
+	if (!databuf) {
+		printf("%s: %s failed to get"
+		    " healing data buffer.\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+	chksum_data = sr_block_get(sd, DEV_BSIZE * chksum_data_len);
+	if (!chksum_data) {
+		printf("%s: %s failed to get"
+		    " healing chksum buffer.\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	if ((healing_wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL) {
+		printf("%s: %s failed to get"
+		    " healing read work unit\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	healing_wu_r->swu_state = SR_WU_INPROGRESS;
+	healing_wu_r->swu_flags |= SR_WUF_DISCIPLINE;
+	healing_wu_r->swu_blk_start = wu->swu_blk_start;
+	healing_wu_r->swu_blk_end = wu->swu_blk_end;
+
+	if ((healing_wu_w = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL) {
+		printf("%s: %s failed to get"
+		    " healing write work unit\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	healing_wu_w->swu_state = SR_WU_INPROGRESS;
+	healing_wu_w->swu_flags |= SR_WUF_DISCIPLINE | SR_WUF_HEALING;
+	healing_wu_w->swu_blk_start = wu->swu_blk_start;
+	healing_wu_w->swu_blk_end = wu->swu_blk_end;
+
+	/* Read data from the good chunk. */
+	if (sr_raid1_addio(healing_wu_r, error->next_chunk, wu->swu_blk_start,
+	    xs->datalen, databuf, SCSI_DATA_IN, 0, 0)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	/* Read chksum from the good chunk. */
+	chksum_info = malloc(sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+	    M_ZERO | M_NOWAIT);
+	if (chksum_info == NULL) {
+		/* Best effort: skip healing instead of panicking. */
+		printf("%s: %s: can't allocate"
+		    " chksum_info structure\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+	chksum_info->write = 0;
+	chksum_info->blk_start = wu->swu_blk_start;
+	chksum_info->blk_end = wu->swu_blk_end;
+	chksum_info->data = databuf;
+	chksum_info->len = xs->datalen;
+	chksum_info->chksum_data = chksum_data;
+	if (sr_raid1_addio(healing_wu_r, error->next_chunk, chksum_blk,
+	    DEV_BSIZE * chksum_data_len, chksum_data,
+	    SCSI_DATA_IN, 0, chksum_info)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	/* Write data back to the failed chunk; ccb frees databuf. */
+	if (sr_raid1_addio(healing_wu_w, error->err_chunk, wu->swu_blk_start,
+	    xs->datalen, databuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, 0)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	/* Write chksum back to the failed chunk; ccb frees chksum_data. */
+	if (sr_raid1_addio(healing_wu_w, error->err_chunk,
+	    chksum_blk, DEV_BSIZE * chksum_data_len,
+	    chksum_data, SCSI_DATA_OUT, SR_CCBF_FREEBUF, 0)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	sr_schedule_wu(healing_wu_r);
+	sr_schedule_wu(healing_wu_w);
+
+	/* I/O is in flight; the work units now own all resources. */
+	return;
+
+bad:
+	/*
+	 * Nothing was scheduled: release everything acquired so far.
+	 * NOTE(review): if chksum_info was already attached to a ccb of
+	 * healing_wu_r, freeing it here assumes sr_scsi_wu_put() does
+	 * not touch ccb_opaque -- confirm against sr_ccb_put().
+	 */
+	if (chksum_info)
+		free(chksum_info, M_DEVBUF, sizeof(struct sr_raid1c_opaque));
+	if (databuf)
+		sr_block_put(sd, databuf, xs->datalen);
+	if (chksum_data)
+		sr_block_put(sd, chksum_data, DEV_BSIZE * chksum_data_len);
+	if (healing_wu_r)
+		sr_scsi_wu_put(sd, healing_wu_r);
+	if (healing_wu_w)
+		sr_scsi_wu_put(sd, healing_wu_w);
+}
+
+/*
+ * Return the next chunk after `chunk' that is readable (ONLINE or
+ * SCRUB), scanning round-robin.  If no other chunk is usable the
+ * original `chunk' is returned, which the caller detects by comparing
+ * against err_chunk (all chunks failed).
+ *
+ * Fix: the previous version only broke out of the switch, not the
+ * loop, so it returned the *last* scanned chunk instead of the next
+ * usable one, and returned an arbitrary chunk when none was usable.
+ */
+int
+sr_raid1_next_chunk_to_try(struct sr_workunit *wu, int chunk)
+{
+	struct sr_discipline *sd = wu->swu_dis;
+	struct sr_chunk *scp;
+	int tchunk;
+	int i;
+
+	for (i = 1; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
+		tchunk = (chunk + i) % sd->sd_meta->ssdi.ssd_chunk_no;
+		scp = sd->sd_vol.sv_chunks[tchunk];
+		switch (scp->src_meta.scm_status) {
+		case BIOC_SDONLINE:
+		case BIOC_SDSCRUB:
+			/* Found a readable chunk. */
+			return tchunk;
+
+		case BIOC_SDOFFLINE:
+		case BIOC_SDREBUILD:
+		case BIOC_SDHOTSPARE:
+			continue;
+
+		default:
+			/* volume offline */
+			printf("%s: is offline, cannot read, chunk %d, i %d\n",
+			    DEVNAME(sd->sd_sc), tchunk, i);
+		}
+	}
+	/* No usable alternative; signal failure to the caller. */
+	return chunk;
}
Index: sys/dev/softraidvar.h
===================================================================
RCS file: /cvs/src/sys/dev/softraidvar.h,v
retrieving revision 1.161
diff -u -p -u -r1.161 softraidvar.h
--- sys/dev/softraidvar.h 21 Jul 2015 03:30:51 -0000 1.161
+++ sys/dev/softraidvar.h 13 Sep 2015 20:40:30 -0000
@@ -307,7 +307,7 @@ SLIST_HEAD(sr_boot_volume_head, sr_boot_
#define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
-/* #define SR_DEBUG */
+#define SR_DEBUG
#ifdef SR_DEBUG
extern u_int32_t sr_debug;
#define DPRINTF(x...) do { if (sr_debug) printf(x); } while(0)
@@ -322,6 +322,7 @@ extern u_int32_t sr_debug;
#define SR_D_DIS 0x0080
#define SR_D_STATE 0x0100
#define SR_D_REBUILD 0x0200
+#define SR_D_CHKSUM 0x0400
#else
#define DPRINTF(x...)
#define DNPRINTF(n,x...)
@@ -378,6 +379,7 @@ struct sr_workunit {
#define SR_WU_RESTART 7
#define SR_WU_REQUEUE 8
#define SR_WU_CONSTRUCT 9
+#define SR_WU_CHKSUMFAILED 10
int swu_flags; /* additional hints */
#define SR_WUF_REBUILD (1<<0) /* rebuild io */
@@ -387,6 +389,7 @@ struct sr_workunit {
#define SR_WUF_WAKEUP (1<<4) /* Wakeup on I/O completion. */
#define SR_WUF_DISCIPLINE (1<<5) /* Discipline specific I/O. */
#define SR_WUF_FAKE (1<<6) /* Faked workunit. */
+#define SR_WUF_HEALING (1<<7) /* Workunit to heal bad block */
/* workunit io range */
daddr_t swu_blk_start;
@@ -423,9 +426,24 @@ struct sr_raid0 {
};
/* RAID 1 */
+struct sr_raid1_errrec {
+ struct sr_workunit *wu;
+ u_int32_t err_chunk;
+ u_int32_t next_chunk;
+ TAILQ_ENTRY(sr_raid1_errrec) sr1_err_link;
+};
+
+TAILQ_HEAD(sr1_error_list, sr_raid1_errrec);
+
#define SR_RAID1_NOWU 16
struct sr_raid1 {
u_int32_t sr1_counter;
+ u_int32_t sr1_use_chksum; /* are checksum in use? */
+
+ /* original coerced size in blocks */
+ int64_t sr1_coerced_size;
+ /* list of error recoveries */
+ struct sr1_error_list sr1_errors;
};
/* RAID 5 */
@@ -474,6 +492,10 @@ struct sr_chunk {
u_char src_duid[8]; /* Chunk disklabel UID. */
int64_t src_size; /* in blocks */
+ struct ksensor src_sensor; /* Chunk specific sensor */
+ int src_sensor_attached;
+ int src_errs; /* Errors counter value */
+
SLIST_ENTRY(sr_chunk) src_link;
};
@@ -503,6 +525,7 @@ struct sr_discipline {
/* SR_MD_RAID4 was 7. */
#define SR_MD_RAID6 8
#define SR_MD_CONCAT 9
+#define SR_MD_RAID1_CHKSUM 10
char sd_name[10]; /* human readable dis name */
u_int16_t sd_target; /* scsibus target discipline
uses */
@@ -512,6 +535,7 @@ struct sr_discipline {
#define SR_CAP_REBUILD 0x00000004 /* Supports rebuild. */
#define SR_CAP_NON_COERCED 0x00000008 /* Uses non-coerced size. */
#define SR_CAP_REDUNDANT 0x00000010 /* Redundant copies of data. */
+#define SR_CAP_CHKSUM 0x00000020 /* Check sums of data. */
union {
struct sr_raid0 mdd_raid0;
@@ -583,6 +607,9 @@ struct sr_discipline {
int (*sd_meta_opt_handler)(struct sr_discipline *,
struct sr_meta_opt_hdr *);
void (*sd_rebuild)(struct sr_discipline *);
+
+ int (*sd_wu_collision_detection)(
+ struct sr_workunit *, struct sr_workunit *);
/* SCSI emulation */
struct scsi_sense_data sd_scsi_sense;