Hello,
attached is my work in progress on checksumming support for softraid
RAID1. Currently it does:
- computation of checksums (crc32)
- verification of checksums
- fail-over to another chunk (restart wu) in case of checksum error
- properly handle errors happening on all chunks
- "self-healing" of bad sector
- supports rebuild
- signal bad checksum to console and to sensors
E.g.:
$ sysctl hw.sensors.softraid0
hw.sensors.softraid0.raw0=0 (sd0f), OK
hw.sensors.softraid0.raw1=0 (sd0g), OK
hw.sensors.softraid0.drive0=
online (sd1), OK
Next TODO items:
- fix openings, currently it suggests max 2 wus are used per I/O, but
this changed with healing where the max number is 3 wus for case of
failed read (read, healing read, healing write)
- I really do not like current state (either in this patch or in
OpenBSD current) of handling of dependent wus. I will probably add
something like:
sr_schedule_depwus(struct sr_workunit *wu, struct sr_workunit *depwu);
which will hopefully be clearer and will unify the few different ways
dependent wus are currently handled in the SR RAID code.
- fix RAID5/6 wus dependency handling (see above item)
With the above tasks done I hope the patch will be complete. Another
big TODO, but not for this patch (series) is to add proper scrub
support since RAID1C will benefit from it a lot.
But this is task really for the future as this patch is already quite
big and I'd like to have it merged first (if possible in one form or
another).
Note: checksums are computed on a per-sector basis and saved in an area
allocated at the end of the drive. Due to this design,
LBA collision detection in softraid.c was enhanced/fixed to support
this usage as well,
and currently it may not be compatible with RAID5/6 usage.
Any comments welcome!
Thanks!
Karel
PS: sorry for not inlining the patch, gmail/firefox has frozen 3 times
on it so I've given up this time and just attached that.
? .cvsignore
Index: sbin/bioctl/bioctl.8
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.8,v
retrieving revision 1.96
diff -u -p -u -r1.96 bioctl.8
--- sbin/bioctl/bioctl.8 29 May 2015 00:33:37 -0000 1.96
+++ sbin/bioctl/bioctl.8 13 Sep 2015 20:40:28 -0000
@@ -199,6 +199,11 @@ for example, force the creation of volum
with unclean data in the metadata areas.
.It Ar noauto
Do not automatically assemble this volume at boot time.
+.It Ar chksum
+Enforce usage of checksums on the device blocks.
+The checksum area is located at the end of the device data area;
+since it occupies some space, the usable device size becomes smaller.
+Exactly 8 bytes of checksum are required per device data block.
.El
.It Fl c Ar raidlevel
Create a
Index: sbin/bioctl/bioctl.c
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.c,v
retrieving revision 1.129
diff -u -p -u -r1.129 bioctl.c
--- sbin/bioctl/bioctl.c 18 Jul 2015 23:23:20 -0000 1.129
+++ sbin/bioctl/bioctl.c 13 Sep 2015 20:40:29 -0000
@@ -1053,6 +1053,9 @@ bio_createflags(char *lst)
case 'n':
flags |= BIOC_SCNOAUTOASSEMBLE;
break;
+ case 'c':
+ flags |= BIOC_SCCHKSUM;
+ break;
default:
strlcpy(fs, s, sz + 1);
errx(1, "invalid flag %s", fs);
Index: sys/dev/biovar.h
===================================================================
RCS file: /cvs/src/sys/dev/biovar.h,v
retrieving revision 1.44
diff -u -p -u -r1.44 biovar.h
--- sys/dev/biovar.h 29 May 2015 00:33:37 -0000 1.44
+++ sys/dev/biovar.h 13 Sep 2015 20:40:29 -0000
@@ -213,6 +213,7 @@ struct bioc_createraid {
#define BIOC_SCDEVT 0x02 /* dev_t array or string in dev_list */
#define BIOC_SCNOAUTOASSEMBLE 0x04 /* do not assemble during autoconf */
#define BIOC_SCBOOTABLE 0x08 /* device is bootable */
+#define BIOC_SCCHKSUM 0x10 /* device provides chksum capability */
u_int32_t bc_opaque_size;
u_int32_t bc_opaque_flags;
#define BIOC_SOINVALID 0x00 /* no opaque pointer */
Index: sys/dev/softraid.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid.c,v
retrieving revision 1.364
diff -u -p -u -r1.364 softraid.c
--- sys/dev/softraid.c 19 Aug 2015 19:05:24 -0000 1.364
+++ sys/dev/softraid.c 13 Sep 2015 20:40:30 -0000
@@ -71,6 +71,7 @@ uint32_t sr_debug = 0
/* | SR_D_DIS */
/* | SR_D_STATE */
/* | SR_D_REBUILD */
+ /* | SR_D_CHKSUM */
;
#endif
@@ -144,6 +145,8 @@ int sr_chunk_in_use(struct sr_softc *,
int sr_rw(struct sr_softc *, dev_t, char *, size_t,
daddr_t, long);
void sr_wu_done_callback(void *);
+int sr_wu_collision(struct sr_workunit *,
+ struct sr_workunit *);
/* don't include these on RAMDISK */
#ifndef SMALL_KERNEL
@@ -2264,6 +2267,9 @@ sr_wu_done_callback(void *xwu)
s = splbio();
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done: %p\n",
+ DEVNAME(sd->sd_sc), wu);
+
if (xs != NULL) {
if (wu->swu_ios_failed)
xs->error = XS_DRIVER_STUFFUP;
@@ -2286,11 +2292,54 @@ sr_wu_done_callback(void *xwu)
TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
if (wu->swu_collider) {
- if (wu->swu_ios_failed)
- sr_raid_recreate_wu(wu->swu_collider);
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider:
%p\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ if (wu->swu_ios_failed) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, recreate collider?: %p
WHY???\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ sr_raid_recreate_wu(wu->swu_collider);
+ }
+ /*
+ * We're searching for a wu which has the same collider
+ * as the current wu. If we find such a wu we can continue
+ * without starting the collider. If we do not find such a wu
+ * then we need to start the collider, as the current wu is
+ * the last wu the collider collides with.
+ */
+ int found = 0;
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider:"
+ " %p\n", DEVNAME(sd->sd_sc), wu->swu_collider);
+ TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
+ if (wu->swu_collider == wup->swu_collider) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+ " collider in wu: %p\n",
+ DEVNAME(sd->sd_sc), wup);
+ found++;
+ break;
+ }
+ }
+ TAILQ_FOREACH(wup, &sd->sd_wu_defq, swu_link) {
+ if (wu->swu_collider == wup->swu_collider) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+ " collider in def wu: %p\n",
+ DEVNAME(sd->sd_sc), wup);
+ found++;
+ break;
+ }
+ }
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, collider found: %d\n",
+ DEVNAME(sd->sd_sc), found);
- /* XXX Should the collider be failed if this xs failed? */
- sr_raid_startwu(wu->swu_collider);
+ if (found == 0) {
+ /* The current wu is the last wu colliding
+ with the collider. */
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, starting the
collider: %p\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ sr_raid_startwu(wu->swu_collider);
+ }
+ else {
+ wu->swu_collider = NULL;
+ }
}
/*
@@ -3967,6 +4016,7 @@ sr_discipline_init(struct sr_discipline
sd->sd_set_chunk_state = sr_set_chunk_state;
sd->sd_set_vol_state = sr_set_vol_state;
sd->sd_start_discipline = NULL;
+ sd->sd_wu_collision_detection = NULL;
task_set(&sd->sd_meta_save_task, sr_meta_save_callback, sd);
task_set(&sd->sd_hotspare_rebuild_task, sr_hotspare_rebuild_callback,
@@ -4181,11 +4231,30 @@ sr_raid_intr(struct buf *bp)
splx(s);
}
+int
+sr_wu_collision(struct sr_workunit *wu1, struct sr_workunit *wu2)
+{
+ struct sr_discipline *sd = wu1->swu_dis;
+
+ if (sd->sd_wu_collision_detection) {
+ return sd->sd_wu_collision_detection(wu1, wu2);
+ }
+ else if (wu1->swu_blk_end < wu2->swu_blk_start ||
+ wu2->swu_blk_end < wu1->swu_blk_start) {
+ return 0;
+ }
+ else {
+ return 1;
+ }
+}
+
void
sr_schedule_wu(struct sr_workunit *wu)
{
struct sr_discipline *sd = wu->swu_dis;
struct sr_workunit *wup;
+ struct sr_workunit *twup;
+
int s;
DNPRINTF(SR_D_WU, "sr_schedule_wu: schedule wu %p state %i "
@@ -4210,20 +4279,53 @@ sr_schedule_wu(struct sr_workunit *wu)
if (wu->swu_state != SR_WU_INPROGRESS)
panic("sr_schedule_wu: work unit not in progress (state %i)\n",
wu->swu_state);
+ /*
+ * Walk both the pending and deferred queues and find colliding wus.
+ * If we find a collision we set the colliding wu's collider to the
+ * current wu and push the current wu onto the deferred queue.
+ */
+ int colliding = 0;
- /* Walk queue backwards and fill in collider if we have one. */
- TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
- if (wu->swu_blk_end < wup->swu_blk_start ||
- wup->swu_blk_end < wu->swu_blk_start)
+ TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_pendq, swu_link, twup) {
+ if (!sr_wu_collision(wu, wup))
continue;
+ colliding = 1;
/* Defer work unit due to LBA collision. */
- DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p\n",
- wu);
- wu->swu_state = SR_WU_DEFERRED;
+ DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+ " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+ " %lld\n", wu, wup,
+ ((wup->swu_xs != NULL) ?
+ (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+ : "NULL"),
+ wup->swu_blk_start, wup->swu_blk_end);
+ while (wup->swu_collider)
+ wup = wup->swu_collider;
+
+ if (wup != wu)
+ wup->swu_collider = wu;
+ }
+ TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_defq, swu_link, twup) {
+ if (!sr_wu_collision(wu, wup))
+ continue;
+
+ colliding = 1;
+ DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+ " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+ " %lld\n", wu, wup,
+ ((wup->swu_xs != NULL) ?
+ (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+ : "NULL"),
+ wup->swu_blk_start, wup->swu_blk_end);
+
while (wup->swu_collider)
wup = wup->swu_collider;
- wup->swu_collider = wu;
+
+ if (wup != wu)
+ wup->swu_collider = wu;
+ }
+ if (colliding == 1) {
+ wu->swu_state = SR_WU_DEFERRED;
TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
sd->sd_wu_collisions++;
goto queued;
@@ -4651,7 +4753,7 @@ sr_rebuild(struct sr_discipline *sd)
struct sr_workunit *wu_r, *wu_w;
struct scsi_xfer xs_r, xs_w;
struct scsi_rw_16 *cr, *cw;
- int c, s, slept, percent = 0, old_percent = -1;
+ int c, slept, percent = 0, old_percent = -1;
u_int8_t *buf;
whole_blk = sd->sd_meta->ssdi.ssd_size / SR_REBUILD_IO_SIZE;
@@ -4713,7 +4815,7 @@ sr_rebuild(struct sr_discipline *sd)
cr->opcode = READ_16;
_lto4b(sz, cr->length);
_lto8b(lba, cr->addr);
- wu_r->swu_state = SR_WU_CONSTRUCT;
+ wu_r->swu_state = SR_WU_INPROGRESS;
wu_r->swu_flags |= SR_WUF_REBUILD;
wu_r->swu_xs = &xs_r;
if (sd->sd_scsi_rw(wu_r)) {
@@ -4734,31 +4836,20 @@ sr_rebuild(struct sr_discipline *sd)
cw->opcode = WRITE_16;
_lto4b(sz, cw->length);
_lto8b(lba, cw->addr);
- wu_w->swu_state = SR_WU_CONSTRUCT;
+ wu_w->swu_state = SR_WU_INPROGRESS;
wu_w->swu_flags |= SR_WUF_REBUILD | SR_WUF_WAKEUP;
wu_w->swu_xs = &xs_w;
+ /*
+ * collide with the read io so that we get automatically
+ * started when the read is done
+ */
+ wu_w->swu_blk_start = wu_r->swu_blk_start;
+ wu_w->swu_blk_end = wu_r->swu_blk_end;
if (sd->sd_scsi_rw(wu_w)) {
printf("%s: could not create write io\n",
DEVNAME(sc));
goto fail;
}
-
- /*
- * collide with the read io so that we get automatically
- * started when the read is done
- */
- wu_w->swu_state = SR_WU_DEFERRED;
- wu_r->swu_collider = wu_w;
- s = splbio();
- TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu_w, swu_link);
- splx(s);
-
- DNPRINTF(SR_D_REBUILD, "%s: %s rebuild scheduling wu_r %p\n",
- DEVNAME(sd->sd_sc), sd->sd_meta->ssd_devname, wu_r);
-
- wu_r->swu_state = SR_WU_INPROGRESS;
- sr_schedule_wu(wu_r);
-
/* wait for write completion */
slept = 0;
while ((wu_w->swu_flags & SR_WUF_REBUILDIOCOMP) == 0) {
@@ -4845,7 +4936,15 @@ void
sr_sensors_delete(struct sr_discipline *sd)
{
DNPRINTF(SR_D_STATE, "%s: sr_sensors_delete\n", DEVNAME(sd->sd_sc));
-
+ /* first run through chunk-specific sensors */
+ /* shall we enhance discipline API and add sensor delete function? */
+ int chdx;
+ for (chdx = 0; chdx < sd->sd_meta->ssdi.ssd_chunk_no; chdx++) {
+ if (sd->sd_vol.sv_chunks[chdx]->src_sensor_attached) {
+ sensor_detach(&sd->sd_sc->sc_sensordev,
+ &sd->sd_vol.sv_chunks[chdx]->src_sensor);
+ }
+ }
if (sd->sd_vol.sv_sensor_attached)
sensor_detach(&sd->sd_sc->sc_sensordev, &sd->sd_vol.sv_sensor);
}
@@ -4856,6 +4955,8 @@ sr_sensors_refresh(void *arg)
struct sr_softc *sc = arg;
struct sr_volume *sv;
struct sr_discipline *sd;
+ struct sr_chunk *chunk;
+ struct sr_chunk_head *cl;
DNPRINTF(SR_D_STATE, "%s: sr_sensors_refresh\n", DEVNAME(sc));
@@ -4882,6 +4983,18 @@ sr_sensors_refresh(void *arg)
default:
sv->sv_sensor.value = 0; /* unknown */
sv->sv_sensor.status = SENSOR_S_UNKNOWN;
+ }
+ /* shall we enhance discipline API and add
+ sensor refresh function? */
+ if (sd->sd_type == SR_MD_RAID1_CHKSUM
+ && sd->mds.mdd_raid1.sr1_use_chksum) {
+ /* refreshing chksum errors sensors */
+ cl = &sv->sv_chunk_list;
+ SLIST_FOREACH(chunk, cl, src_link)
+ if (chunk->src_errs > 0 && chunk->src_sensor_attached == 1)
{
+ chunk->src_sensor.value = chunk->src_errs;
+ chunk->src_sensor.status = SENSOR_S_WARN;
+ }
}
}
}
Index: sys/dev/softraid_raid1.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid_raid1.c,v
retrieving revision 1.63
diff -u -p -u -r1.63 softraid_raid1.c
--- sys/dev/softraid_raid1.c 21 Jul 2015 03:30:51 -0000 1.63
+++ sys/dev/softraid_raid1.c 13 Sep 2015 20:40:30 -0000
@@ -41,6 +41,8 @@
#include <dev/softraidvar.h>
+#include <lib/libz/zlib.h>
+
/* RAID 1 functions. */
int sr_raid1_create(struct sr_discipline *, struct bioc_createraid *,
int, int64_t);
@@ -48,15 +50,52 @@ int sr_raid1_assemble(struct sr_discipli
int, void *);
int sr_raid1_init(struct sr_discipline *sd);
int sr_raid1_rw(struct sr_workunit *);
+int sr_raid1_openings(struct sr_discipline *);
int sr_raid1_wu_done(struct sr_workunit *);
void sr_raid1_set_chunk_state(struct sr_discipline *, int, int);
void sr_raid1_set_vol_state(struct sr_discipline *);
+int sr_raid1_wu_collision_detection(struct sr_workunit *,
+ struct sr_workunit *);
+
+/* internal functions */
+int sr_raid1_addio(struct sr_workunit *, int, daddr_t, daddr_t, void *,
+ int, int, void *);
+void sr_raid1_intr(struct buf *);
+int sr_raid1_verify_chksum(void*, int, daddr_t, daddr_t, void*);
+uLong sr_raid1_update_chksum(void*, int, daddr_t, daddr_t, void*);
+int sr_raid1_sensor_create(struct sr_discipline *, int);
+daddr_t sr_raid1_chksum_blk_start(struct sr_workunit *);
+daddr_t sr_raid1_chksum_blk_end(struct sr_workunit *);
+size_t sr_raid1_chksum_data_len(struct sr_workunit *);
+void sr_raid1_attempt_to_heal(struct sr_workunit *,
+ struct sr_raid1_errrec *);
+int sr_raid1_next_chunk_to_try(struct sr_workunit *, int);
+
+
+#define CHKSUM_SIZE 8
+#define CHKSUM_IN_BLOCK (DEV_BSIZE / 8)
+
+
+struct sr_raid1c_opaque {
+ /* 0 == read, 1 == write */
+ int write;
+ void *data;
+ int len;
+ daddr_t blk_start;
+ daddr_t blk_end;
+ void *chksum_data;
+};
+
/* Discipline initialisation. */
void
sr_raid1_discipline_init(struct sr_discipline *sd)
{
/* Fill out discipline members. */
+ /*
+ * For now we assume a run without checksums; if this is not true
+ * we will correct the values in the _create or _assemble functions.
+ */
sd->sd_type = SR_MD_RAID1;
strlcpy(sd->sd_name, "RAID 1", sizeof(sd->sd_name));
sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
@@ -70,20 +109,49 @@ sr_raid1_discipline_init(struct sr_disci
sd->sd_scsi_wu_done = sr_raid1_wu_done;
sd->sd_set_chunk_state = sr_raid1_set_chunk_state;
sd->sd_set_vol_state = sr_raid1_set_vol_state;
+ sd->sd_scsi_intr = sr_raid1_intr;
}
int
sr_raid1_create(struct sr_discipline *sd, struct bioc_createraid *bc,
int no_chunk, int64_t coerced_size)
{
+ int ch;
if (no_chunk < 2) {
sr_error(sd->sd_sc, "%s requires two or more chunks",
sd->sd_name);
return EINVAL;
}
-
- sd->sd_meta->ssdi.ssd_size = coerced_size;
-
+ if (bc->bc_flags & BIOC_SCCHKSUM) {
+ int64_t chksum_area_size = coerced_size * CHKSUM_SIZE
+ / DEV_BSIZE;
+ if (((coerced_size * CHKSUM_SIZE) % DEV_BSIZE) != 0) {
+ chksum_area_size++;
+ }
+ DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld,"
+ " data size: %lld, chksum area size: %lld\n, ",
+ coerced_size, coerced_size - chksum_area_size,
+ chksum_area_size);
+ sd->sd_meta->ssdi.ssd_size = coerced_size - chksum_area_size;
+ sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 1;
+ TAILQ_INIT(&sd->mds.mdd_raid1.sr1_errors);
+ /* fixing discipline values for chksum support */
+ sd->sd_type = SR_MD_RAID1_CHKSUM;
+ strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+ sd->sd_openings = sr_raid1_openings;
+ sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+ for (ch = 0; ch < no_chunk; ch++) {
+ if (sr_raid1_sensor_create(sd, ch)) {
+ DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+ " created for chunk: %d\n", ch);
+ }
+ }
+ }
+ else {
+ sd->sd_meta->ssdi.ssd_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 0;
+ }
return sr_raid1_init(sd);
}
@@ -91,17 +159,56 @@ int
sr_raid1_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
int no_chunk, void *data)
{
+ int ch;
+ if (bc->bc_flags & BIOC_SCCHKSUM) {
+ int64_t coerced_size = sd->sd_vol.sv_chunks[0]
+ ->src_meta.scmi.scm_coerced_size;
+ DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld\n, ",
+ coerced_size);
+ sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 1;
+ TAILQ_INIT(&sd->mds.mdd_raid1.sr1_errors);
+ /* fixing discipline values for chksum support */
+ sd->sd_type = SR_MD_RAID1_CHKSUM;
+ strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+ sd->sd_openings = sr_raid1_openings;
+ sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+ for (ch = 0; ch < no_chunk; ch++) {
+ if (sr_raid1_sensor_create(sd, ch)) {
+ DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+ " created for chunk: %d\n", ch);
+ }
+ }
+ }
+ else {
+ sd->mds.mdd_raid1.sr1_use_chksum = 0;
+ }
return sr_raid1_init(sd);
}
int
sr_raid1_init(struct sr_discipline *sd)
{
- sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
-
+ if (sd->mds.mdd_raid1.sr1_use_chksum) {
+ /*
+ * In case of chksum support we use two ccbs per chunk
+ * for read and write.
+ */
+ sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no * 2;
+ }
+ else {
+ sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
+ }
return 0;
}
+int
+sr_raid1_openings(struct sr_discipline *sd)
+{
+ /* Max two work units per I/O (in case of write) */
+ return sd->sd_max_wu >> 1;
+}
+
void
sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
@@ -324,29 +431,49 @@ die:
int
sr_raid1_rw(struct sr_workunit *wu)
{
+ struct sr_workunit *wu_r_chksum = NULL;
struct sr_discipline *sd = wu->swu_dis;
struct scsi_xfer *xs = wu->swu_xs;
struct sr_ccb *ccb;
struct sr_chunk *scp;
int ios, chunk, i, rt;
daddr_t blkno;
+ int use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
/* blkno and scsi error will be handled by sr_validate_io */
if (sr_validate_io(wu, &blkno, "sr_raid1_rw"))
goto bad;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw: blkno: %lld, len: %d, %s,"
+ " swu_block_start %lld, swu_block_end: %lld\n", blkno, xs->datalen,
+ (xs->flags & SCSI_DATA_IN) ? "READ" : "WRITE", wu->swu_blk_start,
+ wu->swu_blk_end);
+
if (xs->flags & SCSI_DATA_IN)
ios = 1;
else
ios = sd->sd_meta->ssdi.ssd_chunk_no;
+ struct sr_raid1_errrec *error = NULL;
+ struct sr_raid1_errrec *terr;
+ TAILQ_FOREACH(terr, &sd->mds.mdd_raid1.sr1_errors, sr1_err_link) {
+ if (terr->wu == wu) {
+ error = terr;
+ break;
+ }
+ }
for (i = 0; i < ios; i++) {
if (xs->flags & SCSI_DATA_IN) {
rt = 0;
ragain:
/* interleave reads */
- chunk = sd->mds.mdd_raid1.sr1_counter++ %
- sd->sd_meta->ssdi.ssd_chunk_no;
+ if (error) {
+ chunk = error->next_chunk;
+ }
+ else {
+ chunk = sd->mds.mdd_raid1.sr1_counter++ %
+ sd->sd_meta->ssdi.ssd_chunk_no;
+ }
scp = sd->sd_vol.sv_chunks[chunk];
switch (scp->src_meta.scm_status) {
case BIOC_SDONLINE:
@@ -368,8 +495,8 @@ ragain:
}
} else {
/* writes go on all working disks */
- chunk = i;
- scp = sd->sd_vol.sv_chunks[chunk];
+ chunk = i;
+ scp = sd->sd_vol.sv_chunks[chunk];
switch (scp->src_meta.scm_status) {
case BIOC_SDONLINE:
case BIOC_SDSCRUB:
@@ -384,25 +511,163 @@ ragain:
goto bad;
}
}
-
- ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
- xs->flags, 0);
- if (!ccb) {
- /* should never happen but handle more gracefully */
- printf("%s: %s: too many ccbs queued\n",
- DEVNAME(sd->sd_sc),
- sd->sd_meta->ssd_devname);
- goto bad;
+ if (use_chksum) {
+ daddr_t chksum_blk = sr_raid1_chksum_blk_start(wu);
+ void* chksum_data = NULL;
+ size_t chksum_data_len = sr_raid1_chksum_data_len(wu);
+ if (xs->flags & SCSI_DATA_IN) {
+ /* read data */
+ if (sr_raid1_addio(wu, chunk, blkno,
+ xs->datalen, xs->data, xs->flags, 0, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ chksum_data = sr_block_get(sd, DEV_BSIZE
+ * chksum_data_len);
+ if (!chksum_data) {
+ printf("%s: %s: can't allocate chksum"
+ " data block"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ struct sr_raid1c_opaque *chksum_info = malloc(
+ sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (!chksum_info) {
+ panic("%s: %s: can't allocate"
+ " chksum_info structure\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ }
+ chksum_info->write = 0;
+ chksum_info->blk_start = wu->swu_blk_start;
+ chksum_info->blk_end = wu->swu_blk_end;
+ chksum_info->data = xs->data;
+ chksum_info->len = xs->datalen;
+ chksum_info->chksum_data = chksum_data;
+ /* read chksum */
+ if (sr_raid1_addio(wu, chunk, chksum_blk,
+ DEV_BSIZE * chksum_data_len, chksum_data,
+ SCSI_DATA_IN, 0, chksum_info)) {
+ sr_block_put(sd, chksum_data,
+ DEV_BSIZE * chksum_data_len);
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ }
+ else {
+ /* write with chksum */
+ struct sr_raid1c_opaque *chksum_info = malloc(
+ sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (!chksum_info) {
+ panic("%s: %s: can't allocate"
+ " chksum_info structure\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ }
+ if (!wu_r_chksum) {
+ if ((wu_r_chksum = sr_scsi_wu_get(sd,
+ SCSI_NOSLEEP)) == NULL) {
+ printf("%s: %s failed to get"
+ " read work unit\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ wu_r_chksum->swu_state
+ = SR_WU_INPROGRESS;
+ wu_r_chksum->swu_flags
+ |= SR_WUF_DISCIPLINE;
+ wu_r_chksum->swu_blk_start
+ = sr_raid1_chksum_blk_start(wu);
+ wu_r_chksum->swu_blk_end
+ = sr_raid1_chksum_blk_end(wu);
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw:"
+ " wu_r_chksum: %p\n", wu_r_chksum);
+ }
+ chksum_data = sr_block_get(sd,
+ DEV_BSIZE * chksum_data_len);
+ if (!chksum_data) {
+ printf("%s: %s: can't allocate chksum"
+ " data block"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ chksum_info->write = 1;
+ chksum_info->blk_start = wu->swu_blk_start;
+ chksum_info->blk_end = wu->swu_blk_end;
+ chksum_info->data = xs->data;
+ chksum_info->len = xs->datalen;
+ chksum_info->chksum_data = chksum_data;
+ DNPRINTF(SR_D_CHKSUM, "rw: chksum_info: %p\n",
+ chksum_info);
+ /* read chksum */
+ if (sr_raid1_addio(wu_r_chksum, chunk,
+ chksum_blk, DEV_BSIZE * chksum_data_len,
+ chksum_data, SCSI_DATA_IN, 0,
+ chksum_info)) {
+ sr_block_put(sd, chksum_data,
+ DEV_BSIZE * chksum_data_len);
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ /* write data */
+ if (sr_raid1_addio(wu, chunk, blkno,
+ xs->datalen, xs->data, xs->flags, 0, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ /* write chksum */
+ if (sr_raid1_addio(wu, chunk, chksum_blk,
+ DEV_BSIZE * chksum_data_len, chksum_data,
+ xs->flags, SR_CCBF_FREEBUF, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ }
+ }
+ else {
+ /* RAID 1 without chksum support */
+ ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
+ xs->flags, 0);
+ if (!ccb) {
+ /* should never happen but handle more gracefully */
+ printf("%s: %s: too many ccbs queued\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+
+ sr_wu_enqueue_ccb(wu, ccb);
}
- sr_wu_enqueue_ccb(wu, ccb);
}
-
+ if (wu_r_chksum) {
+ /* collide write request with chksum reads */
+ wu_r_chksum->swu_blk_start = wu->swu_blk_start;
+ wu_r_chksum->swu_blk_end = wu->swu_blk_end;
+ sr_schedule_wu(wu_r_chksum);
+ }
sr_schedule_wu(wu);
return (0);
bad:
/* wu is unwound by sr_wu_put */
+ if (wu_r_chksum)
+ sr_scsi_wu_put(sd, wu_r_chksum);
return (1);
}
@@ -411,9 +676,149 @@ sr_raid1_wu_done(struct sr_workunit *wu)
{
struct sr_discipline *sd = wu->swu_dis;
struct scsi_xfer *xs = wu->swu_xs;
+ struct sr_ccb *ccb = NULL;
+ struct sr_raid1_errrec *error = NULL;
+ struct sr_raid1_errrec *terr;
+
+ if (wu->swu_flags & SR_WUF_HEALING) {
+ /* this is healing wu */
+ ccb = TAILQ_FIRST(&wu->swu_ccb);
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done (healing):"
+ " %p, blk_s: %lld, blk_e: %lld\n", wu, wu->swu_blk_start,
+ wu->swu_blk_end);
+ printf("%s: chunk: %d: healing of %lld-%lld"
+ " block(s) done.\n",
+ sd->sd_meta->ssd_devname, ccb->ccb_target,
+ wu->swu_blk_start, wu->swu_blk_end);
+ return SR_WU_OK;
+ }
+ /* XXX - we have no way of propagating errors... */
+ if (wu->swu_flags & SR_WUF_DISCIPLINE) {
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done (read chksum):"
+ " %p, blk_s: %lld, blk_e: %lld\n", wu, wu->swu_blk_start,
+ wu->swu_blk_end);
+ /*
+ * This is read chksum wu for a data write wu, we need to
+ * free ccb->ccb_opaque which is checksum_info here since
+ * ccb_buf with chksum data is passed directly to write
+ * and we do not need chksum_info anymore.
+ */
+ TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+ if (ccb->ccb_opaque) {
+ struct sr_raid1c_opaque *chksum_info
+ = ccb->ccb_opaque;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done: free"
+ " chksum_info: %p\n", chksum_info);
+ free(chksum_info, M_DEVBUF, 0);
+ ccb->ccb_opaque = NULL;
+ }
+ }
+ return SR_WU_OK;
+ }
+
+ if (wu->swu_ios_complete != wu->swu_io_count)
+ return SR_WU_INPROGRESS;
- /* If at least one I/O succeeded, we are okay. */
- if (wu->swu_ios_succeeded > 0) {
+
+ /* search for error recovery item assigned to this wu */
+ TAILQ_FOREACH(terr, &sd->mds.mdd_raid1.sr1_errors, sr1_err_link) {
+ if (terr->wu == wu) {
+ error = terr;
+ break;
+ }
+ }
+
+ if (xs->flags & SCSI_DATA_IN) {
+ /* read: verify chksum */
+ TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+ if (ccb->ccb_opaque != NULL) {
+ /* ccb is chksum ccb */
+ struct sr_raid1c_opaque *chksum_info
+ = ccb->ccb_opaque;
+ if (sr_raid1_verify_chksum(chksum_info->data,
+ chksum_info->len, chksum_info->blk_start,
+ chksum_info->blk_end,
+ ccb->ccb_buf.b_data)) {
+ wu->swu_state = SR_WU_CHKSUMFAILED;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done"
+ ": verify failed on area %lld-%lld"
+ " with wu state: %d and flags: %d"
+ " on chunk: %d\n",
+ wu->swu_blk_start, wu->swu_blk_end,
+ wu->swu_state, wu->swu_flags,
+ ccb->ccb_target);
+ /* update chunk error value */
+ if (ccb->ccb_target != -1) {
+ sd->sd_vol.sv_chunks
+ [ccb->ccb_target]
+ ->src_errs++;
+ }
+ }
+ /* free chksum ccb buf */
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done:"
+ " sr_block_put:"
+ " %p, size: %ld\n", ccb->ccb_buf.b_data,
+ ccb->ccb_buf.b_bcount);
+ sr_block_put(sd, ccb->ccb_buf.b_data,
+ ccb->ccb_buf.b_bcount);
+ ccb->ccb_buf.b_data = NULL;
+ free(chksum_info, M_DEVBUF, 0);
+ ccb->ccb_opaque = NULL;
+ }
+ }
+ }
+
+ if (wu->swu_state == SR_WU_CHKSUMFAILED) {
+ ccb = TAILQ_FIRST(&wu->swu_ccb);
+ printf("%s: chunk: %d: verify chksum failed on %lld-%lld"
+ " block(s)\n",
+ sd->sd_meta->ssd_devname, ccb->ccb_target,
+ wu->swu_blk_start, wu->swu_blk_end);
+ if (error) {
+ error->next_chunk = sr_raid1_next_chunk_to_try
+ (wu, ccb->ccb_target);
+ if (error->next_chunk == error->err_chunk) {
+ /* all chunks failed */
+ printf("%s: all chunk fail on reading"
+ " %lld-%lld block(s).\n",
+ sd->sd_meta->ssd_devname, wu->swu_blk_start,
+ wu->swu_blk_end);
+ TAILQ_REMOVE(&sd->mds.mdd_raid1.sr1_errors,
+ error, sr1_err_link);
+ free(error, M_DEVBUF, 0);
+ wu->swu_state = SR_WU_FAILED;
+ xs->error = XS_DRIVER_STUFFUP;
+ return SR_WU_FAILED;
+ }
+ }
+ else {
+ error = malloc(sizeof(struct sr_raid1_errrec),
+ M_DEVBUF, M_ZERO | M_NOWAIT);
+ if (!error) {
+ /* can't recover, signal error to upper layer */
+ wu->swu_state = SR_WU_FAILED;
+ xs->error = XS_DRIVER_STUFFUP;
+ return SR_WU_FAILED;
+ }
+ error->wu = wu;
+ error->err_chunk = ccb->ccb_target;
+ error->next_chunk = sr_raid1_next_chunk_to_try
+ (wu, ccb->ccb_target);
+ TAILQ_INSERT_TAIL(&sd->mds.mdd_raid1.sr1_errors, error,
+ sr1_err_link);
+ }
+ }
+ /*
+ * If at least one I/O succeeded, we are okay
+ * if there is no chksum failure.
+ */
+ if (wu->swu_ios_succeeded > 0 && wu->swu_state != SR_WU_CHKSUMFAILED) {
+ if (error) {
+ sr_raid1_attempt_to_heal(wu, error);
+
+ TAILQ_REMOVE(&sd->mds.mdd_raid1.sr1_errors, error,
sr1_err_link);
+ free(error, M_DEVBUF, 0);
+ }
xs->error = XS_NOERROR;
return SR_WU_OK;
}
@@ -438,4 +843,373 @@ sr_raid1_wu_done(struct sr_workunit *wu)
xs->error = XS_DRIVER_STUFFUP;
return SR_WU_FAILED;
+}
+
+int
+sr_raid1_wu_collision_detection(struct sr_workunit *wu1,
+    struct sr_workunit *wu2)
+{
+	daddr_t cs1, ce1, cs2, ce2;
+
+	if (wu1 == NULL || wu2 == NULL)
+		return 0;
+
+	cs1 = sr_raid1_chksum_blk_start(wu1);
+	ce1 = sr_raid1_chksum_blk_end(wu1);
+	cs2 = sr_raid1_chksum_blk_start(wu2);
+	ce2 = sr_raid1_chksum_blk_end(wu2);
+
+	/*
+	 * The two work units are free of collision only when both
+	 * their data block ranges and their checksum block ranges
+	 * are fully disjoint.
+	 */
+	if ((wu1->swu_blk_end < wu2->swu_blk_start ||
+	    wu2->swu_blk_end < wu1->swu_blk_start) &&
+	    (ce1 < cs2 || ce2 < cs1))
+		return 0;
+
+	DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_collision_detection: collision"
+	    " found! wu1: %p, blk_s: %lld, blk_e: %lld,"
+	    " chksum_blk_s: %lld, chksum_blk_e: %lld, wu2: %p,"
+	    " blk_s: %lld, blk_e: %lld, chksum_blk_s: %lld, chksum_blk_e:"
+	    " %lld\n", wu1, wu1->swu_blk_start, wu1->swu_blk_end,
+	    cs1, ce1, wu2, wu2->swu_blk_start,
+	    wu2->swu_blk_end, cs2, ce2);
+	return 1;
+}
+
+int
+sr_raid1_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
+    daddr_t len, void *data, int xsflags, int ccbflags, void *chksumbuf)
+{
+	struct sr_discipline	*sd = wu->swu_dis;
+	struct sr_ccb		*ccb;
+
+	DNPRINTF(SR_D_CHKSUM, "sr_raid1_addio: %s chunk %d block %lld "
+	    "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
+	    chunk, (long long)blkno, (long long)len,
+	    chksumbuf ? "CHKSUM" : "-");
+
+	/* No caller buffer: allocate a temporary one the ccb will own. */
+	if (data == NULL) {
+		if ((data = sr_block_get(sd, len)) == NULL)
+			return (-1);
+		ccbflags |= SR_CCBF_FREEBUF;
+	}
+
+	if ((ccb = sr_ccb_rw(sd, chunk, blkno, len, data, xsflags,
+	    ccbflags)) == NULL) {
+		/* Give the temporary buffer back; nothing was queued. */
+		if (ccbflags & SR_CCBF_FREEBUF)
+			sr_block_put(sd, data, len);
+		return (-1);
+	}
+
+	/* Stash the checksum descriptor for the completion handler. */
+	ccb->ccb_opaque = chksumbuf;
+	sr_wu_enqueue_ccb(wu, ccb);
+
+	return (0);
+}
+
+/*
+ * Per-ccb biodone handler.  Finishes the ccb, optionally refreshes the
+ * in-memory checksum block, releases temporary buffers and completes
+ * the owning work unit.  Runs at interrupt time; the ccb/wu state is
+ * manipulated under splbio().
+ */
+void
+sr_raid1_intr(struct buf *bp)
+{
+	struct sr_ccb *ccb = (struct sr_ccb *)bp;
+	struct sr_workunit *wu = ccb->ccb_wu;
+	struct sr_discipline *sd = wu->swu_dis;
+	int s;
+	int use_chksum;
+
+	use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
+
+	DNPRINTF(SR_D_INTR, "%s: sr_raid1_intr bp %p xs %p\n",
+	    DEVNAME(sd->sd_sc), bp, wu->swu_xs);
+
+	s = splbio();
+	sr_ccb_done(ccb);
+
+	/*
+	 * ccb_opaque carries a struct sr_raid1c_opaque when this ccb
+	 * read a checksum block.  When the descriptor is flagged for
+	 * write (chksum_info->write == 1), the checksum block just read
+	 * into ccb_buf.b_data is updated with CRCs of the data in
+	 * chksum_info->data -- presumably so a dependent write can push
+	 * the refreshed checksums out; the write path that sets
+	 * write == 1 is not visible here, confirm against it.
+	 */
+	if (use_chksum && ccb->ccb_state == SR_CCB_OK
+	    && ccb->ccb_opaque) {
+		struct sr_raid1c_opaque *chksum_info = ccb->ccb_opaque;
+		if (chksum_info->write == 1) {
+			/* let's update read chksum for provided data */
+			sr_raid1_update_chksum(chksum_info->data,
+			    chksum_info->len, chksum_info->blk_start,
+			    chksum_info->blk_end, ccb->ccb_buf.b_data);
+		}
+	}
+	/* Free allocated data buffer (ccb owned it via SR_CCBF_FREEBUF). */
+	if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
+		sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
+		ccb->ccb_buf.b_data = NULL;
+	}
+	sr_wu_done(wu);
+	splx(s);
+}
+
+/*
+ * Verify per-sector CRC32 checksums of `data' against the saved
+ * checksums in `chksum_buf'.  blk_start/blk_end select the data block
+ * range; the first checksum used is blk_start's slot within its
+ * checksum block.  `len' is currently unused (the range is fully
+ * determined by blk_start/blk_end).
+ * Returns 0 when every block matches, -1 on the first mismatch.
+ */
+int
+sr_raid1_verify_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+	int32_t chksum_n = blk_start % CHKSUM_IN_BLOCK;
+	int32_t chksum_count = blk_end - blk_start + 1;
+	uLong *chksum = chksum_buf;
+	Bytef *buf = data;
+	int32_t i;
+
+	for (i = 0; i < chksum_count; i++) {
+		uLong crc = crc32(0L, Z_NULL, 0);
+		crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+		if (crc != chksum[chksum_n]) {
+			/* %lu: uLong is unsigned long, %ld was mismatched. */
+			DNPRINTF(SR_D_CHKSUM, "verify failed on comparison"
+			    " block chksum(%lu) and saved chksum(%lu)[%d],"
+			    " called for start: %lld, end: %lld, n: %d, failed"
+			    " block: %lld\n", crc, chksum[chksum_n], chksum_n,
+			    blk_start, blk_end, chksum_n, blk_start + i);
+			return (-1);
+		}
+		chksum_n++;
+	}
+	return 0;
+}
+
+/*
+ * Recompute per-sector CRC32 checksums of `data' and store them into
+ * `chksum_buf', starting at blk_start's slot within its checksum
+ * block.  `len' is currently unused (the range is fully determined by
+ * blk_start/blk_end).  Always returns 0.
+ */
+uLong
+sr_raid1_update_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+	int32_t chksum_n = blk_start % CHKSUM_IN_BLOCK;
+	int32_t chksum_count = blk_end - blk_start + 1;
+	uLong *chksum = chksum_buf;
+	Bytef *buf = data;
+	int32_t i;
+
+	DNPRINTF(SR_D_CHKSUM, "update chksum: start: %lld, end: %lld, n: %d\n",
+	    blk_start, blk_end, chksum_n);
+	DNPRINTF(SR_D_CHKSUM, "blocks chksumed: ");
+	for (i = 0; i < chksum_count; i++) {
+		uLong crc = crc32(0L, Z_NULL, 0);
+		crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+		chksum[chksum_n] = crc;
+		/* %lu: uLong is unsigned long, %ld was mismatched. */
+		DNPRINTF(SR_D_CHKSUM, "%lld->(%lu)[%d], ", (blk_start + i),
+		    crc, chksum_n);
+		chksum_n++;
+	}
+	DNPRINTF(SR_D_CHKSUM, "\n");
+	return 0;
+}
+
+/*
+ * Attach a per-chunk integer sensor exposing the checksum error
+ * counter of chunk `chno'.  Always returns 0.
+ */
+int
+sr_raid1_sensor_create(struct sr_discipline* sd, int chno)
+{
+	struct sr_chunk *ch = sd->sd_vol.sv_chunks[chno];
+
+	ch->src_errs = 0;
+	ch->src_sensor.type = SENSOR_INTEGER;
+	ch->src_sensor.status = SENSOR_S_OK;
+	strlcpy(ch->src_sensor.desc, ch->src_devname,
+	    sizeof(ch->src_sensor.desc));
+	sensor_attach(&sd->sd_sc->sc_sensordev, &ch->src_sensor);
+	ch->src_sensor_attached = 1;
+
+	return 0;
+}
+
+/*
+ * First checksum block covering the wu's data range.  The checksum
+ * area starts right past the data area (ssd_size); one checksum block
+ * covers CHKSUM_IN_BLOCK data blocks.
+ */
+daddr_t
+sr_raid1_chksum_blk_start(struct sr_workunit *wu)
+{
+	struct sr_discipline *sd = wu->swu_dis;
+
+	return sd->sd_meta->ssdi.ssd_size +
+	    wu->swu_blk_start / CHKSUM_IN_BLOCK;
+}
+
+/* Last checksum block covering the wu's data range (inclusive). */
+daddr_t
+sr_raid1_chksum_blk_end(struct sr_workunit *wu)
+{
+	daddr_t first, nblocks;
+
+	first = sr_raid1_chksum_blk_start(wu);
+	nblocks = sr_raid1_chksum_data_len(wu);
+
+	return first + nblocks - 1;
+}
+
+/*
+ * Number of DEV_BSIZE blocks of checksum data needed to cover the
+ * wu's data range.  The first checksum of the range sits at byte
+ * offset chksum_offset within the first checksum block.
+ */
+size_t
+sr_raid1_chksum_data_len(struct sr_workunit* wu)
+{
+	size_t chksum_len;
+	size_t chksum_offset;
+
+	chksum_len = (wu->swu_blk_end - wu->swu_blk_start + 1) * CHKSUM_SIZE;
+	chksum_offset = (wu->swu_blk_start % CHKSUM_IN_BLOCK) * CHKSUM_SIZE;
+
+	/*
+	 * Round up properly: the former "(len + off) / DEV_BSIZE + 1"
+	 * transferred a spurious extra block whenever len + off was an
+	 * exact multiple of DEV_BSIZE, which could run past the end of
+	 * the checksum area.
+	 */
+	return (chksum_len + chksum_offset + DEV_BSIZE - 1) / DEV_BSIZE;
+}
+
+/*
+ * Try to repair a bad block: schedule a read of the data and its
+ * checksums from the known good chunk (error->next_chunk) plus a
+ * dependent write of both back to the failed chunk (error->err_chunk).
+ * Healing is best effort; on any setup failure all partially acquired
+ * resources are released and the function simply returns.
+ *
+ * Fixes over the previous version:
+ *  - healing_wu_r/healing_wu_w start as NULL, so early "goto bad"
+ *    paths no longer test/dereference uninitialized pointers;
+ *  - the success path returns before the "bad:" cleanup, which used
+ *    to free buffers and put back work units that were already
+ *    scheduled (use-after-free on in-flight I/O);
+ *  - chksum_info is freed on error paths instead of leaking;
+ *  - a recoverable M_NOWAIT allocation failure no longer panics.
+ */
+void
+sr_raid1_attempt_to_heal(struct sr_workunit* wu, struct sr_raid1_errrec *error)
+{
+	struct sr_discipline *sd = wu->swu_dis;
+	struct sr_workunit *healing_wu_r = NULL;
+	struct sr_workunit *healing_wu_w = NULL;
+	struct sr_raid1c_opaque *chksum_info = NULL;
+	struct scsi_xfer *xs = wu->swu_xs;
+	char *databuf = NULL;
+	daddr_t chksum_blk = sr_raid1_chksum_blk_start(wu);
+	void *chksum_data = NULL;
+	size_t chksum_data_len = sr_raid1_chksum_data_len(wu);
+
+	DNPRINTF(SR_D_CHKSUM, "attempt to heal with wu %p, error %p\n", wu,
+	    error);
+	databuf = sr_block_get(sd, xs->datalen);
+	if (!databuf) {
+		printf("%s: %s failed to get"
+		    " healing data buffer.\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+	chksum_data = sr_block_get(sd, DEV_BSIZE * chksum_data_len);
+	if (!chksum_data) {
+		printf("%s: %s failed to get"
+		    " healing chksum buffer.\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	if ((healing_wu_r = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL) {
+		printf("%s: %s failed to get"
+		    " healing read work unit\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	healing_wu_r->swu_state = SR_WU_INPROGRESS;
+	healing_wu_r->swu_flags |= SR_WUF_DISCIPLINE;
+	healing_wu_r->swu_blk_start = wu->swu_blk_start;
+	healing_wu_r->swu_blk_end = wu->swu_blk_end;
+
+	if ((healing_wu_w = sr_scsi_wu_get(sd, SCSI_NOSLEEP)) == NULL) {
+		printf("%s: %s failed to get"
+		    " healing write work unit\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	healing_wu_w->swu_state = SR_WU_INPROGRESS;
+	healing_wu_w->swu_flags |= SR_WUF_DISCIPLINE | SR_WUF_HEALING;
+	healing_wu_w->swu_blk_start = wu->swu_blk_start;
+	healing_wu_w->swu_blk_end = wu->swu_blk_end;
+
+	/* Read data from the good chunk. */
+	if (sr_raid1_addio(healing_wu_r, error->next_chunk, wu->swu_blk_start,
+	    xs->datalen, databuf, SCSI_DATA_IN, 0, 0)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	/* Read chksum from the good chunk. */
+	chksum_info = malloc(sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+	    M_ZERO | M_NOWAIT);
+	if (chksum_info == NULL) {
+		/* Best effort: skip healing instead of panicking. */
+		printf("%s: %s: can't allocate"
+		    " chksum_info structure\n",
+		    DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+	chksum_info->write = 0;
+	chksum_info->blk_start = wu->swu_blk_start;
+	chksum_info->blk_end = wu->swu_blk_end;
+	chksum_info->data = databuf;
+	chksum_info->len = xs->datalen;
+	chksum_info->chksum_data = chksum_data;
+	if (sr_raid1_addio(healing_wu_r, error->next_chunk, chksum_blk,
+	    DEV_BSIZE * chksum_data_len, chksum_data,
+	    SCSI_DATA_IN, 0, chksum_info)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	/* Write data back to the failed chunk; ccb frees databuf. */
+	if (sr_raid1_addio(healing_wu_w, error->err_chunk, wu->swu_blk_start,
+	    xs->datalen, databuf, SCSI_DATA_OUT, SR_CCBF_FREEBUF, 0)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	/* Write chksum back to the failed chunk; ccb frees chksum_data. */
+	if (sr_raid1_addio(healing_wu_w, error->err_chunk,
+	    chksum_blk, DEV_BSIZE * chksum_data_len,
+	    chksum_data, SCSI_DATA_OUT, SR_CCBF_FREEBUF, 0)) {
+		printf("%s: %s: too many ccbs queued"
+		    " (2c)\n", DEVNAME(sd->sd_sc),
+		    sd->sd_meta->ssd_devname);
+		goto bad;
+	}
+
+	sr_schedule_wu(healing_wu_r);
+	sr_schedule_wu(healing_wu_w);
+
+	/* I/O is in flight; the work units now own all resources. */
+	return;
+
+bad:
+	/*
+	 * Nothing was scheduled: release everything acquired so far.
+	 * NOTE(review): if chksum_info was already attached to a ccb of
+	 * healing_wu_r, freeing it here assumes sr_scsi_wu_put() does
+	 * not touch ccb_opaque -- confirm against sr_ccb_put().
+	 */
+	if (chksum_info)
+		free(chksum_info, M_DEVBUF, sizeof(struct sr_raid1c_opaque));
+	if (databuf)
+		sr_block_put(sd, databuf, xs->datalen);
+	if (chksum_data)
+		sr_block_put(sd, chksum_data, DEV_BSIZE * chksum_data_len);
+	if (healing_wu_r)
+		sr_scsi_wu_put(sd, healing_wu_r);
+	if (healing_wu_w)
+		sr_scsi_wu_put(sd, healing_wu_w);
+}
+
+/*
+ * Return the next chunk after `chunk' that is readable (ONLINE or
+ * SCRUB), scanning round-robin.  If no other chunk is usable the
+ * original `chunk' is returned, which the caller detects by comparing
+ * against err_chunk (all chunks failed).
+ *
+ * Fix: the previous version only broke out of the switch, not the
+ * loop, so it returned the *last* scanned chunk instead of the next
+ * usable one, and returned an arbitrary chunk when none was usable.
+ */
+int
+sr_raid1_next_chunk_to_try(struct sr_workunit *wu, int chunk)
+{
+	struct sr_discipline *sd = wu->swu_dis;
+	struct sr_chunk *scp;
+	int tchunk;
+	int i;
+
+	for (i = 1; i < sd->sd_meta->ssdi.ssd_chunk_no; i++) {
+		tchunk = (chunk + i) % sd->sd_meta->ssdi.ssd_chunk_no;
+		scp = sd->sd_vol.sv_chunks[tchunk];
+		switch (scp->src_meta.scm_status) {
+		case BIOC_SDONLINE:
+		case BIOC_SDSCRUB:
+			/* Found a readable chunk. */
+			return tchunk;
+
+		case BIOC_SDOFFLINE:
+		case BIOC_SDREBUILD:
+		case BIOC_SDHOTSPARE:
+			continue;
+
+		default:
+			/* volume offline */
+			printf("%s: is offline, cannot read, chunk %d, i %d\n",
+			    DEVNAME(sd->sd_sc), tchunk, i);
+		}
+	}
+	/* No usable alternative; signal failure to the caller. */
+	return chunk;
}
Index: sys/dev/softraidvar.h
===================================================================
RCS file: /cvs/src/sys/dev/softraidvar.h,v
retrieving revision 1.161
diff -u -p -u -r1.161 softraidvar.h
--- sys/dev/softraidvar.h 21 Jul 2015 03:30:51 -0000 1.161
+++ sys/dev/softraidvar.h 13 Sep 2015 20:40:30 -0000
@@ -307,7 +307,7 @@ SLIST_HEAD(sr_boot_volume_head, sr_boot_
#define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
-/* #define SR_DEBUG */
+#define SR_DEBUG
#ifdef SR_DEBUG
extern u_int32_t sr_debug;
#define DPRINTF(x...) do { if (sr_debug) printf(x); } while(0)
@@ -322,6 +322,7 @@ extern u_int32_t sr_debug;
#define SR_D_DIS 0x0080
#define SR_D_STATE 0x0100
#define SR_D_REBUILD 0x0200
+#define SR_D_CHKSUM 0x0400
#else
#define DPRINTF(x...)
#define DNPRINTF(n,x...)
@@ -378,6 +379,7 @@ struct sr_workunit {
#define SR_WU_RESTART 7
#define SR_WU_REQUEUE 8
#define SR_WU_CONSTRUCT 9
+#define SR_WU_CHKSUMFAILED 10
int swu_flags; /* additional hints */
#define SR_WUF_REBUILD (1<<0) /* rebuild io */
@@ -387,6 +389,7 @@ struct sr_workunit {
#define SR_WUF_WAKEUP (1<<4) /* Wakeup on I/O completion. */
#define SR_WUF_DISCIPLINE (1<<5) /* Discipline specific I/O. */
#define SR_WUF_FAKE (1<<6) /* Faked workunit. */
+#define SR_WUF_HEALING (1<<7) /* Workunit to heal bad block */
/* workunit io range */
daddr_t swu_blk_start;
@@ -423,9 +426,24 @@ struct sr_raid0 {
};
/* RAID 1 */
+struct sr_raid1_errrec {
+ struct sr_workunit *wu;
+ u_int32_t err_chunk;
+ u_int32_t next_chunk;
+ TAILQ_ENTRY(sr_raid1_errrec) sr1_err_link;
+};
+
+TAILQ_HEAD(sr1_error_list, sr_raid1_errrec);
+
#define SR_RAID1_NOWU 16
struct sr_raid1 {
u_int32_t sr1_counter;
+ u_int32_t sr1_use_chksum; /* are checksum in use? */
+
+ /* original coerced size in blocks */
+ int64_t sr1_coerced_size;
+ /* list of error recoveries */
+ struct sr1_error_list sr1_errors;
};
/* RAID 5 */
@@ -474,6 +492,10 @@ struct sr_chunk {
u_char src_duid[8]; /* Chunk disklabel UID. */
int64_t src_size; /* in blocks */
+ struct ksensor src_sensor; /* Chunk specific sensor */
+ int src_sensor_attached;
+ int src_errs; /* Errors counter value */
+
SLIST_ENTRY(sr_chunk) src_link;
};
@@ -503,6 +525,7 @@ struct sr_discipline {
/* SR_MD_RAID4 was 7. */
#define SR_MD_RAID6 8
#define SR_MD_CONCAT 9
+#define SR_MD_RAID1_CHKSUM 10
char sd_name[10]; /* human readable dis name */
u_int16_t sd_target; /* scsibus target discipline
uses */
@@ -512,6 +535,7 @@ struct sr_discipline {
#define SR_CAP_REBUILD 0x00000004 /* Supports rebuild. */
#define SR_CAP_NON_COERCED 0x00000008 /* Uses non-coerced size. */
#define SR_CAP_REDUNDANT 0x00000010 /* Redundant copies of data. */
+#define SR_CAP_CHKSUM 0x00000020 /* Check sums of data. */
union {
struct sr_raid0 mdd_raid0;
@@ -583,6 +607,9 @@ struct sr_discipline {
int (*sd_meta_opt_handler)(struct sr_discipline *,
struct sr_meta_opt_hdr *);
void (*sd_rebuild)(struct sr_discipline *);
+
+ int (*sd_wu_collision_detection)(
+ struct sr_workunit *, struct sr_workunit *);
/* SCSI emulation */
struct scsi_sense_data sd_scsi_sense;