Hello,
attached is my work in progress on checksumming support for softraid
RAID1. Currently it only does:
- computation of checksums (crc32)
- verification of checksums
- signal bad checksum to console and to sensors
E.g.:
$ sysctl hw.sensors.softraid0
hw.sensors.softraid0.raw0=0 (sd0f), OK
hw.sensors.softraid0.raw1=0 (sd0g), OK
hw.sensors.softraid0.drive0=online (sd1), OK
Next TODO items:
- hand over to another chunk (restart wu) in case of checksum error
- properly handle errors happening on all chunks
- "self-healing" of bad sector
Note: checksums are computed on a per-sector basis and saved in an area
allocated at the end of the drive. Because of this design, the
LBA collision detection in softraid.c was enhanced/fixed to also support
this use case,
and currently it may not be compatible with RAID5/6 usage.
Any comments welcome!
Thanks!
Karel
Index: sbin/bioctl/bioctl.8
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.8,v
retrieving revision 1.96
diff -u -p -u -r1.96 bioctl.8
--- sbin/bioctl/bioctl.8 29 May 2015 00:33:37 -0000 1.96
+++ sbin/bioctl/bioctl.8 31 Aug 2015 20:02:47 -0000
@@ -199,6 +199,11 @@ for example, force the creation of volum
with unclean data in the metadata areas.
.It Ar noauto
Do not automatically assemble this volume at boot time.
+.It Ar chksum
+Enforce usage of checksums on the device blocks. The checksum area is
+located at the end of the device data area and since it occupies some
+space it makes actual usable device size smaller. We need exactly 8
+bytes of checksum per device data block.
.El
.It Fl c Ar raidlevel
Create a
Index: sbin/bioctl/bioctl.c
===================================================================
RCS file: /cvs/src/sbin/bioctl/bioctl.c,v
retrieving revision 1.129
diff -u -p -u -r1.129 bioctl.c
--- sbin/bioctl/bioctl.c 18 Jul 2015 23:23:20 -0000 1.129
+++ sbin/bioctl/bioctl.c 31 Aug 2015 20:02:47 -0000
@@ -1053,6 +1053,9 @@ bio_createflags(char *lst)
case 'n':
flags |= BIOC_SCNOAUTOASSEMBLE;
break;
+ case 'c':
+ flags |= BIOC_SCCHKSUM;
+ break;
default:
strlcpy(fs, s, sz + 1);
errx(1, "invalid flag %s", fs);
Index: sys/dev/biovar.h
===================================================================
RCS file: /cvs/src/sys/dev/biovar.h,v
retrieving revision 1.44
diff -u -p -u -r1.44 biovar.h
--- sys/dev/biovar.h 29 May 2015 00:33:37 -0000 1.44
+++ sys/dev/biovar.h 31 Aug 2015 20:02:49 -0000
@@ -213,6 +213,7 @@ struct bioc_createraid {
#define BIOC_SCDEVT 0x02 /* dev_t array or string in dev_list */
#define BIOC_SCNOAUTOASSEMBLE 0x04 /* do not assemble during autoconf */
#define BIOC_SCBOOTABLE 0x08 /* device is bootable */
+#define BIOC_SCCHKSUM 0x10 /* device provides chksum capability */
u_int32_t bc_opaque_size;
u_int32_t bc_opaque_flags;
#define BIOC_SOINVALID 0x00 /* no opaque pointer */
Index: sys/dev/softraid.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid.c,v
retrieving revision 1.364
diff -u -p -u -r1.364 softraid.c
--- sys/dev/softraid.c 19 Aug 2015 19:05:24 -0000 1.364
+++ sys/dev/softraid.c 31 Aug 2015 20:02:50 -0000
@@ -71,6 +71,7 @@ uint32_t sr_debug = 0
/* | SR_D_DIS */
/* | SR_D_STATE */
/* | SR_D_REBUILD */
+ /* | SR_D_CHKSUM */
;
#endif
@@ -144,6 +145,8 @@ int sr_chunk_in_use(struct sr_softc *,
int sr_rw(struct sr_softc *, dev_t, char *, size_t,
daddr_t, long);
void sr_wu_done_callback(void *);
+int sr_wu_collision(struct sr_workunit *,
+ struct sr_workunit *);
/* don't include these on RAMDISK */
#ifndef SMALL_KERNEL
@@ -2264,6 +2267,9 @@ sr_wu_done_callback(void *xwu)
s = splbio();
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done: %p\n",
+ DEVNAME(sd->sd_sc), wu);
+
if (xs != NULL) {
if (wu->swu_ios_failed)
xs->error = XS_DRIVER_STUFFUP;
@@ -2286,11 +2292,54 @@ sr_wu_done_callback(void *xwu)
TAILQ_REMOVE(&sd->sd_wu_pendq, wu, swu_link);
if (wu->swu_collider) {
- if (wu->swu_ios_failed)
- sr_raid_recreate_wu(wu->swu_collider);
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider: %p\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ if (wu->swu_ios_failed) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, recreate collider?: %p WHY???\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ sr_raid_recreate_wu(wu->swu_collider);
+ }
+ /*
+ * We're searching for a wu which has the same collider
+ * as the current wu. If we find such a wu we can continue
+ * without starting the collider. If we do not find such a wu
+ * then we need to start the collider, as the current wu is
+ * the last wu the collider collides with.
+ */
+ int found = 0;
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, searching for collider:"
+ " %p\n", DEVNAME(sd->sd_sc), wu->swu_collider);
+ TAILQ_FOREACH(wup, &sd->sd_wu_pendq, swu_link) {
+ if (wu->swu_collider == wup->swu_collider) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+ " collider in wu: %p\n",
+ DEVNAME(sd->sd_sc), wup);
+ found++;
+ break;
+ }
+ }
+ TAILQ_FOREACH(wup, &sd->sd_wu_defq, swu_link) {
+ if (wu->swu_collider == wup->swu_collider) {
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, found"
+ " collider in def wu: %p\n",
+ DEVNAME(sd->sd_sc), wup);
+ found++;
+ break;
+ }
+ }
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, collider found: %d\n",
+ DEVNAME(sd->sd_sc), found);
- /* XXX Should the collider be failed if this xs failed? */
- sr_raid_startwu(wu->swu_collider);
+ if (found == 0) {
+ /* The current wu is the last wu colliding
+ with the collider. */
+ DNPRINTF(SR_D_WU, "%s: sr_wu_done, starting the collider: %p\n",
+ DEVNAME(sd->sd_sc), wu->swu_collider);
+ sr_raid_startwu(wu->swu_collider);
+ }
+ else {
+ wu->swu_collider = NULL;
+ }
}
/*
@@ -3967,6 +4016,7 @@ sr_discipline_init(struct sr_discipline
sd->sd_set_chunk_state = sr_set_chunk_state;
sd->sd_set_vol_state = sr_set_vol_state;
sd->sd_start_discipline = NULL;
+ sd->sd_wu_collision_detection = NULL;
task_set(&sd->sd_meta_save_task, sr_meta_save_callback, sd);
task_set(&sd->sd_hotspare_rebuild_task, sr_hotspare_rebuild_callback,
@@ -4181,11 +4231,30 @@ sr_raid_intr(struct buf *bp)
splx(s);
}
+/* Report whether wu1 and wu2 collide: 1 if they do, 0 if they do not. */
+int
+sr_wu_collision(struct sr_workunit *wu1, struct sr_workunit *wu2)
+{
+	struct sr_discipline	*dis = wu1->swu_dis;
+	int			 disjoint;
+
+	/* A discipline that installed its own detector decides alone. */
+	if (dis->sd_wu_collision_detection)
+		return dis->sd_wu_collision_detection(wu1, wu2);
+
+	/* Otherwise: collision unless one wu ends before the other starts. */
+	disjoint = wu1->swu_blk_end < wu2->swu_blk_start ||
+	    wu2->swu_blk_end < wu1->swu_blk_start;
+	return disjoint ? 0 : 1;
+}
+
void
sr_schedule_wu(struct sr_workunit *wu)
{
struct sr_discipline *sd = wu->swu_dis;
struct sr_workunit *wup;
+ struct sr_workunit *twup;
+
int s;
DNPRINTF(SR_D_WU, "sr_schedule_wu: schedule wu %p state %i "
@@ -4210,20 +4279,53 @@ sr_schedule_wu(struct sr_workunit *wu)
if (wu->swu_state != SR_WU_INPROGRESS)
panic("sr_schedule_wu: work unit not in progress (state %i)\n",
wu->swu_state);
+ /*
+ * Walk both the pending and deferred queues and find colliding wus.
+ * If we find a collision we set the wu's collider to the current wu
+ * and push the current wu onto the deferred queue.
+ */
+ int colliding = 0;
- /* Walk queue backwards and fill in collider if we have one. */
- TAILQ_FOREACH_REVERSE(wup, &sd->sd_wu_pendq, sr_wu_list, swu_link) {
- if (wu->swu_blk_end < wup->swu_blk_start ||
- wup->swu_blk_end < wu->swu_blk_start)
+ TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_pendq, swu_link, twup) {
+ if (!sr_wu_collision(wu, wup))
continue;
+ colliding = 1;
/* Defer work unit due to LBA collision. */
- DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p\n",
- wu);
- wu->swu_state = SR_WU_DEFERRED;
+ DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+ " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+ " %lld\n", wu, wup,
+ ((wup->swu_xs != NULL) ?
+ (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+ : "NULL"),
+ wup->swu_blk_start, wup->swu_blk_end);
while (wup->swu_collider)
wup = wup->swu_collider;
- wup->swu_collider = wu;
+
+ if (wup != wu)
+ wup->swu_collider = wu;
+ }
+ TAILQ_FOREACH_SAFE(wup, &sd->sd_wu_defq, swu_link, twup) {
+ if (!sr_wu_collision(wu, wup))
+ continue;
+
+ colliding = 1;
+ DNPRINTF(SR_D_WU, "sr_schedule_wu: deferring work unit %p"
+ " due to collision with: %p, scsi: %s, blk_s: %lld, blk_e:"
+ " %lld\n", wu, wup,
+ ((wup->swu_xs != NULL) ?
+ (wup->swu_xs->flags & SCSI_DATA_IN ? "READ" : "WRITE")
+ : "NULL"),
+ wup->swu_blk_start, wup->swu_blk_end);
+
+ while (wup->swu_collider)
+ wup = wup->swu_collider;
+
+ if (wup != wu)
+ wup->swu_collider = wu;
+ }
+ if (colliding == 1) {
+ wu->swu_state = SR_WU_DEFERRED;
TAILQ_INSERT_TAIL(&sd->sd_wu_defq, wu, swu_link);
sd->sd_wu_collisions++;
goto queued;
@@ -4845,7 +4947,15 @@ void
sr_sensors_delete(struct sr_discipline *sd)
{
DNPRINTF(SR_D_STATE, "%s: sr_sensors_delete\n", DEVNAME(sd->sd_sc));
-
+ /* first run through the chunk-specific sensors */
+ /* shall we enhance discipline API and add sensor delete function? */
+ int chdx;
+ for (chdx = 0; chdx < sd->sd_meta->ssdi.ssd_chunk_no; chdx++) {
+ if (sd->sd_vol.sv_chunks[chdx]->src_sensor_attached) {
+ sensor_detach(&sd->sd_sc->sc_sensordev,
+ &sd->sd_vol.sv_chunks[chdx]->src_sensor);
+ }
+ }
if (sd->sd_vol.sv_sensor_attached)
sensor_detach(&sd->sd_sc->sc_sensordev, &sd->sd_vol.sv_sensor);
}
@@ -4856,6 +4966,8 @@ sr_sensors_refresh(void *arg)
struct sr_softc *sc = arg;
struct sr_volume *sv;
struct sr_discipline *sd;
+ struct sr_chunk *chunk;
+ struct sr_chunk_head *cl;
DNPRINTF(SR_D_STATE, "%s: sr_sensors_refresh\n", DEVNAME(sc));
@@ -4882,6 +4994,18 @@ sr_sensors_refresh(void *arg)
default:
sv->sv_sensor.value = 0; /* unknown */
sv->sv_sensor.status = SENSOR_S_UNKNOWN;
+ }
+ /* shall we enhance discipline API and add
+ sensor refresh function? */
+ if (sd->sd_type == SR_MD_RAID1_CHKSUM
+ && sd->mds.mdd_raid1.sr1_use_chksum) {
+ /* refreshing chksum errors sensors */
+ cl = &sv->sv_chunk_list;
+ SLIST_FOREACH(chunk, cl, src_link)
+ if (chunk->src_errs > 0 && chunk->src_sensor_attached == 1) {
+ chunk->src_sensor.value = chunk->src_errs;
+ chunk->src_sensor.status = SENSOR_S_WARN;
+ }
}
}
}
Index: sys/dev/softraid_raid1.c
===================================================================
RCS file: /cvs/src/sys/dev/softraid_raid1.c,v
retrieving revision 1.63
diff -u -p -u -r1.63 softraid_raid1.c
--- sys/dev/softraid_raid1.c 21 Jul 2015 03:30:51 -0000 1.63
+++ sys/dev/softraid_raid1.c 31 Aug 2015 20:02:50 -0000
@@ -41,6 +41,8 @@
#include <dev/softraidvar.h>
+#include <lib/libz/zlib.h>
+
/* RAID 1 functions. */
int sr_raid1_create(struct sr_discipline *, struct bioc_createraid *,
int, int64_t);
@@ -48,15 +50,49 @@ int sr_raid1_assemble(struct sr_discipli
int, void *);
int sr_raid1_init(struct sr_discipline *sd);
int sr_raid1_rw(struct sr_workunit *);
+int sr_raid1_openings(struct sr_discipline *);
int sr_raid1_wu_done(struct sr_workunit *);
void sr_raid1_set_chunk_state(struct sr_discipline *, int, int);
void sr_raid1_set_vol_state(struct sr_discipline *);
+int sr_raid1_wu_collision_detection(struct sr_workunit *,
+ struct sr_workunit *);
+
+/* internal functions */
+int sr_raid1_addio(struct sr_workunit *, int, daddr_t, daddr_t, void *,
+ int, int, void *);
+void sr_raid1_intr(struct buf *);
+int sr_raid1_verify_chksum(void*, int, daddr_t, daddr_t, void*);
+uLong sr_raid1_update_chksum(void*, int, daddr_t, daddr_t, void*);
+int sr_raid1_sensor_create(struct sr_discipline *, int);
+daddr_t sr_raid1_chksum_blk_start(struct sr_workunit *);
+daddr_t sr_raid1_chksum_blk_end(struct sr_workunit *);
+size_t sr_raid1_chksum_data_len(struct sr_workunit *);
+
+
+#define CHKSUM_SIZE	8	/* bytes reserved per data-sector checksum */
+#define CHKSUM_IN_BLOCK	(DEV_BSIZE / CHKSUM_SIZE) /* slots per chksum block */
+
+
+struct sr_raid1c_opaque {
+ /* chksum context attached to a ccb via ccb_opaque; write: 0 == read, 1 == write */
+ int write;
+ void *data; /* data buffer of the originating I/O (xs->data) */
+ int len; /* length of data (xs->datalen) */
+ daddr_t blk_start; /* first data block (wu->swu_blk_start) */
+ daddr_t blk_end; /* last data block, inclusive (wu->swu_blk_end) */
+ void *chksum_data; /* buffer the checksum blocks are read into */
+};
+
/* Discipline initialisation. */
void
sr_raid1_discipline_init(struct sr_discipline *sd)
{
/* Fill out discipline members. */
+ /*
+ * For now we assume run without check sums, if this is not true
+ * we will correct values in _create or _assembly functions.
+ */
sd->sd_type = SR_MD_RAID1;
strlcpy(sd->sd_name, "RAID 1", sizeof(sd->sd_name));
sd->sd_capabilities = SR_CAP_SYSTEM_DISK | SR_CAP_AUTO_ASSEMBLE |
@@ -70,20 +106,48 @@ sr_raid1_discipline_init(struct sr_disci
sd->sd_scsi_wu_done = sr_raid1_wu_done;
sd->sd_set_chunk_state = sr_raid1_set_chunk_state;
sd->sd_set_vol_state = sr_raid1_set_vol_state;
+ sd->sd_scsi_intr = sr_raid1_intr;
}
int
sr_raid1_create(struct sr_discipline *sd, struct bioc_createraid *bc,
int no_chunk, int64_t coerced_size)
{
+ int ch;
if (no_chunk < 2) {
sr_error(sd->sd_sc, "%s requires two or more chunks",
sd->sd_name);
return EINVAL;
}
-
- sd->sd_meta->ssdi.ssd_size = coerced_size;
-
+ if (bc->bc_flags & BIOC_SCCHKSUM) {
+ int64_t chksum_area_size = coerced_size * CHKSUM_SIZE
+ / DEV_BSIZE;
+ if (((coerced_size * CHKSUM_SIZE) % DEV_BSIZE) != 0) {
+ chksum_area_size++;
+ }
+ DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld,"
+ " data size: %lld, chksum area size: %lld\n, ",
+ coerced_size, coerced_size - chksum_area_size,
+ chksum_area_size);
+ sd->sd_meta->ssdi.ssd_size = coerced_size - chksum_area_size;
+ sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 1;
+ /* fixing discipline values for chksum support */
+ sd->sd_type = SR_MD_RAID1_CHKSUM;
+ strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+ sd->sd_openings = sr_raid1_openings;
+ sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+ for (ch = 0; ch < no_chunk; ch++) {
+ if (sr_raid1_sensor_create(sd, ch)) {
+ DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+ " created for chunk: %d\n", ch);
+ }
+ }
+ }
+ else {
+ sd->sd_meta->ssdi.ssd_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 0;
+ }
return sr_raid1_init(sd);
}
@@ -91,17 +155,55 @@ int
sr_raid1_assemble(struct sr_discipline *sd, struct bioc_createraid *bc,
int no_chunk, void *data)
{
+ int ch;
+ if (bc->bc_flags & BIOC_SCCHKSUM) {
+ int64_t coerced_size = sd->sd_vol.sv_chunks[0]
+ ->src_meta.scmi.scm_coerced_size;
+ DNPRINTF(SR_D_MISC, "RAID 1 CHKSUM: coerced size: %lld\n, ",
+ coerced_size);
+ sd->mds.mdd_raid1.sr1_coerced_size = coerced_size;
+ sd->mds.mdd_raid1.sr1_use_chksum = 1;
+ /* fixing discipline values for chksum support */
+ sd->sd_type = SR_MD_RAID1_CHKSUM;
+ strlcpy(sd->sd_name, "RAID 1C", sizeof(sd->sd_name));
+ sd->sd_openings = sr_raid1_openings;
+ sd->sd_wu_collision_detection = sr_raid1_wu_collision_detection;
+ for (ch = 0; ch < no_chunk; ch++) {
+ if (sr_raid1_sensor_create(sd, ch)) {
+ DNPRINTF(SR_D_MISC, "RAID 1C: sensor can't be"
+ " created for chunk: %d\n", ch);
+ }
+ }
+ }
+ else {
+ sd->mds.mdd_raid1.sr1_use_chksum = 0;
+ }
return sr_raid1_init(sd);
}
int
sr_raid1_init(struct sr_discipline *sd)
{
- sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
-
+ if (sd->mds.mdd_raid1.sr1_use_chksum) {
+ /*
+ * In case of chksum support we use two ccbs per chunk
+ * for read and write.
+ */
+ sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no * 2;
+ }
+ else {
+ sd->sd_max_ccb_per_wu = sd->sd_meta->ssdi.ssd_chunk_no;
+ }
return 0;
}
+int
+sr_raid1_openings(struct sr_discipline *sd)
+{
+	/* A write may use two work units per I/O, so offer half of sd_max_wu. */
+	return (sd->sd_max_wu >> 1);
+}
+
void
sr_raid1_set_chunk_state(struct sr_discipline *sd, int c, int new_state)
{
@@ -324,17 +426,24 @@ die:
int
sr_raid1_rw(struct sr_workunit *wu)
{
+ struct sr_workunit *wu_r_chksum = NULL;
struct sr_discipline *sd = wu->swu_dis;
struct scsi_xfer *xs = wu->swu_xs;
struct sr_ccb *ccb;
struct sr_chunk *scp;
int ios, chunk, i, rt;
daddr_t blkno;
+ int use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
/* blkno and scsi error will be handled by sr_validate_io */
if (sr_validate_io(wu, &blkno, "sr_raid1_rw"))
goto bad;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw: blkno: %lld, len: %d, %s,"
+ " swu_block_start %lld, swu_block_end: %lld\n", blkno, xs->datalen,
+ (xs->flags & SCSI_DATA_IN) ? "READ" : "WRITE", wu->swu_blk_start,
+ wu->swu_blk_end);
+
if (xs->flags & SCSI_DATA_IN)
ios = 1;
else
@@ -368,8 +477,8 @@ ragain:
}
} else {
/* writes go on all working disks */
- chunk = i;
- scp = sd->sd_vol.sv_chunks[chunk];
+ chunk = i;
+ scp = sd->sd_vol.sv_chunks[chunk];
switch (scp->src_meta.scm_status) {
case BIOC_SDONLINE:
case BIOC_SDSCRUB:
@@ -384,25 +493,155 @@ ragain:
goto bad;
}
}
-
- ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
- xs->flags, 0);
- if (!ccb) {
- /* should never happen but handle more gracefully */
- printf("%s: %s: too many ccbs queued\n",
- DEVNAME(sd->sd_sc),
- sd->sd_meta->ssd_devname);
- goto bad;
+ if (use_chksum) {
+ daddr_t chksum_blk = sr_raid1_chksum_blk_start(wu);
+ void* chksum_data = NULL;
+ size_t chksum_data_len = sr_raid1_chksum_data_len(wu);
+ if (xs->flags & SCSI_DATA_IN) {
+ /* read data */
+ if (sr_raid1_addio(wu, chunk, blkno,
+ xs->datalen, xs->data, xs->flags, 0, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ chksum_data = sr_block_get(sd, DEV_BSIZE
+ * chksum_data_len);
+ if (!chksum_data) {
+ goto bad;
+ }
+ struct sr_raid1c_opaque *chksum_info = malloc(
+ sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (!chksum_info) {
+ panic("%s: %s: can't allocate"
+ " chksum_info structure\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ }
+ chksum_info->write = 0;
+ chksum_info->blk_start = wu->swu_blk_start;
+ chksum_info->blk_end = wu->swu_blk_end;
+ chksum_info->data = xs->data;
+ chksum_info->len = xs->datalen;
+ chksum_info->chksum_data = chksum_data;
+ /* read chksum */
+ if (sr_raid1_addio(wu, chunk, chksum_blk,
+ DEV_BSIZE * chksum_data_len, chksum_data,
+ SCSI_DATA_IN, 0, chksum_info)) {
+ sr_block_put(sd, chksum_data,
+ DEV_BSIZE * chksum_data_len);
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ }
+ else {
+ /* write with chksum */
+ struct sr_raid1c_opaque *chksum_info = malloc(
+ sizeof(struct sr_raid1c_opaque), M_DEVBUF,
+ M_ZERO | M_NOWAIT);
+ if (!chksum_info) {
+ panic("%s: %s: can't allocate"
+ " chksum_info structure\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ }
+ if (!wu_r_chksum) {
+ if ((wu_r_chksum = sr_scsi_wu_get(sd,
+ SCSI_NOSLEEP)) == NULL) {
+ printf("%s: %s failed to get"
+ " read work unit\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ wu_r_chksum->swu_state
+ = SR_WU_INPROGRESS;
+ wu_r_chksum->swu_flags
+ |= SR_WUF_DISCIPLINE;
+ wu_r_chksum->swu_blk_start
+ = sr_raid1_chksum_blk_start(wu);
+ wu_r_chksum->swu_blk_end
+ = sr_raid1_chksum_blk_end(wu);
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_rw:"
+ " wu_r_chksum: %p\n", wu_r_chksum);
+ }
+ chksum_data = sr_block_get(sd,
+ DEV_BSIZE * chksum_data_len);
+ if (!chksum_data) {
+ goto bad;
+ }
+ chksum_info->write = 1;
+ chksum_info->blk_start = wu->swu_blk_start;
+ chksum_info->blk_end = wu->swu_blk_end;
+ chksum_info->data = xs->data;
+ chksum_info->len = xs->datalen;
+ chksum_info->chksum_data = chksum_data;
+ DNPRINTF(SR_D_CHKSUM, "rw: chksum_info: %p\n",
+ chksum_info);
+ /* read chksum */
+ if (sr_raid1_addio(wu_r_chksum, chunk,
+ chksum_blk, DEV_BSIZE * chksum_data_len,
+ chksum_data, SCSI_DATA_IN, 0,
+ chksum_info)) {
+ sr_block_put(sd, chksum_data,
+ DEV_BSIZE * chksum_data_len);
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ /* write data */
+ if (sr_raid1_addio(wu, chunk, blkno,
+ xs->datalen, xs->data, xs->flags, 0, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ /* write chksum */
+ if (sr_raid1_addio(wu, chunk, chksum_blk,
+ DEV_BSIZE * chksum_data_len, chksum_data,
+ xs->flags, SR_CCBF_FREEBUF, 0)) {
+ printf("%s: %s: too many ccbs queued"
+ " (2c)\n", DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+ }
+ }
+ else {
+ /* RAID 1 without chksum support */
+ ccb = sr_ccb_rw(sd, chunk, blkno, xs->datalen, xs->data,
+ xs->flags, 0);
+ if (!ccb) {
+ /* should never happen but handle more gracefully */
+ printf("%s: %s: too many ccbs queued\n",
+ DEVNAME(sd->sd_sc),
+ sd->sd_meta->ssd_devname);
+ goto bad;
+ }
+
+ sr_wu_enqueue_ccb(wu, ccb);
}
- sr_wu_enqueue_ccb(wu, ccb);
}
-
+ if (wu_r_chksum) {
+ /* collide write request with chksum reads */
+ wu_r_chksum->swu_blk_start = wu->swu_blk_start;
+ wu_r_chksum->swu_blk_end = wu->swu_blk_end;
+ sr_schedule_wu(wu_r_chksum);
+ }
sr_schedule_wu(wu);
return (0);
bad:
/* wu is unwound by sr_wu_put */
+ if (wu_r_chksum)
+ sr_scsi_wu_put(sd, wu_r_chksum);
return (1);
}
@@ -411,7 +650,83 @@ sr_raid1_wu_done(struct sr_workunit *wu)
{
struct sr_discipline *sd = wu->swu_dis;
struct scsi_xfer *xs = wu->swu_xs;
+ struct sr_ccb *ccb = NULL;
+
+ /* XXX - we have no way of propagating errors... */
+ if (wu->swu_flags & SR_WUF_DISCIPLINE) {
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done (read chksum):"
+ " %p, blk_s: %lld, blk_e: %lld\n", wu, wu->swu_blk_start,
+ wu->swu_blk_end);
+ /*
+ * This is read chksum wu for a data write wu, we need to
+ * free ccb->ccb_opaque which is checksum_info here since
+ * ccb_buf with chksum data is passed directly to write
+ * and we do not need chksum_info anymore.
+ */
+ TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+ if (ccb->ccb_opaque) {
+ struct sr_raid1c_opaque *chksum_info
+ = ccb->ccb_opaque;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done: free"
+ " chksum_info: %p\n", chksum_info);
+ free(chksum_info, M_DEVBUF, 0);
+ ccb->ccb_opaque = NULL;
+ }
+ }
+ return SR_WU_OK;
+ }
+
+ if (wu->swu_ios_complete != wu->swu_io_count)
+ return SR_WU_INPROGRESS;
+
+ if (xs->flags & SCSI_DATA_IN) {
+ /* read: verify chksum */
+ TAILQ_FOREACH(ccb, &wu->swu_ccb, ccb_link) {
+ if (ccb->ccb_opaque != NULL) {
+ /* ccb is chksum ccb */
+ struct sr_raid1c_opaque *chksum_info
+ = ccb->ccb_opaque;
+ if (sr_raid1_verify_chksum(chksum_info->data,
+ chksum_info->len, chksum_info->blk_start,
+ chksum_info->blk_end,
+ ccb->ccb_buf.b_data)) {
+ wu->swu_state = SR_WU_CHKSUMFAILED;
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done"
+ ": verify failed on area %lld-%lld"
+ " with wu state: %d and flags: %d"
+ " on chunk: %d\n",
+ wu->swu_blk_start, wu->swu_blk_end,
+ wu->swu_state, wu->swu_flags,
+ ccb->ccb_target);
+ /* update chunk error value */
+ if (ccb->ccb_target != -1) {
+ sd->sd_vol.sv_chunks
+ [ccb->ccb_target]
+ ->src_errs++;
+ }
+ }
+ /* free chksum ccb buf */
+ DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_done:"
+ " sr_block_put:"
+ " %p, size: %ld\n", ccb->ccb_buf.b_data,
+ ccb->ccb_buf.b_bcount);
+ sr_block_put(sd, ccb->ccb_buf.b_data,
+ ccb->ccb_buf.b_bcount);
+ ccb->ccb_buf.b_data = NULL;
+ free(chksum_info, M_DEVBUF, 0);
+ ccb->ccb_opaque = NULL;
+ }
+ }
+ }
+
+ if (wu->swu_state == SR_WU_CHKSUMFAILED) {
+ ccb = TAILQ_FIRST(&wu->swu_ccb);
+ printf("%s: chunk: %d: verify chksum failed on %lld-%lld"
+ " block(s)\n",
+ sd->sd_meta->ssd_devname, ccb->ccb_target,
+ wu->swu_blk_start, wu->swu_blk_end);
+ }
/* If at least one I/O succeeded, we are okay. */
if (wu->swu_ios_succeeded > 0) {
xs->error = XS_NOERROR;
@@ -438,4 +753,230 @@ sr_raid1_wu_done(struct sr_workunit *wu)
xs->error = XS_DRIVER_STUFFUP;
return SR_WU_FAILED;
+}
+
+/* RAID 1C collision test: 1 if data or chksum block ranges overlap, else 0. */
+int
+sr_raid1_wu_collision_detection(struct sr_workunit *wu1,
+    struct sr_workunit *wu2)
+{
+	daddr_t	cs1, ce1, cs2, ce2;
+	int	data_apart, chksum_apart;
+
+	if (wu1 == NULL || wu2 == NULL)
+		return 0;
+	cs1 = sr_raid1_chksum_blk_start(wu1);
+	ce1 = sr_raid1_chksum_blk_end(wu1);
+	cs2 = sr_raid1_chksum_blk_start(wu2);
+	ce2 = sr_raid1_chksum_blk_end(wu2);
+
+	/* No collision only when both the data and chksum ranges are apart. */
+	data_apart = wu1->swu_blk_end < wu2->swu_blk_start ||
+	    wu2->swu_blk_end < wu1->swu_blk_start;
+	chksum_apart = ce1 < cs2 || ce2 < cs1;
+	if (data_apart && chksum_apart)
+		return 0;
+	DNPRINTF(SR_D_CHKSUM, "sr_raid1_wu_collision_detection: collision"
+	    " found! wu1: %p, blk_s: %lld, blk_e: %lld,"
+	    " chksum_blk_s: %lld, chksum_blk_e: %lld, wu2: %p,"
+	    " blk_s: %lld, blk_e: %lld, chksum_blk_s: %lld, chksum_blk_e:"
+	    " %lld\n", wu1, wu1->swu_blk_start, wu1->swu_blk_end,
+	    cs1, ce1, wu2, wu2->swu_blk_start, wu2->swu_blk_end, cs2, ce2);
+	return 1;
+}
+
+/* Queue one ccb on wu, tagged with chksumbuf; 0 on success, -1 on failure. */
+int
+sr_raid1_addio(struct sr_workunit *wu, int chunk, daddr_t blkno,
+    daddr_t len, void *data, int xsflags, int ccbflags, void *chksumbuf)
+{
+	struct sr_discipline	*dis = wu->swu_dis;
+	struct sr_ccb		*io;
+
+	DNPRINTF(SR_D_CHKSUM, "sr_raid1_addio: %s chunk %d block %lld "
+	    "length %lld %s\n", (xsflags & SCSI_DATA_IN) ? "read" : "write",
+	    chunk, (long long)blkno, (long long)len,
+	    chksumbuf ? "CHKSUM" : "-");
+	/* No caller buffer: allocate one and flag it for release. */
+	if (data == NULL) {
+		if ((data = sr_block_get(dis, len)) == NULL)
+			return (-1);
+		ccbflags |= SR_CCBF_FREEBUF;
+	}
+
+	io = sr_ccb_rw(dis, chunk, blkno, len, data, xsflags, ccbflags);
+	if (io == NULL) {
+		if (ccbflags & SR_CCBF_FREEBUF)
+			sr_block_put(dis, data, len);
+		return (-1);
+	}
+	io->ccb_opaque = chksumbuf;
+	sr_wu_enqueue_ccb(wu, io);
+	return (0);
+}
+
+void
+sr_raid1_intr(struct buf *bp)
+{
+ struct sr_ccb *ccb = (struct sr_ccb *)bp;
+ struct sr_workunit *wu = ccb->ccb_wu;
+ struct sr_discipline *sd = wu->swu_dis;
+ int s;
+ int use_chksum;
+ /* ccb completion handler; sr_raid1_discipline_init() installs it as sd_scsi_intr. */
+ use_chksum = sd->mds.mdd_raid1.sr1_use_chksum;
+
+ DNPRINTF(SR_D_INTR, "%s: sr_raid1_intr bp %p xs %p\n",
+ DEVNAME(sd->sd_sc), bp, wu->swu_xs);
+ /* Everything from here to splx() runs at splbio. */
+ s = splbio();
+ sr_ccb_done(ccb);
+ /* Only a ccb in state SR_CCB_OK that carries chksum_info is handled here. */
+ if (use_chksum && ccb->ccb_state == SR_CCB_OK
+ && ccb->ccb_opaque) {
+ struct sr_raid1c_opaque *chksum_info = ccb->ccb_opaque;
+ if (chksum_info->write == 1) {
+ /* chksum blocks read for a write: recompute the slots of the written data in the read buffer */
+ sr_raid1_update_chksum(chksum_info->data,
+ chksum_info->len, chksum_info->blk_start,
+ chksum_info->blk_end, ccb->ccb_buf.b_data);
+ }
+ }
+ /* Free the data buffer when the ccb is flagged SR_CCBF_FREEBUF. */
+ if (ccb->ccb_flags & SR_CCBF_FREEBUF) {
+ sr_block_put(sd, ccb->ccb_buf.b_data, ccb->ccb_buf.b_bcount);
+ ccb->ccb_buf.b_data = NULL;
+ }
+ sr_wu_done(wu);
+ splx(s);
+}
+
+/* Check data (blk_start..blk_end) against stored slots; 0 if OK, -1 if not. */
+int
+sr_raid1_verify_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+	int32_t chksum_n = blk_start % CHKSUM_IN_BLOCK;	/* first slot index */
+	int32_t chksum_count = blk_end - blk_start + 1;	/* sectors to check */
+	u_int64_t *chksum = chksum_buf;	/* fixed 8-byte (CHKSUM_SIZE) slots */
+	Bytef *buf = data;
+	int32_t i;
+
+	for (i = 0; i < chksum_count; i++, chksum_n++) {
+		uLong crc = crc32(0L, Z_NULL, 0);
+		crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+		if ((u_int64_t)crc == chksum[chksum_n])
+			continue;
+		DNPRINTF(SR_D_CHKSUM, "verify failed on comparison"
+		    " block chksum(%lu) and saved chksum(%llu)[%d],"
+		    " called for start: %lld, end: %lld, n: %d, failed"
+		    " block: %lld\n", crc, (unsigned long long)chksum[chksum_n],
+		    chksum_n, blk_start, blk_end, chksum_n, blk_start + i);
+		return (-1);
+	}
+	return 0;
+}
+
+/* Store the crc32 of every sector of data into its slot; always returns 0. */
+uLong
+sr_raid1_update_chksum(void* data, int len, daddr_t blk_start, daddr_t blk_end,
+    void* chksum_buf)
+{
+	int32_t chksum_n = blk_start % CHKSUM_IN_BLOCK;	/* first slot index */
+	int32_t chksum_count = blk_end - blk_start + 1;	/* sectors to sum */
+	u_int64_t *chksum = chksum_buf;	/* fixed 8-byte (CHKSUM_SIZE) slots */
+	Bytef *buf = data;
+	int32_t i;
+
+	DNPRINTF(SR_D_CHKSUM, "update chksum: start: %lld, end: %lld, n: %d\n",
+	    blk_start, blk_end, chksum_n);
+	DNPRINTF(SR_D_CHKSUM, "blocks chksumed: ");
+	for (i = 0; i < chksum_count; i++, chksum_n++) {
+		uLong crc = crc32(0L, Z_NULL, 0);
+		crc = crc32(crc, &(buf[i * DEV_BSIZE]), DEV_BSIZE);
+		chksum[chksum_n] = crc;	/* zero-extended to the slot width */
+		DNPRINTF(SR_D_CHKSUM, "%lld->(%lu)[%d], ", (blk_start + i),
+		    crc, chksum_n);
+	}
+	DNPRINTF(SR_D_CHKSUM, "\n");
+	return 0;
+}
+
+int
+sr_raid1_sensor_create(struct sr_discipline* sd, int chno)
+{
+ struct sr_chunk *chunk;
+ /* Attach an integer sensor, described by the chunk's device name, for chunk chno. */
+ chunk = sd->sd_vol.sv_chunks[chno];
+ chunk->src_errs = 0; /* error counter starts from zero */
+ chunk->src_sensor.type = SENSOR_INTEGER;
+ chunk->src_sensor.status = SENSOR_S_OK;
+ strlcpy(chunk->src_sensor.desc, chunk->src_devname,
+ sizeof(chunk->src_sensor.desc));
+ sensor_attach(&sd->sd_sc->sc_sensordev, &chunk->src_sensor);
+ chunk->src_sensor_attached = 1; /* tested by sr_sensors_delete() before sensor_detach() */
+ return 0; /* no failure path here; callers still test the result */
+}
+
+/*
+ * Block number holding the checksum slot of wu's first data block.  The chksum
+ * area begins at ssd_size; CHKSUM_IN_BLOCK data blocks share one chksum block.
+ */
+daddr_t
+sr_raid1_chksum_blk_start(struct sr_workunit *wu)
+{
+	daddr_t first = wu->swu_blk_start;
+
+	return wu->swu_dis->sd_meta->ssdi.ssd_size +
+	    (first / CHKSUM_IN_BLOCK);
+}
+
+/* Last (inclusive) checksum block touched by wu: start + length - 1. */
+daddr_t
+sr_raid1_chksum_blk_end(struct sr_workunit *wu)
+{
+	return sr_raid1_chksum_blk_start(wu) + sr_raid1_chksum_data_len(wu) - 1;
+}
+
+/* Number of DEV_BSIZE checksum blocks spanned by the slots of wu's blocks. */
+size_t
+sr_raid1_chksum_data_len(struct sr_workunit* wu)
+{
+	size_t chksum_len;	/* bytes of slots covered by wu */
+	size_t chksum_offset;	/* byte offset of the first slot in its block */
+
+	chksum_len = (wu->swu_blk_end - wu->swu_blk_start + 1) * CHKSUM_SIZE;
+	chksum_offset = (wu->swu_blk_start % CHKSUM_IN_BLOCK) * CHKSUM_SIZE;
+
+	/* Round up: a range ending on a block boundary needs no extra block. */
+	return (chksum_len + chksum_offset + DEV_BSIZE - 1) / DEV_BSIZE;
+}
+
+/*
+ * debug code used in softraid.c directly: prints msg followed by the wu's
+ * kind, data block range and checksum block range.
+ */
+void
+sr_raid1_print_wu(struct sr_workunit *wu, int f, const char* msg)
+{
+	/* msg is plain text from the caller: print it via "%s", not as format. */
+	DNPRINTF(f, "%s", msg);
+	if (wu->swu_flags & SR_WUF_DISCIPLINE) {
+		DNPRINTF(f, "(read chksum) %p, blk_s: %lld, blk_e: %lld,"
+		    " chksum_s: %lld, chksum_e: %lld\n", wu, wu->swu_blk_start,
+		    wu->swu_blk_end, sr_raid1_chksum_blk_start(wu),
+		    sr_raid1_chksum_blk_end(wu));
+	} else if (wu->swu_xs->flags & SCSI_DATA_IN) {
+		DNPRINTF(f, "(read) %p, blk_s: %lld, blk_e: %lld,"
+		    " chksum_s: %lld, chksum_e: %lld\n", wu,
+		    wu->swu_blk_start, wu->swu_blk_end,
+		    sr_raid1_chksum_blk_start(wu),
+		    sr_raid1_chksum_blk_end(wu));
+	} else {
+		DNPRINTF(f, "(write) %p, blk_s: %lld, blk_e: %lld,"
+		    " chksum_s: %lld, chksum_e: %lld\n", wu,
+		    wu->swu_blk_start, wu->swu_blk_end,
+		    sr_raid1_chksum_blk_start(wu),
+		    sr_raid1_chksum_blk_end(wu));
+	}
+}
Index: sys/dev/softraidvar.h
===================================================================
RCS file: /cvs/src/sys/dev/softraidvar.h,v
retrieving revision 1.161
diff -u -p -u -r1.161 softraidvar.h
--- sys/dev/softraidvar.h 21 Jul 2015 03:30:51 -0000 1.161
+++ sys/dev/softraidvar.h 31 Aug 2015 20:02:50 -0000
@@ -307,7 +307,7 @@ SLIST_HEAD(sr_boot_volume_head, sr_boot_
#define DEVNAME(_s) ((_s)->sc_dev.dv_xname)
-/* #define SR_DEBUG */
+#define SR_DEBUG
#ifdef SR_DEBUG
extern u_int32_t sr_debug;
#define DPRINTF(x...) do { if (sr_debug) printf(x); } while(0)
@@ -322,6 +322,7 @@ extern u_int32_t sr_debug;
#define SR_D_DIS 0x0080
#define SR_D_STATE 0x0100
#define SR_D_REBUILD 0x0200
+#define SR_D_CHKSUM 0x0400
#else
#define DPRINTF(x...)
#define DNPRINTF(n,x...)
@@ -378,6 +379,7 @@ struct sr_workunit {
#define SR_WU_RESTART 7
#define SR_WU_REQUEUE 8
#define SR_WU_CONSTRUCT 9
+#define SR_WU_CHKSUMFAILED 10
int swu_flags; /* additional hints */
#define SR_WUF_REBUILD (1<<0) /* rebuild io */
@@ -426,6 +428,10 @@ struct sr_raid0 {
#define SR_RAID1_NOWU 16
struct sr_raid1 {
u_int32_t sr1_counter;
+ u_int32_t sr1_use_chksum; /* are checksum in use? */
+
+ /* original coerced size in blocks */
+ int64_t sr1_coerced_size;
};
/* RAID 5 */
@@ -474,6 +480,10 @@ struct sr_chunk {
u_char src_duid[8]; /* Chunk disklabel UID. */
int64_t src_size; /* in blocks */
+ struct ksensor src_sensor; /* Chunk specific sensor */
+ int src_sensor_attached;
+ int src_errs; /* Errors counter value */
+
SLIST_ENTRY(sr_chunk) src_link;
};
@@ -503,6 +513,7 @@ struct sr_discipline {
/* SR_MD_RAID4 was 7. */
#define SR_MD_RAID6 8
#define SR_MD_CONCAT 9
+#define SR_MD_RAID1_CHKSUM 10
char sd_name[10]; /* human readable dis name */
u_int16_t sd_target; /* scsibus target discipline uses */
@@ -512,6 +523,7 @@ struct sr_discipline {
#define SR_CAP_REBUILD 0x00000004 /* Supports rebuild. */
#define SR_CAP_NON_COERCED 0x00000008 /* Uses non-coerced size. */
#define SR_CAP_REDUNDANT 0x00000010 /* Redundant copies of data. */
+#define SR_CAP_CHKSUM 0x00000020 /* Check sums of data. */
union {
struct sr_raid0 mdd_raid0;
@@ -583,6 +595,9 @@ struct sr_discipline {
int (*sd_meta_opt_handler)(struct sr_discipline *,
struct sr_meta_opt_hdr *);
void (*sd_rebuild)(struct sr_discipline *);
+
+ int (*sd_wu_collision_detection)(
+ struct sr_workunit *, struct sr_workunit *);
/* SCSI emulation */
struct scsi_sense_data sd_scsi_sense;