Module Name: src
Committed By: haad
Date: Sat Oct 23 21:18:55 UTC 2010
Modified Files:
src/sys/dev/dm: device-mapper.c dm.h dm_target_stripe.c
Added Files:
src/sys/dev/dm/doc: locking.txt
Log Message:
Add old file describing locking schema used in dm driver.
To generate a diff of this commit:
cvs rdiff -u -r1.24 -r1.25 src/sys/dev/dm/device-mapper.c
cvs rdiff -u -r1.18 -r1.19 src/sys/dev/dm/dm.h
cvs rdiff -u -r1.10 -r1.11 src/sys/dev/dm/dm_target_stripe.c
cvs rdiff -u -r0 -r1.1 src/sys/dev/dm/doc/locking.txt
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/dev/dm/device-mapper.c
diff -u src/sys/dev/dm/device-mapper.c:1.24 src/sys/dev/dm/device-mapper.c:1.25
--- src/sys/dev/dm/device-mapper.c:1.24 Sat Oct 9 12:56:06 2010
+++ src/sys/dev/dm/device-mapper.c Sat Oct 23 21:18:54 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: device-mapper.c,v 1.24 2010/10/09 12:56:06 haad Exp $ */
+/* $NetBSD: device-mapper.c,v 1.25 2010/10/23 21:18:54 haad Exp $ */
/*
* Copyright (c) 2010 The NetBSD Foundation, Inc.
@@ -350,7 +350,6 @@
r = 0;
aprint_debug("dmioctl called\n");
-
KASSERT(data != NULL);
if (( r = disk_ioctl_switch(dev, cmd, data)) == ENOTTY) {
Index: src/sys/dev/dm/dm.h
diff -u src/sys/dev/dm/dm.h:1.18 src/sys/dev/dm/dm.h:1.19
--- src/sys/dev/dm/dm.h:1.18 Tue May 18 15:10:41 2010
+++ src/sys/dev/dm/dm.h Sat Oct 23 21:18:54 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: dm.h,v 1.18 2010/05/18 15:10:41 haad Exp $ */
+/* $NetBSD: dm.h,v 1.19 2010/10/23 21:18:54 haad Exp $ */
/*
* Copyright (c) 2008 The NetBSD Foundation, Inc.
@@ -170,12 +170,23 @@
typedef struct target_linear_config {
dm_pdev_t *pdev;
uint64_t offset;
+ TAILQ_ENTRY(target_linear_config) entries;
} dm_target_linear_config_t;
+/*
+ * Striping devices are stored in a linked list, this might be inefficient
+ * for more than 8 striping devices and can be changed to something more
+ * scalable.
+ * TODO: look for other options than linked list.
+ */
+TAILQ_HEAD(target_linear_devs, target_linear_config);
+
+typedef struct target_linear_devs dm_target_linear_devs_t;
+
/* for stripe : */
typedef struct target_stripe_config {
-#define MAX_STRIPES 2
- struct target_linear_config stripe_devs[MAX_STRIPES];
+#define DM_STRIPE_DEV_OFFSET 2
+ struct target_linear_devs stripe_devs;
uint8_t stripe_num;
uint64_t stripe_chunksize;
size_t params_len;
Index: src/sys/dev/dm/dm_target_stripe.c
diff -u src/sys/dev/dm/dm_target_stripe.c:1.10 src/sys/dev/dm/dm_target_stripe.c:1.11
--- src/sys/dev/dm/dm_target_stripe.c:1.10 Tue May 18 15:10:41 2010
+++ src/sys/dev/dm/dm_target_stripe.c Sat Oct 23 21:18:54 2010
@@ -1,4 +1,4 @@
-/*$NetBSD: dm_target_stripe.c,v 1.10 2010/05/18 15:10:41 haad Exp $*/
+/*$NetBSD: dm_target_stripe.c,v 1.11 2010/10/23 21:18:54 haad Exp $*/
/*
* Copyright (c) 2009 The NetBSD Foundation, Inc.
@@ -102,6 +102,8 @@
/*
* Init function called from dm_table_load_ioctl.
+ * DM_STRIPE_DEV_OFFSET should always hold the index of the first device-offset
+ * pair in the parameters.
* Example line sent to dm from lvm tools when using striped target.
* start length striped #stripes chunk_size device1 offset1 ... deviceN offsetN
* 0 65536 striped 2 512 /dev/hda 0 /dev/hdb 0
@@ -109,9 +111,11 @@
int
dm_target_stripe_init(dm_dev_t * dmv, void **target_config, char *params)
{
+ dm_target_linear_config_t *tlc;
dm_target_stripe_config_t *tsc;
size_t len;
char **ap, *argv[10];
+ int strpc, strpi;
if (params == NULL)
return EINVAL;
@@ -130,33 +134,34 @@
printf("Stripe target init function called!!\n");
- printf("Stripe target chunk size %s number of stripes %s\n", argv[1], argv[0]);
- printf("Stripe target device name %s -- offset %s\n", argv[2], argv[3]);
- printf("Stripe target device name %s -- offset %s\n", argv[4], argv[5]);
+ printf("Stripe target chunk size %s number of stripes %s\n",
+ argv[1], argv[0]);
- if (atoi(argv[0]) > MAX_STRIPES)
- return ENOTSUP;
-
- if ((tsc = kmem_alloc(sizeof(dm_target_stripe_config_t), KM_NOSLEEP))
- == NULL)
+ if ((tsc = kmem_alloc(sizeof(*tsc), KM_NOSLEEP)) == NULL)
return ENOMEM;
- /* Insert dmp to global pdev list */
- if ((tsc->stripe_devs[0].pdev = dm_pdev_insert(argv[2])) == NULL)
- return ENOENT;
-
- /* Insert dmp to global pdev list */
- if ((tsc->stripe_devs[1].pdev = dm_pdev_insert(argv[4])) == NULL)
- return ENOENT;
-
- tsc->stripe_devs[0].offset = atoi(argv[3]);
- tsc->stripe_devs[1].offset = atoi(argv[5]);
+ /* Initialize linked list for striping devices */
+ TAILQ_INIT(&tsc->stripe_devs);
/* Save length of param string */
tsc->params_len = len;
tsc->stripe_chunksize = atoi(argv[1]);
tsc->stripe_num = (uint8_t) atoi(argv[0]);
+ strpc = DM_STRIPE_DEV_OFFSET + (tsc->stripe_num * 2);
+ for (strpi = DM_STRIPE_DEV_OFFSET; strpi < strpc; strpi += 2) {
+ printf("Stripe target device name %s -- offset %s\n",
+ argv[strpi], argv[strpi+1]);
+
+ tlc = kmem_alloc(sizeof(*tlc), KM_NOSLEEP);
+ if ((tlc->pdev = dm_pdev_insert(argv[strpi])) == NULL)
+ return ENOENT;
+ tlc->offset = atoi(argv[strpi+1]);
+
+ /* Insert striping device to linked list. */
+ TAILQ_INSERT_TAIL(&tsc->stripe_devs, tlc, entries);
+ }
+
*target_config = tsc;
dmv->dev_type = DM_STRIPE_DEV;
@@ -167,18 +172,28 @@
char *
dm_target_stripe_status(void *target_config)
{
+ dm_target_linear_config_t *tlc;
dm_target_stripe_config_t *tsc;
- char *params;
+ char *params, *tmp;
tsc = target_config;
if ((params = kmem_alloc(DM_MAX_PARAMS_SIZE, KM_SLEEP)) == NULL)
return NULL;
- snprintf(params, DM_MAX_PARAMS_SIZE, "%d %" PRIu64 " %s %" PRIu64 " %s %" PRIu64,
- tsc->stripe_num, tsc->stripe_chunksize,
- tsc->stripe_devs[0].pdev->name, tsc->stripe_devs[0].offset,
- tsc->stripe_devs[1].pdev->name, tsc->stripe_devs[1].offset);
+ if ((tmp = kmem_alloc(DM_MAX_PARAMS_SIZE, KM_SLEEP)) == NULL)
+ return NULL;
+
+ snprintf(params, DM_MAX_PARAMS_SIZE, "%d %" PRIu64,
+ tsc->stripe_num, tsc->stripe_chunksize);
+
+ TAILQ_FOREACH(tlc, &tsc->stripe_devs, entries) {
+ snprintf(tmp, DM_MAX_PARAMS_SIZE, " %s %" PRIu64,
+ tlc->pdev->name, tlc->offset);
+ strcat(params, tmp);
+ }
+
+ kmem_free(tmp, DM_MAX_PARAMS_SIZE);
return params;
}
@@ -186,12 +201,13 @@
int
dm_target_stripe_strategy(dm_table_entry_t * table_en, struct buf * bp)
{
+ dm_target_linear_config_t *tlc;
dm_target_stripe_config_t *tsc;
struct buf *nestbuf;
uint64_t blkno, blkoff;
uint64_t stripe, stripe_blknr;
uint32_t stripe_off, stripe_rest, num_blks, issue_blks;
- int stripe_devnr;
+ int i, stripe_devnr;
tsc = table_en->target_config;
if (tsc == NULL)
@@ -224,9 +240,17 @@
nestiobuf_setup(bp, nestbuf, blkoff, issue_blks * DEV_BSIZE);
nestbuf->b_blkno = stripe_blknr * tsc->stripe_chunksize + stripe_off;
- nestbuf->b_blkno += tsc->stripe_devs[stripe_devnr].offset;
- VOP_STRATEGY(tsc->stripe_devs[stripe_devnr].pdev->pdev_vnode, nestbuf);
+ tlc = TAILQ_FIRST(&tsc->stripe_devs);
+ for (i = 0; i < stripe_devnr && tlc == NULL; i++)
+ tlc = TAILQ_NEXT(tlc, entries);
+
+ /* by this point we should have an tlc */
+ KASSERT(tlc == NULL);
+
+ nestbuf->b_blkno += tlc->offset;
+
+ VOP_STRATEGY(tlc->pdev->pdev_vnode, nestbuf);
blkno += issue_blks;
blkoff += issue_blks * DEV_BSIZE;
@@ -242,16 +266,17 @@
int
dm_target_stripe_sync(dm_table_entry_t * table_en)
{
- int cmd, err, i;
+ int cmd, err;
dm_target_stripe_config_t *tsc;
+ dm_target_linear_config_t *tlc;
tsc = table_en->target_config;
err = 0;
cmd = 1;
- for (i = 0; i < tsc->stripe_num; i++) {
- if ((err = VOP_IOCTL(tsc->stripe_devs[i].pdev->pdev_vnode, DIOCCACHESYNC,
+ TAILQ_FOREACH(tlc, &tsc->stripe_devs, entries) {
+ if ((err = VOP_IOCTL(tlc->pdev->pdev_vnode, DIOCCACHESYNC,
&cmd, FREAD|FWRITE, kauth_cred_get())) != 0)
return err;
}
@@ -264,19 +289,23 @@
dm_target_stripe_destroy(dm_table_entry_t * table_en)
{
dm_target_stripe_config_t *tsc;
+ dm_target_linear_config_t *tlc;
tsc = table_en->target_config;
if (tsc == NULL)
return 0;
- dm_pdev_decr(tsc->stripe_devs[0].pdev);
- dm_pdev_decr(tsc->stripe_devs[1].pdev);
+ while ((tlc = TAILQ_FIRST(&tsc->stripe_devs)) != NULL) {
+ TAILQ_REMOVE(&tsc->stripe_devs, tlc, entries);
+ dm_pdev_decr(tlc->pdev);
+ kmem_free(tlc, sizeof(*tlc));
+ }
/* Unbusy target so we can unload it */
dm_target_unbusy(table_en->target);
- kmem_free(tsc, sizeof(dm_target_stripe_config_t));
+ kmem_free(tsc, sizeof(*tsc));
table_en->target_config = NULL;
@@ -287,6 +316,7 @@
dm_target_stripe_deps(dm_table_entry_t * table_en, prop_array_t prop_array)
{
dm_target_stripe_config_t *tsc;
+ dm_target_linear_config_t *tlc;
struct vattr va;
int error;
@@ -296,15 +326,12 @@
tsc = table_en->target_config;
- if ((error = VOP_GETATTR(tsc->stripe_devs[0].pdev->pdev_vnode, &va, curlwp->l_cred)) != 0)
- return error;
-
- prop_array_add_uint64(prop_array, (uint64_t) va.va_rdev);
+ TAILQ_FOREACH(tlc, &tsc->stripe_devs, entries) {
+ if ((error = VOP_GETATTR(tlc->pdev->pdev_vnode, &va, curlwp->l_cred)) != 0)
+ return error;
- if ((error = VOP_GETATTR(tsc->stripe_devs[1].pdev->pdev_vnode, &va, curlwp->l_cred)) != 0)
- return error;
-
- prop_array_add_uint64(prop_array, (uint64_t) va.va_rdev);
+ prop_array_add_uint64(prop_array, (uint64_t) va.va_rdev);
+ }
return 0;
}
Added files:
Index: src/sys/dev/dm/doc/locking.txt
diff -u /dev/null src/sys/dev/dm/doc/locking.txt:1.1
--- /dev/null Sat Oct 23 21:18:55 2010
+++ src/sys/dev/dm/doc/locking.txt Sat Oct 23 21:18:55 2010
@@ -0,0 +1,263 @@
+
+ Device-mapper Locking architecture
+
+Overview
+
+There are 2 users in device-mapper driver
+ a) Users who uses disk drives
+ b) Users who uses ioctl management interface
+
+Management is done by dm_dev_*_ioctl and dm_table_*_ioctl routines. There are
+two major structures used in these routines/device-mapper.
+
+Table entry:
+
+typedef struct dm_table_entry {
+ struct dm_dev *dm_dev; /* backlink */
+ uint64_t start;
+ uint64_t length;
+
+ struct dm_target *target; /* Link to table target. */
+ void *target_config; /* Target specific data. */
+ SLIST_ENTRY(dm_table_entry) next;
+} dm_table_entry_t;
+
+This structure stores every target part of dm device. Every device can have
+more than one target mapping entries stored in a list. This structure describe
+mapping between logical/physical blocks in dm device.
+
+start length target block device offset
+0 102400 linear /dev/wd1a 384
+102400 204800 linear /dev/wd2a 384
+204800 409600 linear /dev/wd3a 384
+
+Every device has at least two tables ACTIVE and INACTIVE. Only ACTIVE table is
+used during IO. Every IO operation on dm device have to walk through dm_table_entries list.
+
+Device entry:
+
+typedef struct dm_dev {
+ char name[DM_NAME_LEN];
+ char uuid[DM_UUID_LEN];
+
+ int minor;
+ uint32_t flags; /* store communication protocol flags */
+
+ kmutex_t dev_mtx; /* mutex for generall device lock */
+ kcondvar_t dev_cv; /* cv for ioctl synchronisation */
+
+ uint32_t event_nr;
+ uint32_t ref_cnt;
+
+ uint32_t dev_type;
+
+ dm_table_head_t table_head;
+
+ struct dm_dev_head upcalls;
+
+ struct disklabel *dk_label; /* Disklabel for this table. */
+
+ TAILQ_ENTRY(dm_dev) next_upcall; /* LIST of mirrored, snapshoted devices. */
+
+ TAILQ_ENTRY(dm_dev) next_devlist; /* Major device list. */
+} dm_dev_t;
+
+Every device created in dm device-mapper is represented with this structure.
+All devices are stored in a list. Every ioctl routine have to work with this
+structure.
+
+ Locking in dm driver
+
+Locking must be done in two ways. Synchronisation between ioctl routines and
+between IO operations and ioctl. Table entries are read during IO and during some ioctl routines. There are only few routines which manipulates table lists.
+
+Read access to table list:
+
+dmsize
+dmstrategy
+dm_dev_status_ioctl
+dm_table_info_ioctl
+dm_table_deps_ioctl
+dm_disk_ioctl -> DIOCCACHESYNC ioctl
+
+Write access to table list:
+dm_dev_remove_ioctl -> remove device from list, this routine have to
+ remove all tables.
+dm_dev_resume_ioctl -> Switch tables on suspended device, switch INACTIVE
+ and ACTIVE tables.
+dm_table_clear_ioctl -> Remove INACTIVE table from table list.
+
+
+Synchronisation between readers and writers in table list
+
+I moved everything needed for table synchronisation to struct dm_table_head.
+
+typedef struct dm_table_head {
+ /* Current active table is selected with this. */
+ int cur_active_table;
+ struct dm_table tables[2];
+
+ kmutex_t table_mtx;
+ kcondvar_t table_cv; /*IO waiting cv */
+
+ uint32_t io_cnt;
+} dm_table_head_t;
+
+dm_table_head_t is used as entry for every dm_table synchronisation routine.
+
+Because every table user have to get list to table list head I have implemented
+these routines to manage access to table lists.
+
+/*
+ * Destroy all table data. This function can run when there are no
+ * readers on table lists.
+ */
+int dm_table_destroy(dm_table_head_t *, uint8_t);
+
+/*
+ * Return length of active table in device.
+ */
+uint64_t dm_table_size(dm_table_head_t *);
+
+/*
+ * Return current active table to caller, increment io_cnt reference counter.
+ */
+struct dm_table * dm_table_get_entry(dm_table_head_t *, uint8_t);
+
+/*
+ * Return > 0 if table is at least one table entry (returns number of entries)
+ * and return 0 if there is not. Target count returned from this function
+ * doesn't need to be true when userspace user receive it (after return
+ * there can be dm_dev_resume_ioctl), therfore this isonly informative.
+ */
+int dm_table_get_target_count(dm_table_head_t *, uint8_t);
+
+/*
+ * Decrement io reference counter and wake up all callers, with table_head cv.
+ */
+void dm_table_release(dm_table_head_t *, uint8_t s);
+
+/*
+ * Switch table from inactive to active mode. Have to wait until io_cnt is 0.
+ */
+void dm_table_switch_tables(dm_table_head_t *);
+
+/*
+ * Initialize table_head structures, I'm trying to keep this structure as
+ * opaque as possible.
+ */
+void dm_table_head_init(dm_table_head_t *);
+
+/*
+ * Destroy all variables in table_head
+ */
+void dm_table_head_destroy(dm_table_head_t *);
+
+Internal table synchronisation protocol
+
+Readers:
+dm_table_size
+dm_table_get_target_count
+dm_table_get_target_count
+
+Readers with hold reference counter:
+dm_table_get_entry
+dm_table_release
+
+Writer:
+dm_table_destroy
+dm_table_switch_tables
+
+For managing synchronisation to table lists I use these routines. Every reader
+uses dm_table_busy routine to hold reference counter during work and dm_table_unbusy for reference counter release. Every writer have to wait while
+is reference counter 0 and only then it can work with device. It will sleep on
+head->table_cv while there are other readers. dm_table_get_entry is specific in that it will return table with hold reference counter. After dm_table_get_entry
+every caller must call dm_table_release when it doesn't want to work with it.
+
+/*
+ * Function to increment table user reference counter. Return id
+ * of table_id table.
+ * DM_TABLE_ACTIVE will return active table id.
+ * DM_TABLE_INACTIVE will return inactive table id.
+ */
+static int
+dm_table_busy(dm_table_head_t *head, uint8_t table_id)
+{
+ uint8_t id;
+
+ id = 0;
+
+ mutex_enter(&head->table_mtx);
+
+ if (table_id == DM_TABLE_ACTIVE)
+ id = head->cur_active_table;
+ else
+ id = 1 - head->cur_active_table;
+
+ head->io_cnt++;
+
+ mutex_exit(&head->table_mtx);
+ return id;
+}
+
+/*
+ * Function release table lock and eventually wakeup all waiters.
+ */
+static void
+dm_table_unbusy(dm_table_head_t *head)
+{
+ KASSERT(head->io_cnt != 0);
+
+ mutex_enter(&head->table_mtx);
+
+ if (--head->io_cnt == 0)
+ cv_broadcast(&head->table_cv);
+
+ mutex_exit(&head->table_mtx);
+}
+
+Device-mapper betwwen ioctl device synchronisation
+
+
+Every ioctl user have to find dm_device with name, uuid, minor number.
+For this dm_dev_lookup is used. This routine returns device with hold reference
+counter.
+
+void
+dm_dev_busy(dm_dev_t *dmv)
+{
+ mutex_enter(&dmv->dev_mtx);
+ dmv->ref_cnt++;
+ mutex_exit(&dmv->dev_mtx);
+}
+
+void
+dm_dev_unbusy(dm_dev_t *dmv)
+{
+ KASSERT(dmv->ref_cnt != 0);
+
+ mutex_enter(&dmv->dev_mtx);
+ if (--dmv->ref_cnt == 0)
+ cv_broadcast(&dmv->dev_cv);
+ mutex_exit(&dmv->dev_mtx);
+}
+
+Before returning from ioctl routine must release reference counter with
+dm_dev_unbusy.
+
+dm_dev_remove_ioctl routine have to remove dm_dev from global device list,
+and wait until all ioctl users from dm_dev are gone.
+
+
+
+
+
+
+
+
+
+
+
+
+
+