[PATCH 005 of 006] raid5: Move expansion operations to a work queue
This patch modifies handle_write_operations5() to handle the parity calculation request made by the reshape code. However this patch does not move the copy operation associated with an expand to the work queue. First, it was difficult to find a clean way to pass the parameters of this operation to the queue. Second, this section of code is a good candidate for performing the copies with inline calls to the dma routines. This patch also cleans up the *_End flags which as of this version of the patch set are not needed. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 51 - include/linux/raid/raid5.h | 36 +++ 2 files changed, 54 insertions(+), 33 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 10:35:40.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 10:35:50.0 -0700 @@ -1250,16 +1250,25 @@ */ if (locked == 0) { if (rcw == 0) { - /* enter stage 1 of reconstruct write operation */ - set_bit(STRIPE_OP_RCW, &sh->state); - set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); - for (i=disks ; i-- ;) { - struct r5dev *dev = &sh->dev[i]; - - if (i!=pd_idx && dev->towrite) { - set_bit(R5_LOCKED, &dev->flags); + /* skip the drain operation on an expand */ + if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) { + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state); + for (i=disks ; i-- ;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); locked++; } + } else { /* enter stage 1 of reconstruct write operation */ + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (i!=pd_idx && dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + locked++; + } + } } } else { /* enter stage 1 of read modify write operation */ @@ -2213,16 +,24 @@ } if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + int work_queued, start_n=1; /* Need to write out all blocks after computing 
parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); - for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); + if (!(test_bit(STRIPE_OP_RCW, &sh->state) || + test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state))) { + start_n = 0; + set_bit(STRIPE_OP_RCW_Expand, &sh->ops.state); + } + work_queued = handle_write_operations5(sh, 0, start_n); + if (work_queued == 0) { + for (i= conf->raid_disks; i--;) + set_bit(R5_Wantwrite, &sh->dev[i].flags); + clear_bit(STRIPE_EXPANDING, &sh->state); + clear_bit(STRIPE_OP_RCW_Expand, &sh->ops.state); + } else if (work_queued > 0) { + locked += work_queued; } - clear_bit(STRIPE_EXPANDING, &sh->state); } else if (expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -2257,9 +2274,15 @@ release_stripe(sh2); continue; } + /
[PATCH 002 of 006] raid5: Move check parity operations to a work queue
This patch adds 'check parity' capabilities to the work queue and fixes 'queue_raid_work'. Also, raid5_do_soft_block_ops now accesses the stripe state under the lock to ensure that it is never out of sync with handle_stripe5. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 123 ++--- include/linux/raid/raid5.h | 25 ++--- 2 files changed, 113 insertions(+), 35 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 09:52:07.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 10:35:23.0 -0700 @@ -1289,7 +1289,7 @@ if (locked > 0) { set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); - sh->ops.queue_count++; + sh->ops.pending++; } else if (locked == 0) set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); @@ -1300,6 +1300,37 @@ return locked; } +static int handle_check_operations5(struct stripe_head *sh, int start_n) +{ + int complete=0, work_queued = -EBUSY; + + if (test_bit(STRIPE_OP_CHECK, &sh->state) && + test_bit(STRIPE_OP_CHECK_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_CHECK, &sh->state); + clear_bit(STRIPE_OP_CHECK_Done, &sh->ops.state); + complete = 1; + } + + if (start_n == 0) { + /* enter stage 1 of parity check operation */ + set_bit(STRIPE_OP_CHECK, &sh->state); + set_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state); + work_queued = 1; + } else if (complete) + work_queued = 0; + + if (work_queued > 0) { + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.pending++; + } + + PRINTK("%s: stripe %llu start: %d complete: %d op_state: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + start_n == 0, complete, sh->ops.state); + + return work_queued; +} + /* * Each stripe/dev can have one or more bion attached. 
@@ -1406,11 +1437,11 @@ /* must be called under the stripe lock */ static void queue_raid_work(struct stripe_head *sh) { - if (--sh->ops.queue_count == 0) { + if (!test_bit(STRIPE_OP_QUEUED, &sh->state) && sh->ops.pending) { + set_bit(STRIPE_OP_QUEUED, &sh->state); atomic_inc(&sh->count); queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work); - } else if (sh->ops.queue_count < 0) - sh->ops.queue_count = 0; + } } /* @@ -1423,16 +1454,17 @@ int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1; void *ptr[MAX_XOR_BLOCKS]; struct bio *chosen; - int overlap=0, new_work=0, written=0; - unsigned long state, ops_state; + int overlap=0, work=0, written=0; + unsigned long state, ops_state, ops_state_orig; /* take a snapshot of what needs to be done at this point in time */ spin_lock(&sh->lock); state = sh->state; - ops_state = sh->ops.state; + ops_state_orig = ops_state = sh->ops.state; spin_unlock(&sh->lock); if (test_bit(STRIPE_OP_RMW, &state)) { + BUG_ON(test_bit(STRIPE_OP_RCW, &state)); PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n", __FUNCTION__, (unsigned long long)sh->sector, ops_state); @@ -1483,14 +1515,14 @@ if (count != 1) xor_block(count, STRIPE_SIZE, ptr); - /* signal completion and acknowledge the last state seen -* by sh->ops.state -*/ + work++; set_bit(STRIPE_OP_RMW_Done, &ops_state); - set_bit(STRIPE_OP_RMW_ParityPre, &ops_state); } - } else if (test_bit(STRIPE_OP_RCW, &state)) { + } + + if (test_bit(STRIPE_OP_RCW, &state)) { + BUG_ON(test_bit(STRIPE_OP_RMW, &state)); PRINTK("%s: stripe %llu STRIPE_OP_RCW op_state: %lx\n", __FUNCTION__, (unsigned long long)sh->sector, ops_state); @@ -1527,20 +1559,47 @@ if (count != 1) xor_
[PATCH 006 of 006] raid5: Remove compute_block and compute_parity
compute_block and compute_parity5 are replaced by the work queue and the handle_*_operations5 routines. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> raid5.c | 123 1 files changed, 123 deletions(-) === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-27 16:16:31.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-27 16:19:13.0 -0700 @@ -918,129 +918,6 @@ } while(0) -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *p; - - PRINTK("compute_block, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - ptr[0] = page_address(sh->dev[dd_idx].page); - memset(ptr[0], 0, STRIPE_SIZE); - count = 1; - for (i = disks ; i--; ) { - if (i == dd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk(KERN_ERR "compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); -} - -static void compute_parity5(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS]; - struct bio *chosen; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - count = 1; - ptr[0] = page_address(sh->dev[pd_idx].page); - switch(method) { - case READ_MODIFY_WRITE: - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); - for (i=disks ; i-- ;) { - if (i==pd_idx) - continue; - if (sh->dev[i].towrite && - test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - ptr[count++] = page_address(sh->dev[i].page); - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - 
check_xor(); - } - } - break; - case RECONSTRUCT_WRITE: - memset(ptr[0], 0, STRIPE_SIZE); - for (i= disks; i-- ;) - if (i!=pd_idx && sh->dev[i].towrite) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - break; - } - if (count>1) { - xor_block(count, STRIPE_SIZE, ptr); - count = 1; - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); - } - break; - case
[PATCH 003 of 006] raid5: Move compute block operations to a work queue
This patch adds 'compute block' capabilities to the work queue. Here are a few notes about the new flags R5_ComputeReq and STRIPE_OP_COMPUTE_Recover: Previously, when handle_stripe5 found a block that needed to be computed it updated it in the same step. Now that these operations are separated (across multiple calls to handle_stripe5), a R5_ComputeReq flag is needed to tell other parts of handle_stripe5 to treat the block under computation as if it were up to date. The order of events in the work queue ensures that the block is indeed up to date before performing further operations. STRIPE_OP_COMPUTE_Recover was added to track when the parity block is being computed due to a failed parity check. This allows the code in handle_stripe5 that produces requests for check_parity and compute_block operations to be separate from the code that consumes the result. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 147 + include/linux/raid/raid5.h |7 +- 2 files changed, 129 insertions(+), 25 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 10:47:43.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 11:06:06.0 -0700 @@ -1263,7 +1263,9 @@ } } else { /* enter stage 1 of read modify write operation */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags))); + set_bit(STRIPE_OP_RMW, &sh->state); set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state); for (i=disks ; i-- ;) { @@ -1272,7 +1274,8 @@ continue; if (dev->towrite && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_ComputeReq, &dev->flags))) { set_bit(R5_LOCKED, &dev->flags); locked++; } @@ -1331,6 +1334,30 @@ return work_queued; } +static int handle_compute_operations5(struct stripe_head *sh, int dd_idx) +{ + int work_queued = -EBUSY; + + if (test_bit(STRIPE_OP_COMPUTE, 
&sh->state) && + test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_COMPUTE, &sh->state); + clear_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state); + clear_bit(R5_ComputeReq, &sh->dev[dd_idx].flags); + work_queued = 0; + } else if (!test_bit(STRIPE_OP_COMPUTE, &sh->state)) { + set_bit(STRIPE_OP_COMPUTE, &sh->state); + set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state); + set_bit(R5_ComputeReq, &sh->dev[dd_idx].flags); + work_queued = 1; + sh->ops.pending++; + } + + PRINTK("%s: stripe %llu work_queued: %d op_state: %lx dev[%d].flags: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + work_queued, sh->ops.state, dd_idx, sh->dev[dd_idx].flags); + + return work_queued; +} /* * Each stripe/dev can have one or more bion attached. @@ -1454,7 +1481,7 @@ int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1; void *ptr[MAX_XOR_BLOCKS]; struct bio *chosen; - int overlap=0, work=0, written=0; + int overlap=0, work=0, written=0, compute=0, dd_idx=0; unsigned long state, ops_state, ops_state_orig; /* take a snapshot of what needs to be done at this point in time */ @@ -1463,6 +1490,51 @@ ops_state_orig = ops_state = sh->ops.state; spin_unlock(&sh->lock); + if (test_bit(STRIPE_OP_COMPUTE, &state)) { + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_ComputeReq, &dev->flags)) { + dd_idx = i; + i = -1; + break; + } + } + BUG_ON(i >= 0); + PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: %d\n", + __FUNCTION__, (unsigned long long)sh->sector, +
[PATCH 004 of 006] raid5: Move read completion copies to a work queue
This patch moves the data copying portion of satisfying read requests into the work queue. It adds a 'read' (past tense) pointer to the r5dev structure to to track reads that have been offloaded to the work queue. When the copy operation is complete the 'read' pointer is reused as the return_bi for the bi_end_io() call. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 94 - include/linux/raid/raid5.h |6 +- 2 files changed, 71 insertions(+), 29 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 10:35:31.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 10:35:40.0 -0700 @@ -213,11 +213,11 @@ for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -1490,6 +1490,35 @@ ops_state_orig = ops_state = sh->ops.state; spin_unlock(&sh->lock); + if (test_bit(STRIPE_OP_BIOFILL, &state)) { + raid5_conf_t *conf = sh->raid_conf; + struct bio *return_bi=NULL; + PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL op_state: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + ops_state); + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + struct bio *rbi, *rbi2; + rbi = dev->read; + while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { + copy_data(0, rbi, dev->page, dev->sector); + rbi2 = r5_next_bio(rbi, dev->sector); + spin_lock_irq(&conf->device_lock); + if (--rbi->bi_phys_segments == 0) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + dev->read = return_bi; + } + } + + work++; + 
set_bit(STRIPE_OP_BIOFILL_Done, &ops_state); + } + if (test_bit(STRIPE_OP_COMPUTE, &state)) { for (i=disks ; i-- ;) { struct r5dev *dev = &sh->dev[i]; @@ -1725,6 +1754,7 @@ int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; + int fill_complete=0, to_fill=0; int non_overwrite = 0; int failed_num=0; struct r5dev *dev; @@ -1740,45 +1770,49 @@ syncing = test_bit(STRIPE_SYNCING, &sh->state); expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ + if (test_bit(STRIPE_OP_BIOFILL, &sh->state) && + test_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_BIOFILL, &sh->state); + clear_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state); + fill_complete++; + } + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); + PRINTK("check %d: state 0x%lx toread %p read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->
[PATCH 000 of 006] raid5: Offload RAID operations to a workqueue
This patch set is a step towards enabling hardware offload in the md-raid5 driver. These patches are considered experimental and are not yet suitable for production environments. As mentioned, this patch set is the first step in that it moves work from handle_stripe5 to a work queue. The next step is to enable the work queue to offload the operations to hardware copy/xor engines using the dmaengine API (include/linux/dmaengine.h). Initial testing shows that about 60% of the array maintenance work previously performed by raid5d has moved to the work queue. These patches apply to the version of md as of commit 266bee88699ddbde42ab303bbc426a105cc49809 in Linus' tree. Regards, Dan Williams [PATCH 001 of 006] raid5: Move write operations to a work queue [PATCH 002 of 006] raid5: Move check parity operations to a work queue [PATCH 003 of 006] raid5: Move compute block operations to a work queue [PATCH 004 of 006] raid5: Move read completion copies to a work queue [PATCH 005 of 006] raid5: Move expansion operations to a work queue [PATCH 006 of 006] raid5: Remove compute_block and compute_parity - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 001 of 006] raid5: Move write operations to a work queue
This patch moves write (reconstruct and read-modify) operations to a work queue. Note the next patch in this series fixes some incorrect assumptions around having multiple operations in flight (i.e. ignore this version of 'queue_raid_work'). Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 314 + include/linux/raid/raid5.h | 67 + 2 files changed, 357 insertions(+), 24 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 08:44:11.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-28 09:52:07.0 -0700 @@ -305,6 +305,7 @@ memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); + INIT_WORK(&sh->ops.work, conf->do_block_ops, sh); if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -1224,6 +1225,80 @@ } } +static int handle_write_operations5(struct stripe_head *sh, int rcw, int locked) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int complete=0; + + if (test_bit(STRIPE_OP_RCW, &sh->state) && + test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_RCW, &sh->state); + clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state); + complete++; + } + + if (test_bit(STRIPE_OP_RMW, &sh->state) && + test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_RMW, &sh->state); + clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state); + BUG_ON(++complete == 2); + } + + + /* If no operation is currently in process then use the rcw flag to +* select an operation +*/ + if (locked == 0) { + if (rcw == 0) { + /* enter stage 1 of reconstruct write operation */ + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (i!=pd_idx && dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + locked++; + } + } + } else { + /* enter stage 1 of read modify write operation */ + BUG_ON(!test_bit(R5_UPTODATE, 
&sh->dev[pd_idx].flags)); + set_bit(STRIPE_OP_RMW, &sh->state); + set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (i==pd_idx) + continue; + + if (dev->towrite && + test_bit(R5_UPTODATE, &dev->flags)) { + set_bit(R5_LOCKED, &dev->flags); + locked++; + } + } + } + } else if (locked && complete == 0) /* the queue has an operation in flight */ + locked = -EBUSY; + else if (complete) + locked = 0; + + /* keep the parity disk locked while asynchronous operations +* are in flight +*/ + if (locked > 0) { + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + sh->ops.queue_count++; + } else if (locked == 0) + set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + + PRINTK("%s: stripe %llu locked: %d complete: %d op_state: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, complete, sh->ops.state); + + return locked; +} /* @@ -1320,6 +1395,174 @@ return pd_idx; } +static inline void drain_bio(struct bio *wbi, sector_t sector, struct page *page) +{ + while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { + copy_data(1, wbi, page, sector); + wbi = r5_next_bio(wbi, sector); + }
Re: [PATCH 000 of 006] raid5: Offload RAID operations to a workqueue
> Hi, > > since using work queues involve more context switches than doing things > inline... have you measured the performance impact of your changes? If > so... was there any impact that you could measure, and how big was that? > > Greetings, > Arjan van de Ven Good point. Especially on ARM extra context switching can be very expensive. In general more testing (and testers for that matter) is needed. To facilitate the determination of whether a multi-threaded work queue is better/worse than an in context implementation here is a patch that makes this configurable. Thanks, Dan [PATCH] raid5: Configuration options to allow raid ops to run in raid5d context Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/Kconfig | 21 + drivers/md/raid5.c | 25 + include/linux/raid/raid5.h |6 ++ 3 files changed, 52 insertions(+) Index: linux-2.6-raid/drivers/md/Kconfig === --- linux-2.6-raid.orig/drivers/md/Kconfig 2006-06-29 11:40:02.0 -0700 +++ linux-2.6-raid/drivers/md/Kconfig 2006-06-29 13:43:03.0 -0700 @@ -162,6 +162,27 @@ There should be enough spares already present to make the new array workable. +config MD_RAID456_WORKQUEUE + depends on MD_RAID456 + bool "Offload raid work to a workqueue from raid5d" + ---help--- + This option enables raid work (block copy and xor operations) + to run in a workqueue. However this may come at the expense of + extra context switching. Single processor systems may benefit + from keeping the work within the raid5d context. + + If unsure say, Y. + +config MD_RAID456_WORKQUEUE_MULTITHREAD + depends on MD_RAID456_WORKQUEUE && SMP + bool "Enable multi-threaded raid processing" + default y + ---help--- + This option controls whether the raid workqueue will be multi- + threaded or single threaded. + + If unsure say, Y. 
+ config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-29 13:42:57.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-29 13:43:03.0 -0700 @@ -305,7 +305,9 @@ memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); + #ifdef CONFIG_MD_RAID456_WORKQUEUE INIT_WORK(&sh->ops.work, conf->do_block_ops, sh); + #endif if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -1352,8 +1354,10 @@ { if (!test_bit(STRIPE_OP_QUEUED, &sh->state) && sh->ops.pending) { set_bit(STRIPE_OP_QUEUED, &sh->state); + #ifdef CONFIG_MD_RAID456_WORKQUEUE atomic_inc(&sh->count); queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work); + #endif } } @@ -1614,7 +1618,9 @@ queue_raid_work(sh); spin_unlock(&sh->lock); + #ifdef CONFIG_MD_RAID456_WORKQUEUE release_stripe(sh); + #endif } /* @@ -2182,6 +2188,13 @@ spin_unlock(&sh->lock); + #ifndef CONFIG_MD_RAID456_WORKQUEUE + while (test_bit(STRIPE_OP_QUEUED, &sh->state)) { + PRINTK("run do_block_ops\n", __FUNCTION__); + conf->do_block_ops(sh); + } + #endif + while ((bi=return_bi)) { int bytes = bi->bi_size; @@ -3480,12 +3493,20 @@ goto abort; } + #ifdef CONFIG_MD_RAID456_WORKQUEUE sprintf(conf->workqueue_name, "%s_raid5_ops", mddev->gendisk->disk_name); + #ifdef CONFIG_MD_RAID456_MULTITHREAD if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name)) == NULL) goto abort; + #else + if ((conf->block_ops_queue = __create_workqueue(conf->workqueue_name, 1)) +== NULL) + goto abort; + #endif + #endif /* To Do: * 1/ Offload to asynchronous copy / xor engines @@ -3656,8 +3677,10 @@ safe_put_page(conf->spare_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); + #ifdef CONFIG_MD_RAID456_WORKQUEUE if (conf->do_block_ops) destroy_workqueue(conf->block_ops_queue); + #endif
Re: [PATCH 004 of 006] raid5: Move read completion copies to a work queue
Minor refresh to make 'biofill' go through a test_and_clear_bit check before performing the copy. Which is important for the hardware offload implementation where operations might need to be retried until DMA resources are available. - This patch moves the data copying portion of satisfying read requests into the workqueue. It adds a 'read' (past tense) pointer to the r5dev structure to to track reads that have been offloaded to the workqueue. When the copy operation is complete the 'read' pointer is reused as the return_bi for the bi_end_io() call. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 98 - include/linux/raid/raid5.h |7 ++- 2 files changed, 76 insertions(+), 29 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-28 11:06:06.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-29 11:43:35.0 -0700 @@ -213,11 +213,11 @@ for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -1490,6 +1490,38 @@ ops_state_orig = ops_state = sh->ops.state; spin_unlock(&sh->lock); + if (test_bit(STRIPE_OP_BIOFILL, &state)) { + PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL op_state: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + ops_state); + + if (test_and_clear_bit(STRIPE_OP_BIOFILL_Copy, &ops_state)) { + raid5_conf_t *conf = sh->raid_conf; + struct bio *return_bi=NULL; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + struct bio *rbi, *rbi2; + rbi = dev->read; + while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { + copy_data(0, rbi, dev->page, 
dev->sector); + rbi2 = r5_next_bio(rbi, dev->sector); + spin_lock_irq(&conf->device_lock); + if (--rbi->bi_phys_segments == 0) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + dev->read = return_bi; + } + } + + work++; + set_bit(STRIPE_OP_BIOFILL_Done, &ops_state); + } + } + if (test_bit(STRIPE_OP_COMPUTE, &state)) { for (i=disks ; i-- ;) { struct r5dev *dev = &sh->dev[i]; @@ -1725,6 +1757,7 @@ int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; + int fill_complete=0, to_fill=0; int non_overwrite = 0; int failed_num=0; struct r5dev *dev; @@ -1740,45 +1773,49 @@ syncing = test_bit(STRIPE_SYNCING, &sh->state); expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ + if (test_bit(STRIPE_OP_BIOFILL, &sh->state) && + test_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_BIOFILL, &sh->state); + clear_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state); + fill_complete++; + } + /* Now to look around and see what can be done */ rcu
Re: [PATCH 005 of 006] raid5: Move expansion operations to a work queue
Refresh to apply on top the new version of [PATCH 004 of 006]. --- This patch modifies handle_write_operations5() to handle the parity calculation request made by the reshape code. However this patch does not move the copy operation associated with an expand to the work queue. First, it was difficult to find a clean way to pass the parameters of this operation to the queue. Second, this section of code is a good candidate for performing the copies with inline calls to the dma routines. This patch also cleans up the *_End flags which as of this version of the patch set are not needed. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> drivers/md/raid5.c | 51 - include/linux/raid/raid5.h | 36 +++ 2 files changed, 54 insertions(+), 33 deletions(-) === Index: linux-2.6-raid/drivers/md/raid5.c === --- linux-2.6-raid.orig/drivers/md/raid5.c 2006-06-29 11:43:35.0 -0700 +++ linux-2.6-raid/drivers/md/raid5.c 2006-06-29 11:44:30.0 -0700 @@ -1250,16 +1250,25 @@ */ if (locked == 0) { if (rcw == 0) { - /* enter stage 1 of reconstruct write operation */ - set_bit(STRIPE_OP_RCW, &sh->state); - set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); - for (i=disks ; i-- ;) { - struct r5dev *dev = &sh->dev[i]; - - if (i!=pd_idx && dev->towrite) { - set_bit(R5_LOCKED, &dev->flags); + /* skip the drain operation on an expand */ + if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) { + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state); + for (i=disks ; i-- ;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); locked++; } + } else { /* enter stage 1 of reconstruct write operation */ + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (i!=pd_idx && dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + locked++; + } + } } } else { /* enter stage 1 of read modify write operation */ @@ -2217,16 +2226,24 @@ } if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + int 
work_queued, start_n=1; /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); - for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); + if (!(test_bit(STRIPE_OP_RCW, &sh->state) || + test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state))) { + start_n = 0; + set_bit(STRIPE_OP_RCW_Expand, &sh->ops.state); + } + work_queued = handle_write_operations5(sh, 0, start_n); + if (work_queued == 0) { + for (i= conf->raid_disks; i--;) + set_bit(R5_Wantwrite, &sh->dev[i].flags); + clear_bit(STRIPE_EXPANDING, &sh->state); + clear_bit(STRIPE_OP_RCW_Expand, &sh->ops.state); + } else if (work_queued > 0) { + locked += work_queued; } - clear_bit(STRIPE_EXPANDING, &sh->state); } else if (expanded) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); @@ -2261,9 +2278,15 @@ release_stripe(sh2);
Re: Hardware assisted parity computation - is it now worth it?
On 7/13/06, Burn Alting <[EMAIL PROTECTED]> wrote: Last year, there were discussions on this list about the possible use of a 'co-processor' (Intel's IOP333) to compute raid 5/6's parity data. The MD patches have been posted for review, and the hardware offload pieces are nearing completion. We are about to see low cost, multi core cpu chips with very high speed memory bandwidth. In light of this, is there any effective benefit to such devices as the IOP333? It is true that upcoming server platforms have an abundance of CPU cycles, but what about the case where an IOP is the host processor? This is the primary target of the current work. Also, what about the more expensive RAID6 conditions (2-failed disks) where there might be benefits to having MD split its work over many CPUs? Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: In Trouble--Please Help! (was Re: Can't add disk to failed raid array)
On 7/23/06, Paul Waldo <[EMAIL PROTECTED]> wrote: Here is the dmesg output. No log files are created with the FC5 rescue disk. Thanks! I ran into this as well, I believe at this point you want to set: md-mod.start_dirty_degraded=1 as part of your boot options. Understand you may see some filesystem corruption as noted in the documentation. See: http://www.linux-m32r.org/lxr/http/source/Documentation/md.txt?v=2.6.17#L54 Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: In Trouble--Please Help! (was Re: Can't add disk to failed raid array)
I'll certainly give that a try later on, as I need physical access to the box. The corruption part is worrisome... When you did this, did you experience corruption? I'm running RAID6 with 7 disks; presumably even with two disks out of whack, I should be in good shape...??? I was running a 5 disk RAID-5 and did not detect any corruption. Neil correct me if I am wrong, but I believe that since your failure occurred without power loss that the chances for data corruption in this case are small. Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 001 of 006] raid5: Move write operations to a work queue
On 7/27/06, Yuri Tikhonov <[EMAIL PROTECTED]> wrote: Hello, Dan. I've looked through your patches, and have some suggestions about write operations processing. Thanks for reviewing the code. In the current implementation of the Raid5 driver the RMW operation won't begin until old blocks in the stripe cache, which are to be rewritten, become UPTODATE. But if you have dedicated h/w DMA engines, then, while an IOC (input/output controller) performs transmission of the old strip data from the disk to the stripe cache, it may make sense to start a DMA engine, which will transmit new strip data from the bio requested to write. So, when an IOC operation completes, we'll already have all necessary data to compute new parity value. 1) For the current implementation: Trmw = Tioc1 + Txor1 + Tdma + Txor2 + Tioc2, where Tioc1 is the time it takes to update stripe cache with old data, Txor1 is the time it takes to subtract old data from old parity value, Tdma is the time it takes to update strip with new data, Txor2 is the time it takes to compute new parity, and Tioc2 is the time it takes to transfer updated data to disks. So, Trmw = 2*Tioc + 2*Txor + Tdma 2) If copying old and new data to stripe cache is performed simultaneously, then time to complete the whole RMW operation will take: T'rmw = max(Tioc1, Tdma) + 2*Txor + Tioc2, where Tioc1 is the time it takes to update stripe cache with old data, Tdma is the time it takes to update strip with new data, 2*Txor is the time it takes to compute new parity, and Tioc2 is the time it takes to transfer updated data to disks. So, T'rmw = 2*Tioc + 2*Txor. (in any case, I think that Tioc > Tdma, because Tioc corresponds to the time spent reading from disk, and Tdma corresponds to operations with SDRAM, which are faster). Also, 2*Txor for (2) is less than 2*Txor for (1), because in (2) approach we have to prepare XOR engine descriptors only once, but in the (1) approach - twice. 
Does it make sense to revise your Raid5 driver implementation to allow IOC and DMA to have separate destination buffers? That is, some kind of a stripe shadow. IOC will copy to the regular buffer in the stripe cache, DMA - to the shadow one. The issue I see with this is that Tioc1 is orders of magnitude greater than Tdma. So while I agree there may be room to get some pre-work done while the reads are in flight I do not expect that the performance increase would be significant, and definitely not worth the design complexity of adding a "shadow buffer". Regards, Yuri. Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: raid 5 read performance
Neil hello Sorry for the delay. too many things to do. I have implemented all said in : http://www.spinics.net/lists/raid/msg11838.html As always I have some questions: 1. mergeable_bvec I did not understand first i must admit. now i do not see how it differs from the one of raid0. so i actually copied it and renamed it. 2. statistics. i have added md statistics since the code does not reach the statics in make_request. it returns from make_request before that. 3. i have added the new retry list called toread_aligned to raid5_conf_t . hope this is correct. 4. your instructions are to add a failed bio to sh, but it does not say to handle it directly. i have tried it and something is missing here. raid5d handle stripes only if conf->handle_list is not empty. i added handle_stripe and and release_stripe of my own. this way i managed to get from the completion routine: "R5: read error corrected!! " message . ( i have tested by failing a ram disk ). 5. I am going to test the non common path heavily before submitting you the patch ( on real disks and use several file systems and several chunk sizes). It is quite a big patch so I need to know which kernel do you want me to use ? i am using poor 2.6.15. I thank you -- Raz Hi Raz, What is the status of this patch? Anything I can help out with, like testing or forward porting to the latest kernel? Thanks, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: raid 5 read performance
On 8/5/06, Raz Ben-Jehuda(caro) <[EMAIL PROTECTED]> wrote: patch is applied by Neil. I do not know when he is going to apply it. i have applied it on my systems ( on 2.6.15 ) but they are currently in the lab and not in production. Raz. PS I must say that it saves lots of cpu cycles. Did you send the 2.6.15 patch in a private message? I can't find it in the archives. Thanks, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Linux: Why software RAID?
On 8/23/06, H. Peter Anvin <[EMAIL PROTECTED]> wrote: Chris Friesen wrote: > Jeff Garzik wrote: > >> But anyway, to help answer the question of hardware vs. software RAID, >> I wrote up a page: >> >> http://linux.yyz.us/why-software-raid.html > > Just curious...with these guys > (http://www.bigfootnetworks.com/KillerOverview.aspx) putting linux on a > PCI NIC to allow them to bypass Windows' network stack, has anyone ever > considered doing "hardware" raid by using an embedded cpu running linux > software RAID, with battery-backed memory? > > It would theoretically allow you to remain feature-compatible by > downloading new kernels to your RAID card. > Yes. In fact, I have been told by several RAID chip vendors that their customers are *strongly* demanding that their chips be able to run Linux md (and still use whatever hardware offload features.) So it's happening. Speaking of md with hardware offload features: http://prdownloads.sourceforge.net/xscaleiop/ols_paper_2006.pdf?download -hpa Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/19] Hardware Accelerated MD RAID5: Introduction
Neil, The following patches implement hardware accelerated raid5 for the Intel Xscale® series of I/O Processors. The MD changes allow stripe operations to run outside the spin lock in a work queue. Hardware acceleration is achieved by using a dma-engine-aware work queue routine instead of the default software only routine. Since the last release of the raid5 changes many bug fixes and other improvements have been made as a result of stress testing. See the per patch change logs for more information about what was fixed. This release is the first release of the full dma implementation. The patches touch 3 areas, the md-raid5 driver, the generic dmaengine interface, and a platform device driver for IOPs. The raid5 changes follow your comments concerning making the acceleration implementation similar to how the stripe cache handles I/O requests. The dmaengine changes are the second release of this code. They expand the interface to handle more than memcpy operations, and add a generic raid5-dma client. The iop-adma driver supports dma memcpy, xor, xor zero sum, and memset across all IOP architectures (32x, 33x, and 13xx). Concerning the context switching performance concerns raised at the previous release, I have observed the following. For the hardware accelerated case it appears that performance is always better with the work queue than without since it allows multiple stripes to be operated on simultaneously. I expect the same for an SMP platform, but so far my testing has been limited to IOPs. For a single-processor non-accelerated configuration I have not observed performance degradation with work queue support enabled, but in the Kconfig option help text I recommend disabling it (CONFIG_MD_RAID456_WORKQUEUE). Please consider the patches for -mm. 
-Dan [PATCH 01/19] raid5: raid5_do_soft_block_ops [PATCH 02/19] raid5: move write operations to a workqueue [PATCH 03/19] raid5: move check parity operations to a workqueue [PATCH 04/19] raid5: move compute block operations to a workqueue [PATCH 05/19] raid5: move read completion copies to a workqueue [PATCH 06/19] raid5: move the reconstruct write expansion operation to a workqueue [PATCH 07/19] raid5: remove compute_block and compute_parity5 [PATCH 08/19] dmaengine: enable multiple clients and operations [PATCH 09/19] dmaengine: reduce backend address permutations [PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to clients [PATCH 11/19] dmaengine: add memset as an asynchronous dma operation [PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not support memcpy [PATCH 13/19] dmaengine: add support for dma xor zero sum operations [PATCH 14/19] dmaengine: add dma_sync_wait [PATCH 15/19] dmaengine: raid5 dma client [PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID engines [PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs [PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization [PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver Note, the iop3xx patches apply against the iop3xx platform code re-factoring done by Lennert Buytenhek. His patches are reproduced, with permission, on the Xscale IOP SourceForge site. 
Also available on SourceForge: Linux Symposium Paper: MD RAID Acceleration Support for Asynchronous DMA/XOR Engines http://prdownloads.sourceforge.net/xscaleiop/ols_paper_2006.pdf?download Tar archive of the patch set http://prdownloads.sourceforge.net/xscaleiop/md_raid_accel-2.6.18-rc6.tar.gz?download [PATCH 01/19] http://prdownloads.sourceforge.net/xscaleiop/md-add-raid5-do-soft-block-ops.patch?download [PATCH 02/19] http://prdownloads.sourceforge.net/xscaleiop/md-move-write-operations-to-a-workqueue.patch?download [PATCH 03/19] http://prdownloads.sourceforge.net/xscaleiop/md-move-check-parity-operations-to-a-workqueue.patch?download [PATCH 04/19] http://prdownloads.sourceforge.net/xscaleiop/md-move-compute-block-operations-to-a-workqueue.patch?download [PATCH 05/19] http://prdownloads.sourceforge.net/xscaleiop/md-move-read-completion-copies-to-a-workqueue.patch?download [PATCH 06/19] http://prdownloads.sourceforge.net/xscaleiop/md-move-expansion-operations-to-a-workqueue.patch?download [PATCH 07/19] http://prdownloads.sourceforge.net/xscaleiop/md-remove-compute_block-and-compute_parity5.patch?download [PATCH 08/19] http://prdownloads.sourceforge.net/xscaleiop/dmaengine-multiple-clients-and-multiple-operations.patch?download [PATCH 09/19] http://prdownloads.sourceforge.net/xscaleiop/dmaengine-unite-backend-address-types.patch?download [PATCH 10/19] http://prdownloads.sourceforge.net/xscaleiop/dmaengine-dma-async-map-page.patch?download [PATCH 11/19] http://prdownloads.sourceforge.net/xscaleiop/dmaengine-dma-async-memset.patch?download [PATCH 12/19] http://prdownloads.sourceforge.net/xscaleiop/dmaengine-dma-async-memcpy-err.patch?download [PATCH 13/19] http://prdownloads.sourceforge.net/xscale
[PATCH 01/19] raid5: raid5_do_soft_block_ops
From: Dan Williams <[EMAIL PROTECTED]> raid5_do_soft_block_ops consolidates all the stripe cache maintenance operations into a single routine. The stripe operations are: * copying data between the stripe cache and user application buffers * computing blocks to save a disk access, or to recover a missing block * updating the parity on a write operation (reconstruct write and read-modify-write) * checking parity correctness Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 289 include/linux/raid/raid5.h | 129 +++- 2 files changed, 415 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4500660..8fde62b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1362,6 +1362,295 @@ static int stripe_to_pdidx(sector_t stri return pd_idx; } +/* + * raid5_do_soft_block_ops - perform block memory operations on stripe data + * outside the spin lock. + */ +static void raid5_do_soft_block_ops(void *stripe_head_ref) +{ + struct stripe_head *sh = stripe_head_ref; + int i, pd_idx = sh->pd_idx, disks = sh->disks; + void *ptr[MAX_XOR_BLOCKS]; + int overlap=0, work=0, written=0, compute=0, dd_idx=0; + int pd_uptodate=0; + unsigned long state, ops_state, ops_state_orig; + raid5_conf_t *conf = sh->raid_conf; + + /* take a snapshot of what needs to be done at this point in time */ + spin_lock(&sh->lock); + state = sh->state; + ops_state_orig = ops_state = sh->ops.state; + spin_unlock(&sh->lock); + + if (test_bit(STRIPE_OP_BIOFILL, &state)) { + struct bio *return_bi=NULL; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_ReadReq, &dev->flags)) { + struct bio *rbi, *rbi2; + PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL op_state: %lx disk: %d\n", + __FUNCTION__, (unsigned long long)sh->sector, + ops_state, i); + spin_lock_irq(&conf->device_lock); + rbi = dev->toread; + dev->toread = NULL; + spin_unlock_irq(&conf->device_lock); + overlap++; + while (rbi && rbi->bi_sector < dev->sector + 
STRIPE_SECTORS) { + copy_data(0, rbi, dev->page, dev->sector); + rbi2 = r5_next_bio(rbi, dev->sector); + spin_lock_irq(&conf->device_lock); + if (--rbi->bi_phys_segments == 0) { + rbi->bi_next = return_bi; + return_bi = rbi; + } + spin_unlock_irq(&conf->device_lock); + rbi = rbi2; + } + dev->read = return_bi; + } + } + if (overlap) { + set_bit(STRIPE_OP_BIOFILL_Done, &ops_state); + work++; + } + } + + if (test_bit(STRIPE_OP_COMPUTE, &state)) { + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (test_bit(R5_ComputeReq, &dev->flags)) { + dd_idx = i; + i = -1; + break; + } + } + BUG_ON(i >= 0); + PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: %d\n", + __FUNCTION__, (unsigned long long)sh->sector, + ops_state, dd_idx); + ptr[0] = page_address(sh->dev[dd_idx].page); + + if (test_and_clear_bit(STRIPE_OP_COMPUTE_Prep, &ops_state)) { + memset(ptr[0], 0, STRIPE_SIZE); + set_bit(STRIPE_OP_COMPUTE_Parity, &ops_state); + } + + if (test_and_clear_bit(STRIPE_OP_COMPUTE_Parity, &ops_state)) { + int count = 1; + for (i = disks ; i--; ) { + struct r5dev *dev = &sh->dev[i]; + void *p; + if (i == dd_idx) + continue; +
[PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs
From: Dan Williams <[EMAIL PROTECTED]> Also brings the iop3xx registers in line with the format of the iop13xx register definitions. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- include/asm-arm/arch-iop32x/entry-macro.S |2 include/asm-arm/arch-iop32x/iop32x.h | 14 + include/asm-arm/arch-iop33x/entry-macro.S |2 include/asm-arm/arch-iop33x/iop33x.h | 38 ++- include/asm-arm/hardware/iop3xx.h | 347 + 5 files changed, 188 insertions(+), 215 deletions(-) diff --git a/include/asm-arm/arch-iop32x/entry-macro.S b/include/asm-arm/arch-iop32x/entry-macro.S index 1500cbb..f357be4 100644 --- a/include/asm-arm/arch-iop32x/entry-macro.S +++ b/include/asm-arm/arch-iop32x/entry-macro.S @@ -13,7 +13,7 @@ #include .endm .macro get_irqnr_and_base, irqnr, irqstat, base, tmp - ldr \base, =IOP3XX_REG_ADDR(0x07D8) + ldr \base, =0xfeffe7d8 ldr \irqstat, [\base] @ Read IINTSRC cmp \irqstat, #0 clzne \irqnr, \irqstat diff --git a/include/asm-arm/arch-iop32x/iop32x.h b/include/asm-arm/arch-iop32x/iop32x.h index 15b4d6a..904a14d 100644 --- a/include/asm-arm/arch-iop32x/iop32x.h +++ b/include/asm-arm/arch-iop32x/iop32x.h @@ -19,16 +19,18 @@ #define __IOP32X_H * Peripherals that are shared between the iop32x and iop33x but * located at different addresses. 
*/ -#define IOP3XX_GPIO_REG(reg) (IOP3XX_PERIPHERAL_VIRT_BASE + 0x07c0 + (reg)) -#define IOP3XX_TIMER_REG(reg) (IOP3XX_PERIPHERAL_VIRT_BASE + 0x07e0 + (reg)) +#define IOP3XX_GPIO_REG32(reg) (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\ + 0x07c0 + (reg)) +#define IOP3XX_TIMER_REG32(reg) (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\ + 0x07e0 + (reg)) #include /* Interrupt Controller */ -#define IOP32X_INTCTL (volatile u32 *)IOP3XX_REG_ADDR(0x07d0) -#define IOP32X_INTSTR (volatile u32 *)IOP3XX_REG_ADDR(0x07d4) -#define IOP32X_IINTSRC (volatile u32 *)IOP3XX_REG_ADDR(0x07d8) -#define IOP32X_FINTSRC (volatile u32 *)IOP3XX_REG_ADDR(0x07dc) +#define IOP32X_INTCTL IOP3XX_REG_ADDR32(0x07d0) +#define IOP32X_INTSTR IOP3XX_REG_ADDR32(0x07d4) +#define IOP32X_IINTSRC IOP3XX_REG_ADDR32(0x07d8) +#define IOP32X_FINTSRC IOP3XX_REG_ADDR32(0x07dc) #endif diff --git a/include/asm-arm/arch-iop33x/entry-macro.S b/include/asm-arm/arch-iop33x/entry-macro.S index 92b7917..eb207d2 100644 --- a/include/asm-arm/arch-iop33x/entry-macro.S +++ b/include/asm-arm/arch-iop33x/entry-macro.S @@ -13,7 +13,7 @@ #include .endm .macro get_irqnr_and_base, irqnr, irqstat, base, tmp - ldr \base, =IOP3XX_REG_ADDR(0x07C8) + ldr \base, =0xfeffe7c8 ldr \irqstat, [\base] @ Read IINTVEC cmp \irqstat, #0 ldreq \irqstat, [\base] @ erratum 63 workaround diff --git a/include/asm-arm/arch-iop33x/iop33x.h b/include/asm-arm/arch-iop33x/iop33x.h index 9b38fde..c171383 100644 --- a/include/asm-arm/arch-iop33x/iop33x.h +++ b/include/asm-arm/arch-iop33x/iop33x.h @@ -18,28 +18,30 @@ #define __IOP33X_H * Peripherals that are shared between the iop32x and iop33x but * located at different addresses. 
*/ -#define IOP3XX_GPIO_REG(reg) (IOP3XX_PERIPHERAL_VIRT_BASE + 0x1780 + (reg)) -#define IOP3XX_TIMER_REG(reg) (IOP3XX_PERIPHERAL_VIRT_BASE + 0x07d0 + (reg)) +#define IOP3XX_GPIO_REG32(reg) (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\ + 0x1780 + (reg)) +#define IOP3XX_TIMER_REG32(reg) (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\ + 0x07d0 + (reg)) #include /* Interrupt Controller */ -#define IOP33X_INTCTL0 (volatile u32 *)IOP3XX_REG_ADDR(0x0790) -#define IOP33X_INTCTL1 (volatile u32 *)IOP3XX_REG_ADDR(0x0794) -#define IOP33X_INTSTR0 (volatile u32 *)IOP3XX_REG_ADDR(0x0798) -#define IOP33X_INTSTR1 (volatile u32 *)IOP3XX_REG_ADDR(0x079c) -#define IOP33X_IINTSRC0(volatile u32 *)IOP3XX_REG_ADDR(0x07a0) -#define IOP33X_IINTSRC1(volatile u32 *)IOP3XX_REG_ADDR(0x07a4) -#define IOP33X_FINTSRC0(volatile u32 *)IOP3XX_REG_ADDR(0x07a8) -#define IOP33X_FINTSRC1(volatile u32 *)IOP3XX_REG_ADDR(0x07ac) -#define IOP33X_IPR0(volatile u32 *)IOP3XX_REG_ADDR(0x07b0) -#define IOP33X_IPR1(volatile u32 *)IOP3XX_REG_ADDR(0x07b4) -#define IOP33X_IPR2(volatile u32 *)IOP3XX_REG_ADDR(0x07b8) -#define IOP33X_IPR3(volatile u32 *)IOP3XX_REG_ADDR(0x07bc) -#define IOP33X_INTBASE (volatile u32 *)IOP3XX_REG_ADDR(0x07c0) -#define IO
[PATCH 13/19] dmaengine: add support for dma xor zero sum operations
From: Dan Williams <[EMAIL PROTECTED]> Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 15 drivers/dma/ioatdma.c |6 + include/linux/dmaengine.h | 56 + 3 files changed, 77 insertions(+), 0 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 33ad690..190c612 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -617,6 +617,18 @@ dma_cookie_t dma_async_do_xor_err(struct } /** + * dma_async_do_zero_sum_err - default function for dma devices that + * do not support xor zero sum + */ +dma_cookie_t dma_async_do_zero_sum_err(struct dma_chan *chan, + union dmaengine_addr src, unsigned int src_cnt, + unsigned int src_off, size_t len, u32 *result, + unsigned long flags) +{ + return -ENXIO; +} + +/** * dma_async_do_memset_err - default function for dma devices that * do not support memset */ @@ -649,6 +661,8 @@ EXPORT_SYMBOL_GPL(dma_async_memset_page) EXPORT_SYMBOL_GPL(dma_async_memset_dma); EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg); EXPORT_SYMBOL_GPL(dma_async_xor_dma_list_to_dma); +EXPORT_SYMBOL_GPL(dma_async_zero_sum_pgs); +EXPORT_SYMBOL_GPL(dma_async_zero_sum_dma_list); EXPORT_SYMBOL_GPL(dma_async_operation_complete); EXPORT_SYMBOL_GPL(dma_async_issue_pending); EXPORT_SYMBOL_GPL(dma_async_device_register); @@ -656,6 +670,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_unreg EXPORT_SYMBOL_GPL(dma_chan_cleanup); EXPORT_SYMBOL_GPL(dma_async_do_memcpy_err); EXPORT_SYMBOL_GPL(dma_async_do_xor_err); +EXPORT_SYMBOL_GPL(dma_async_do_zero_sum_err); EXPORT_SYMBOL_GPL(dma_async_do_memset_err); EXPORT_SYMBOL_GPL(dma_async_chan_init); EXPORT_SYMBOL_GPL(dma_async_map_page); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 231247c..4e90b02 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -637,6 +637,11 @@ extern dma_cookie_t dma_async_do_xor_err union dmaengine_addr src, unsigned int src_cnt, unsigned int src_off, size_t len, unsigned long flags); +extern dma_cookie_t 
dma_async_do_zero_sum_err(struct dma_chan *chan, +union dmaengine_addr src, unsigned int src_cnt, +unsigned int src_off, size_t len, u32 *result, + unsigned long flags); + extern dma_cookie_t dma_async_do_memset_err(struct dma_chan *chan, union dmaengine_addr dest, unsigned int dest_off, int val, size_t size, unsigned long flags); @@ -752,6 +757,7 @@ #endif device->common.capabilities = DMA_MEMCPY; device->common.device_do_dma_memcpy = do_ioat_dma_memcpy; device->common.device_do_dma_xor = dma_async_do_xor_err; + device->common.device_do_dma_zero_sum = dma_async_do_zero_sum_err; device->common.device_do_dma_memset = dma_async_do_memset_err; device->common.map_page = ioat_map_page; device->common.map_single = ioat_map_single; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 8d53b08..9fd6cbd 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -260,6 +260,7 @@ struct dma_chan_client_ref { * @device_issue_pending: push appended descriptors to hardware * @device_do_dma_memcpy: perform memcpy with a dma engine * @device_do_dma_xor: perform block xor with a dma engine + * @device_do_dma_zero_sum: perform block xor zero sum with a dma engine * @device_do_dma_memset: perform block fill with a dma engine */ struct dma_device { @@ -285,6 +286,10 @@ struct dma_device { union dmaengine_addr src, unsigned int src_cnt, unsigned int src_off, size_t len, unsigned long flags); + dma_cookie_t (*device_do_dma_zero_sum)(struct dma_chan *chan, + union dmaengine_addr src, unsigned int src_cnt, + unsigned int src_off, size_t len, u32 *result, + unsigned long flags); dma_cookie_t (*device_do_dma_memset)(struct dma_chan *chan, union dmaengine_addr dest, unsigned int dest_off, int value, size_t len, unsigned long flags); @@ -601,6 +606,57 @@ static inline dma_cookie_t dma_async_xor } /** + * dma_async_zero_sum_pgs - offloaded xor zero sum from a list of pages + * @chan: DMA channel to offload zero sum to + * @src_pgs: array of source pages + * 
@src_cnt: number of source pages + * @src_off: offset in pages to xor from + * @len: length + * @result: set to 1 if sum is zero else 0 + * + * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus + * address according to the DMA mapping API rules for streaming mappings. + * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident + * (kernel memory or locked user space pages) + */ +static inline dma_cookie_t dma_async_zero_sum_pgs(struct dma_chan *ch
[PATCH 15/19] dmaengine: raid5 dma client
From: Dan Williams <[EMAIL PROTECTED]> Adds a dmaengine client that is the hardware accelerated version of raid5_do_soft_block_ops. It utilizes the raid5 workqueue implementation to operate on multiple stripes simultaneously. See the iop-adma.c driver for an example of a driver that enables hardware accelerated raid5. Changelog: * mark operations as _Dma rather than _Done until all outstanding operations have completed. Once all operations have completed update the state and return it to the handle list * add a helper routine to retrieve the last used cookie * use dma_async_zero_sum_dma_list for checking parity which optionally allows parity check operations to not dirty the parity block in the cache (if 'disks' is less than 'MAX_ADMA_XOR_SOURCES') * remove dependencies on iop13xx * take into account the fact that dma engines have a staging buffer so we can perform 1 less block operation compared to software xor * added __arch_raid5_dma_chan_request __arch_raid5_dma_next_channel and __arch_raid5_dma_check_channel to make the driver architecture independent * added channel switching capability for architectures that implement different operations (i.e. copy & xor) on individual channels * added initial support for "non-blocking" channel switching Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/Kconfig|9 + drivers/dma/Makefile |1 drivers/dma/raid5-dma.c| 730 drivers/md/Kconfig | 11 + drivers/md/raid5.c | 66 include/linux/dmaengine.h |5 include/linux/raid/raid5.h | 24 + 7 files changed, 839 insertions(+), 7 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 30d021d..fced8c3 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -22,6 +22,15 @@ config NET_DMA Since this is the main user of the DMA engine, it should be enabled; say Y here. 
+config RAID5_DMA +tristate "MD raid5: block operations offload" + depends on INTEL_IOP_ADMA && MD_RAID456 + default y + ---help--- + This enables the use of DMA engines in the MD-RAID5 driver to + offload stripe cache operations, freeing CPU cycles. + say Y here + comment "DMA Devices" config INTEL_IOATDMA diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index bdcfdbd..4e36d6e 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o +obj-$(CONFIG_RAID5_DMA) += raid5-dma.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o diff --git a/drivers/dma/raid5-dma.c b/drivers/dma/raid5-dma.c new file mode 100644 index 000..04a1790 --- /dev/null +++ b/drivers/dma/raid5-dma.c @@ -0,0 +1,730 @@ +/* + * Offload raid5 operations to hardware RAID engines + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ + +#include +#include + +static struct dma_client *raid5_dma_client; +static atomic_t raid5_count; +extern void release_stripe(struct stripe_head *sh); +extern void __arch_raid5_dma_chan_request(struct dma_client *client); +extern struct dma_chan *__arch_raid5_dma_next_channel(struct dma_client *client); + +#define MAX_HW_XOR_SRCS 16 + +#ifndef STRIPE_SIZE +#define STRIPE_SIZE PAGE_SIZE +#endif + +#ifndef STRIPE_SECTORS +#define STRIPE_SECTORS (STRIPE_SIZE>>9) +#endif + +#ifndef r5_next_bio +#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) +#endif + +#define DMA_RAID5_DEBUG 0 +#define PRINTK(x...) ((void)(DMA_RAID5_DEBUG && printk(x))) + +/* + * Copy data between a page in the stripe cache, and one or more bion + * The page could align with the middle of the bio, or there could be + * several bion, each with several bio_vecs, which cover part of the page + * Multiple bion are linked together on bi_next. There may be extras + * at the end of this list. We ignore them. + */ +static dma_cookie_t dma_raid_copy_data
[PATCH 04/19] raid5: move compute block operations to a workqueue
From: Dan Williams <[EMAIL PROTECTED]> Enable handle_stripe5 to pass off compute block operations to raid5_do_soft_block_ops, formerly handled by compute_block. Here are a few notes about the new flags R5_ComputeReq and STRIPE_OP_COMPUTE_Recover: Previously, when handle_stripe5 found a block that needed to be computed it updated it in the same step. Now that these operations are separated (across multiple calls to handle_stripe5), a R5_ComputeReq flag is needed to tell other parts of handle_stripe5 to treat the block under computation as if it were up to date. The order of events in the work queue ensures that the block is indeed up to date before performing further operations. STRIPE_OP_COMPUTE_Recover_pd was added to track when the parity block is being computed due to a failed parity check. This allows the code in handle_stripe5 that produces requests for check_parity and compute_block operations to be separate from the code that consumes the result. Changelog: * count blocks under computation as uptodate * removed handle_compute_operations5. All logic moved into handle_stripe5 so that we do not need to go through the initiation logic to end the operation. * since the write operations mark blocks !uptodate we hold off the code to compute/read blocks until it completes. * new compute block operations and reads are held off while a compute is in flight * do not compute a block while a check parity operation is pending, and do not start a new check parity operation while a compute operation is pending * STRIPE_OP_Recover_pd holds off the clearing of the STRIPE_OP_COMPUTE state. This allows the transition to be handled by the check parity logic that writes recomputed parity to disk. 
Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 153 1 files changed, 107 insertions(+), 46 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 24ed4d8..0c39203 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1300,7 +1300,8 @@ static int handle_write_operations5(stru } } else { /* enter stage 1 of read modify write operation */ - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags))); set_bit(STRIPE_OP_RMW, &sh->state); set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state); @@ -1314,7 +1315,8 @@ static int handle_write_operations5(stru * so we distinguish these blocks by the RMWReq bit */ if (dev->towrite && - test_bit(R5_UPTODATE, &dev->flags)) { + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_ComputeReq, &dev->flags))) { set_bit(R5_RMWReq, &dev->flags); set_bit(R5_LOCKED, &dev->flags); clear_bit(R5_UPTODATE, &dev->flags); @@ -1748,7 +1750,7 @@ static void handle_stripe5(struct stripe int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite=0, write_complete=0; + int compute=0, non_overwrite=0, write_complete=0; int failed_num=0; struct r5dev *dev; @@ -1799,7 +1801,7 @@ static void handle_stripe5(struct stripe /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; - + if (test_bit(R5_ComputeReq, &dev->flags)) BUG_ON(++compute > 1); if (dev->toread) to_read++; if (dev->towrite) { @@ -1955,40 +1957,83 @@ static void handle_stripe5(struct stripe * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || -(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || -syncing || -expanding || -(failed && (sh->dev[failed_num].toread || -(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE
[PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver
From: Dan Williams <[EMAIL PROTECTED]> Adds the platform device definitions and the architecture specific support routines (i.e. register initialization and descriptor formats) for the iop-adma driver. Changelog: * add support for > 1k zero sum buffer sizes * added dma/aau platform devices to iq80321 and iq80332 setup * fixed the calculation in iop_desc_is_aligned * support xor buffer sizes larger than 16MB * fix places where software descriptors are assumed to be contiguous, only hardware descriptors are contiguous * iop32x does not support hardware zero sum, add software emulation support for up to a PAGE_SIZE buffer size * added raid5 dma driver support functions Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- arch/arm/mach-iop32x/iq80321.c | 141 + arch/arm/mach-iop33x/iq80331.c |9 arch/arm/mach-iop33x/iq80332.c |8 arch/arm/mach-iop33x/setup.c | 132 + include/asm-arm/arch-iop32x/adma.h |5 include/asm-arm/arch-iop33x/adma.h |5 include/asm-arm/hardware/iop3xx-adma.h | 901 7 files changed, 1201 insertions(+), 0 deletions(-) diff --git a/arch/arm/mach-iop32x/iq80321.c b/arch/arm/mach-iop32x/iq80321.c index cdd2265..79d6514 100644 --- a/arch/arm/mach-iop32x/iq80321.c +++ b/arch/arm/mach-iop32x/iq80321.c @@ -33,6 +33,9 @@ #include #include #include #include +#ifdef CONFIG_DMA_ENGINE +#include +#endif /* * IQ80321 timer tick configuration. 
@@ -170,12 +173,150 @@ static struct platform_device iq80321_se .resource = &iq80321_uart_resource, }; +#ifdef CONFIG_DMA_ENGINE +/* AAU and DMA Channels */ +static struct resource iop3xx_dma_0_resources[] = { + [0] = { + .start = (unsigned long) IOP3XX_DMA_CCR(0), + .end = ((unsigned long) IOP3XX_DMA_DCR(0)) + 4, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_IOP32X_DMA0_EOT, + .end = IRQ_IOP32X_DMA0_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_IOP32X_DMA0_EOC, + .end = IRQ_IOP32X_DMA0_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_IOP32X_DMA0_ERR, + .end = IRQ_IOP32X_DMA0_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static struct resource iop3xx_dma_1_resources[] = { + [0] = { + .start = (unsigned long) IOP3XX_DMA_CCR(1), + .end = ((unsigned long) IOP3XX_DMA_DCR(1)) + 4, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_IOP32X_DMA1_EOT, + .end = IRQ_IOP32X_DMA1_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_IOP32X_DMA1_EOC, + .end = IRQ_IOP32X_DMA1_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_IOP32X_DMA1_ERR, + .end = IRQ_IOP32X_DMA1_ERR, + .flags = IORESOURCE_IRQ + } +}; + + +static struct resource iop3xx_aau_resources[] = { + [0] = { + .start = (unsigned long) IOP3XX_AAU_ACR, + .end = (unsigned long) IOP3XX_AAU_SAR_EDCR(32), + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_IOP32X_AA_EOT, + .end = IRQ_IOP32X_AA_EOT, + .flags = IORESOURCE_IRQ + }, + [2] = { + .start = IRQ_IOP32X_AA_EOC, + .end = IRQ_IOP32X_AA_EOC, + .flags = IORESOURCE_IRQ + }, + [3] = { + .start = IRQ_IOP32X_AA_ERR, + .end = IRQ_IOP32X_AA_ERR, + .flags = IORESOURCE_IRQ + } +}; + +static u64 iop3xx_adma_dmamask = DMA_32BIT_MASK; + +static struct iop_adma_platform_data iop3xx_dma_0_data = { + .hw_id = IOP3XX_DMA0_ID, + .capabilities = DMA_MEMCPY | DMA_MEMCPY_CRC32C, + .pool_size = PAGE_SIZE, +}; + +static struct iop_adma_platform_data iop3xx_dma_1_data = { + .hw_id = IOP3XX_DMA1_ID, + .capabilities = DMA_MEMCPY | 
DMA_MEMCPY_CRC32C, + .pool_size = PAGE_SIZE, +}; + +static struct iop_adma_platform_data iop3xx_aau_data = { + .hw_id = IOP3XX_AAU_ID, + .capabilities = DMA_XOR | DMA_ZERO_SUM | DMA_MEMSET, + .pool_size = 3 * PAGE_SIZE, +}; + +struct platform_device iop3xx_dma_0_channel = { + .name = "IOP-ADMA", + .id = 0, + .num_resources = 4, + .resource = iop3xx_dma_0_resources, + .dev = { + .dma_mask = &iop3xx_adma_dmamask, + .coherent_dma_mask = DMA_64BIT_MASK, + .platform_data = (void *) &iop3xx_dma_0_data, + }, +}; + +struct platform_device iop3xx_dma_1_channel = { + .name = "IOP-ADMA", + .id = 1, + .num_resources = 4, + .
[PATCH 06/19] raid5: move the reconstruct write expansion operation to a workqueue
From: Dan Williams <[EMAIL PROTECTED]> Enable handle_stripe5 to use the reconstruct write operations capability for expansion operations. However this does not move the copy operation associated with an expand to the workqueue. First, it was difficult to find a clean way to pass the parameters of this operation to the queue. Second, this section of code is a good candidate for performing the copies with inline calls to the dma routines. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 36 +++- 1 files changed, 27 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1a8dfd2..a07b52b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2053,6 +2053,7 @@ #endif * completed */ if (test_bit(STRIPE_OP_RCW, &sh->state) && + !test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state) && test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) { clear_bit(STRIPE_OP_RCW, &sh->state); clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state); @@ -2226,6 +2227,7 @@ #endif } } } + if (test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state) && test_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state)) { clear_bit(STRIPE_OP_COMPUTE, &sh->state); @@ -2282,18 +2284,28 @@ #endif } } - if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { + /* Finish 'rcw' operations initiated by the expansion +* process +*/ + if (test_bit(STRIPE_OP_RCW, &sh->state) && + test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state) && + test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_RCW, &sh->state); + clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state); + clear_bit(STRIPE_OP_RCW_Expand, &sh->ops.state); + clear_bit(STRIPE_EXPANDING, &sh->state); + for (i= conf->raid_disks; i--;) + set_bit(R5_Wantwrite, &sh->dev[i].flags); + } + + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !test_bit(STRIPE_OP_RCW, &sh->state)) { /* Need to write out all blocks after computing parity */ sh->disks = conf->raid_disks; sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); - for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; - set_bit(R5_Wantwrite, &sh->dev[i].flags); - } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (expanded) { + set_bit(STRIPE_OP_RCW_Expand, &sh->ops.state); + locked += handle_write_operations5(sh, 0); + } else if (expanded && !test_bit(STRIPE_OP_RCW, &sh->state)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2327,9 +2339,15 @@ #endif release_stripe(sh2); continue; } + /* to do: perform these operations with a dma engine +* inline (rather than pushing to the workqueue) +*/ + /*#ifdef CONFIG_RAID5_DMA*/ + /*#else*/ memcpy(page_address(sh2->dev[dd_idx].page), page_address(sh->dev[i].page), STRIPE_SIZE); + /*#endif*/ set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j=0; jraid_disks; j++) - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not support memcpy
From: Dan Williams <[EMAIL PROTECTED]> Default virtual function that returns an error if the user attempts a memcpy operation. An XOR engine is an example of a DMA engine that does not support memcpy. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 13 + 1 files changed, 13 insertions(+), 0 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index fe62237..33ad690 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -593,6 +593,18 @@ void dma_async_device_unregister(struct } /** + * dma_async_do_memcpy_err - default function for dma devices that + * do not support memcpy + */ +dma_cookie_t dma_async_do_memcpy_err(struct dma_chan *chan, + union dmaengine_addr dest, unsigned int dest_off, + union dmaengine_addr src, unsigned int src_off, +size_t len, unsigned long flags) +{ + return -ENXIO; +} + +/** * dma_async_do_xor_err - default function for dma devices that * do not support xor */ @@ -642,6 +654,7 @@ EXPORT_SYMBOL_GPL(dma_async_issue_pendin EXPORT_SYMBOL_GPL(dma_async_device_register); EXPORT_SYMBOL_GPL(dma_async_device_unregister); EXPORT_SYMBOL_GPL(dma_chan_cleanup); +EXPORT_SYMBOL_GPL(dma_async_do_memcpy_err); EXPORT_SYMBOL_GPL(dma_async_do_xor_err); EXPORT_SYMBOL_GPL(dma_async_do_memset_err); EXPORT_SYMBOL_GPL(dma_async_chan_init); - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/19] dmaengine: reduce backend address permutations
From: Dan Williams <[EMAIL PROTECTED]> Change the backend dma driver API to accept a 'union dmaengine_addr'. The intent is to be able to support a wide range of frontend address type permutations without needing an equal number of function type permutations on the backend. Changelog: * make the dmaengine api EXPORT_SYMBOL_GPL * zero sum support should be standalone, not integrated into xor Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 15 ++- drivers/dma/ioatdma.c | 186 +-- include/linux/dmaengine.h | 193 +++-- 3 files changed, 249 insertions(+), 145 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index e10f19d..9b02afa 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -593,12 +593,13 @@ void dma_async_device_unregister(struct } /** - * dma_async_xor_pgs_to_pg_err - default function for dma devices that + * dma_async_do_xor_err - default function for dma devices that * do not support xor */ -dma_cookie_t dma_async_xor_pgs_to_pg_err(struct dma_chan *chan, - struct page *dest_pg, unsigned int dest_off, struct page *src_pgs, - unsigned int src_cnt, unsigned int src_off, size_t len) +dma_cookie_t dma_async_do_xor_err(struct dma_chan *chan, + union dmaengine_addr dest, unsigned int dest_off, + union dmaengine_addr src, unsigned int src_cnt, + unsigned int src_off, size_t len, unsigned long flags) { return -ENXIO; } @@ -617,11 +618,15 @@ EXPORT_SYMBOL_GPL(dma_async_client_chan_ EXPORT_SYMBOL_GPL(dma_async_memcpy_buf_to_buf); EXPORT_SYMBOL_GPL(dma_async_memcpy_buf_to_pg); EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to_pg); +EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_dma); +EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to_dma); +EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_pg); EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg); +EXPORT_SYMBOL_GPL(dma_async_xor_dma_list_to_dma); EXPORT_SYMBOL_GPL(dma_async_operation_complete); EXPORT_SYMBOL_GPL(dma_async_issue_pending); EXPORT_SYMBOL_GPL(dma_async_device_register); 
EXPORT_SYMBOL_GPL(dma_async_device_unregister); EXPORT_SYMBOL_GPL(dma_chan_cleanup); -EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg_err); +EXPORT_SYMBOL_GPL(dma_async_do_xor_err); EXPORT_SYMBOL_GPL(dma_async_chan_init); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 415de03..dd5b9f0 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -213,20 +213,25 @@ static void ioat_dma_free_chan_resources /** * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction - * @ioat_chan: IOAT DMA channel handle - * @dest: DMA destination address - * @src: DMA source address + * @chan: IOAT DMA channel handle + * @dest: DMAENGINE destination address + * @dest_off: Page offset + * @src: DMAENGINE source address + * @src_off: Page offset * @len: transaction length in bytes */ -static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan, - dma_addr_t dest, - dma_addr_t src, - size_t len) +static dma_cookie_t do_ioat_dma_memcpy(struct dma_chan *dma_chan, + union dmaengine_addr dest, + unsigned int dest_off, + union dmaengine_addr src, + unsigned int src_off, + size_t len, + unsigned long flags) { struct ioat_desc_sw *first; struct ioat_desc_sw *prev; - struct ioat_desc_sw *new; + struct ioat_desc_sw *new = 0; dma_cookie_t cookie; LIST_HEAD(new_chain); u32 copy; @@ -234,16 +239,47 @@ static dma_cookie_t do_ioat_dma_memcpy(s dma_addr_t orig_src, orig_dst; unsigned int desc_count = 0; unsigned int append = 0; + struct ioat_dma_chan *ioat_chan = to_ioat_chan(dma_chan); - if (!ioat_chan || !dest || !src) + if (!dma_chan || !dest.dma || !src.dma) return -EFAULT; if (!len) return ioat_chan->common.cookie; + switch (flags & (DMA_SRC_BUF | DMA_SRC_PAGE | DMA_SRC_DMA)) { + case DMA_SRC_BUF: + src.dma = pci_map_single(ioat_chan->device->pdev, + src.buf, len, PCI_DMA_TODEVICE); + break; + case DMA_SRC_PAGE: + src.dma = pci_map_page(ioat_chan->device->pdev, + src.pg, src_off, len, PCI_DMA_TODEVICE); + break; + case DMA_SRC_DMA: + break; + 
default: + return -EFAULT; + } + + switch (flags & (DMA_DEST_BUF | DMA_DEST_PAGE | DMA_DEST_DMA)) { +
[PATCH 14/19] dmaengine: add dma_sync_wait
From: Dan Williams <[EMAIL PROTECTED]> dma_sync_wait is a common routine to live wait for a dma operation to complete. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- include/linux/dmaengine.h | 12 1 files changed, 12 insertions(+), 0 deletions(-) diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 9fd6cbd..0a70c9e 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -750,6 +750,18 @@ static inline void dma_async_unmap_singl chan->device->unmap_single(chan, handle, size, direction); } +static inline enum dma_status dma_sync_wait(struct dma_chan *chan, + dma_cookie_t cookie) +{ + enum dma_status status; + dma_async_issue_pending(chan); + do { + status = dma_async_operation_complete(chan, cookie, NULL, NULL); + } while (status == DMA_IN_PROGRESS); + + return status; +} + /* --- DMA device --- */ int dma_async_device_register(struct dma_device *device); - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to clients
From: Dan Williams <[EMAIL PROTECTED]> Allow a client to ensure that the dma channel it has selected can dma to the specified buffer or page address. Also allow the client to pre-map address ranges to be passed to the operations API. Changelog: * make the dmaengine api EXPORT_SYMBOL_GPL * zero sum support should be standalone, not integrated into xor Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c |4 drivers/dma/ioatdma.c | 35 +++ include/linux/dmaengine.h | 34 ++ 3 files changed, 73 insertions(+), 0 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 9b02afa..e78ce89 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -630,3 +630,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_unreg EXPORT_SYMBOL_GPL(dma_chan_cleanup); EXPORT_SYMBOL_GPL(dma_async_do_xor_err); EXPORT_SYMBOL_GPL(dma_async_chan_init); +EXPORT_SYMBOL_GPL(dma_async_map_page); +EXPORT_SYMBOL_GPL(dma_async_map_single); +EXPORT_SYMBOL_GPL(dma_async_unmap_page); +EXPORT_SYMBOL_GPL(dma_async_unmap_single); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index dd5b9f0..0159d14 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -637,6 +637,37 @@ extern dma_cookie_t dma_async_do_xor_err union dmaengine_addr src, unsigned int src_cnt, unsigned int src_off, size_t len, unsigned long flags); +static dma_addr_t ioat_map_page(struct dma_chan *chan, struct page *page, + unsigned long offset, size_t size, + int direction) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + return pci_map_page(ioat_chan->device->pdev, page, offset, size, + direction); +} + +static dma_addr_t ioat_map_single(struct dma_chan *chan, void *cpu_addr, + size_t size, int direction) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + return pci_map_single(ioat_chan->device->pdev, cpu_addr, size, + direction); +} + +static void ioat_unmap_page(struct dma_chan *chan, dma_addr_t handle, + size_t size, int direction) +{ + struct 
ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + pci_unmap_page(ioat_chan->device->pdev, handle, size, direction); +} + +static void ioat_unmap_single(struct dma_chan *chan, dma_addr_t handle, + size_t size, int direction) +{ + struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); + pci_unmap_single(ioat_chan->device->pdev, handle, size, direction); +} + static int __devinit ioat_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -717,6 +748,10 @@ #endif device->common.capabilities = DMA_MEMCPY; device->common.device_do_dma_memcpy = do_ioat_dma_memcpy; device->common.device_do_dma_xor = dma_async_do_xor_err; + device->common.map_page = ioat_map_page; + device->common.map_single = ioat_map_single; + device->common.unmap_page = ioat_unmap_page; + device->common.unmap_single = ioat_unmap_single; printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n", device->common.chancnt); diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index df055cc..cb4cfcf 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -287,6 +287,15 @@ struct dma_device { enum dma_status (*device_operation_complete)(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used); + dma_addr_t (*map_page)(struct dma_chan *chan, struct page *page, + unsigned long offset, size_t size, + int direction); + dma_addr_t (*map_single)(struct dma_chan *chan, void *cpu_addr, + size_t size, int direction); + void (*unmap_page)(struct dma_chan *chan, dma_addr_t handle, + size_t size, int direction); + void (*unmap_single)(struct dma_chan *chan, dma_addr_t handle, + size_t size, int direction); void (*device_issue_pending)(struct dma_chan *chan); }; @@ -592,6 +601,31 @@ static inline enum dma_status dma_async_ return DMA_IN_PROGRESS; } +static inline dma_addr_t dma_async_map_page(struct dma_chan *chan, + struct page *page, unsigned long offset, size_t size, + int direction) +{ + return chan->device->map_page(chan, page, offset, size, 
direction); +} + +static inline dma_addr_t dma_async_map_single(struct dma_chan *chan, + void *cpu_addr, size_t si
[PATCH 11/19] dmaengine: add memset as an asynchronous dma operation
From: Dan Williams <[EMAIL PROTECTED]> Changelog: * make the dmaengine api EXPORT_SYMBOL_GPL * zero sum support should be standalone, not integrated into xor Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 15 ++ drivers/dma/ioatdma.c |5 +++ include/linux/dmaengine.h | 68 + 3 files changed, 88 insertions(+), 0 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index e78ce89..fe62237 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -604,6 +604,17 @@ dma_cookie_t dma_async_do_xor_err(struct return -ENXIO; } +/** + * dma_async_do_memset_err - default function for dma devices that + * do not support memset + */ +dma_cookie_t dma_async_do_memset_err(struct dma_chan *chan, +union dmaengine_addr dest, unsigned int dest_off, +int val, size_t len, unsigned long flags) +{ +return -ENXIO; +} + static int __init dma_bus_init(void) { mutex_init(&dma_list_mutex); @@ -621,6 +632,9 @@ EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_dma); EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to_dma); EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_pg); +EXPORT_SYMBOL_GPL(dma_async_memset_buf); +EXPORT_SYMBOL_GPL(dma_async_memset_page); +EXPORT_SYMBOL_GPL(dma_async_memset_dma); EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg); EXPORT_SYMBOL_GPL(dma_async_xor_dma_list_to_dma); EXPORT_SYMBOL_GPL(dma_async_operation_complete); @@ -629,6 +643,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_regis EXPORT_SYMBOL_GPL(dma_async_device_unregister); EXPORT_SYMBOL_GPL(dma_chan_cleanup); EXPORT_SYMBOL_GPL(dma_async_do_xor_err); +EXPORT_SYMBOL_GPL(dma_async_do_memset_err); EXPORT_SYMBOL_GPL(dma_async_chan_init); EXPORT_SYMBOL_GPL(dma_async_map_page); EXPORT_SYMBOL_GPL(dma_async_map_single); diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 0159d14..231247c 100644 --- a/drivers/dma/ioatdma.c +++ b/drivers/dma/ioatdma.c @@ -637,6 +637,10 @@ extern dma_cookie_t dma_async_do_xor_err union dmaengine_addr src, 
unsigned int src_cnt, unsigned int src_off, size_t len, unsigned long flags); +extern dma_cookie_t dma_async_do_memset_err(struct dma_chan *chan, + union dmaengine_addr dest, unsigned int dest_off, + int val, size_t size, unsigned long flags); + static dma_addr_t ioat_map_page(struct dma_chan *chan, struct page *page, unsigned long offset, size_t size, int direction) @@ -748,6 +752,7 @@ #endif device->common.capabilities = DMA_MEMCPY; device->common.device_do_dma_memcpy = do_ioat_dma_memcpy; device->common.device_do_dma_xor = dma_async_do_xor_err; + device->common.device_do_dma_memset = dma_async_do_memset_err; device->common.map_page = ioat_map_page; device->common.map_single = ioat_map_single; device->common.unmap_page = ioat_unmap_page; diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index cb4cfcf..8d53b08 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -260,6 +260,7 @@ struct dma_chan_client_ref { * @device_issue_pending: push appended descriptors to hardware * @device_do_dma_memcpy: perform memcpy with a dma engine * @device_do_dma_xor: perform block xor with a dma engine + * @device_do_dma_memset: perform block fill with a dma engine */ struct dma_device { @@ -284,6 +285,9 @@ struct dma_device { union dmaengine_addr src, unsigned int src_cnt, unsigned int src_off, size_t len, unsigned long flags); + dma_cookie_t (*device_do_dma_memset)(struct dma_chan *chan, + union dmaengine_addr dest, unsigned int dest_off, + int value, size_t len, unsigned long flags); enum dma_status (*device_operation_complete)(struct dma_chan *chan, dma_cookie_t cookie, dma_cookie_t *last, dma_cookie_t *used); @@ -478,6 +482,70 @@ static inline dma_cookie_t dma_async_mem } /** + * dma_async_memset_buf - offloaded memset + * @chan: DMA channel to offload memset to + * @buf: destination buffer + * @val: value to initialize the buffer + * @len: length + */ +static inline dma_cookie_t dma_async_memset_buf(struct dma_chan *chan, + void *buf, int 
val, size_t len) +{ + unsigned long flags = DMA_DEST_BUF; + union dmaengine_addr dest_addr = { .buf = buf }; + int cpu = get_cpu(); + per_cpu_ptr(chan->local, cpu)->bytes_transferred += len; + per_cpu_ptr(chan->local, cpu)->memcpy_count++; + put_cpu(); + + return chan->device->device_do_dma_memset(chan, dest_addr, 0, val, + len, flags);
[PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID engines
From: Dan Williams <[EMAIL PROTECTED]> This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor, pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy operations. Changelog: * fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few slots to be requested eventually leading to data corruption * enabled the slot allocation routine to attempt to free slots before returning -ENOMEM * switched the cleanup routine to solely use the software chain and the status register to determine if a descriptor is complete. This is necessary to support other IOP engines that do not have status writeback capability * make the driver iop generic * modified the allocation routines to understand allocating a group of slots for a single operation * added a null xor initialization operation for the xor only channel on iop3xx * add software emulation of zero sum on iop32x * support xor operations on buffers larger than the hardware maximum * add architecture specific raid5-dma support functions Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/Kconfig | 27 + drivers/dma/Makefile|1 drivers/dma/iop-adma.c | 1501 +++ include/asm-arm/hardware/iop_adma.h | 98 ++ 4 files changed, 1624 insertions(+), 3 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index fced8c3..3556143 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -7,8 +7,8 @@ menu "DMA Engine support" config DMA_ENGINE bool "Support for DMA engines" ---help--- - DMA engines offload copy operations from the CPU to dedicated - hardware, allowing the copies to happen asynchronously. + DMA engines offload block memory operations from the CPU to dedicated + hardware, allowing the operations to happen asynchronously. comment "DMA Clients" @@ -28,9 +28,19 @@ config RAID5_DMA default y ---help--- This enables the use of DMA engines in the MD-RAID5 driver to - offload stripe cache operations, freeing CPU cycles. 
+ offload stripe cache operations (i.e. xor, memcpy), freeing CPU cycles. say Y here +config RAID5_DMA_WAIT_VIA_REQUEUE + bool "raid5-dma: Non-blocking channel switching" + depends on RAID5_DMA_ARCH_NEEDS_CHAN_SWITCH && RAID5_DMA && BROKEN + default n + ---help--- + This enables the raid5-dma driver to continue to operate on incoming + stripes when it determines that the current stripe must wait for a + a hardware channel to finish operations. This code is a work in + progress, only say Y to debug the implementation, otherwise say N. + comment "DMA Devices" config INTEL_IOATDMA @@ -40,4 +50,15 @@ config INTEL_IOATDMA ---help--- Enable support for the Intel(R) I/OAT DMA engine. +config INTEL_IOP_ADMA +tristate "Intel IOP ADMA support" +depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX) + select RAID5_DMA_ARCH_NEEDS_CHAN_SWITCH if (ARCH_IOP32X || ARCH_IOP33X) +default m +---help--- + Enable support for the Intel(R) IOP Series RAID engines. + +config RAID5_DMA_ARCH_NEEDS_CHAN_SWITCH + bool + endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 4e36d6e..233eae7 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -2,3 +2,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_RAID5_DMA) += raid5-dma.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c new file mode 100644 index 000..51f1c54 --- /dev/null +++ b/drivers/dma/iop-adma.c @@ -0,0 +1,1501 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This driver supports the asynchronous DMA copy and RAID engines available + * on the Intel Xscale(R) family of
[PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization
From: Dan Williams <[EMAIL PROTECTED]> Currently the iop3xx platform support code assumes that RedBoot is the bootloader and has already initialized the ATU. Linux should handle this initialization for three reasons: 1/ The memory map that RedBoot sets up is not optimal (page_to_dma and virt_to_phys return different addresses). The effect of this is that using the dma mapping API for the internal bus dma units generates pci bus addresses that are incorrect for the internal bus. 2/ Not all iop platforms use RedBoot 3/ If the ATU is already initialized it indicates that the iop is an add-in card in another host, it does not own the PCI bus, and should not be re-initialized. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- arch/arm/mach-iop32x/Kconfig |8 ++ arch/arm/mach-iop32x/ep80219.c |4 + arch/arm/mach-iop32x/iq31244.c |5 + arch/arm/mach-iop32x/iq80321.c |5 + arch/arm/mach-iop33x/Kconfig |8 ++ arch/arm/mach-iop33x/iq80331.c |5 + arch/arm/mach-iop33x/iq80332.c |4 + arch/arm/plat-iop/pci.c | 140 ++ include/asm-arm/arch-iop32x/iop32x.h |9 ++ include/asm-arm/arch-iop32x/memory.h |4 - include/asm-arm/arch-iop33x/iop33x.h | 10 ++ include/asm-arm/arch-iop33x/memory.h |4 - include/asm-arm/hardware/iop3xx.h| 20 - 13 files changed, 214 insertions(+), 12 deletions(-) diff --git a/arch/arm/mach-iop32x/Kconfig b/arch/arm/mach-iop32x/Kconfig index 05549a5..b2788e3 100644 --- a/arch/arm/mach-iop32x/Kconfig +++ b/arch/arm/mach-iop32x/Kconfig @@ -22,6 +22,14 @@ config ARCH_IQ80321 Say Y here if you want to run your kernel on the Intel IQ80321 evaluation kit for the IOP321 processor. +config IOP3XX_ATU +bool "Enable the PCI Controller" +default y +help + Say Y here if you want the IOP to initialize its PCI Controller. + Say N if the IOP is an add in card, the host system owns the PCI + bus in this case. 
+ endmenu endif diff --git a/arch/arm/mach-iop32x/ep80219.c b/arch/arm/mach-iop32x/ep80219.c index f616d3e..1a5c586 100644 --- a/arch/arm/mach-iop32x/ep80219.c +++ b/arch/arm/mach-iop32x/ep80219.c @@ -100,7 +100,7 @@ ep80219_pci_map_irq(struct pci_dev *dev, static struct hw_pci ep80219_pci __initdata = { .swizzle= pci_std_swizzle, - .nr_controllers = 1, + .nr_controllers = 0, .setup = iop3xx_pci_setup, .preinit= iop3xx_pci_preinit, .scan = iop3xx_pci_scan_bus, @@ -109,6 +109,8 @@ static struct hw_pci ep80219_pci __initd static int __init ep80219_pci_init(void) { + if (iop3xx_get_init_atu() == IOP3XX_INIT_ATU_ENABLE) + ep80219_pci.nr_controllers = 1; #if 0 if (machine_is_ep80219()) pci_common_init(&ep80219_pci); diff --git a/arch/arm/mach-iop32x/iq31244.c b/arch/arm/mach-iop32x/iq31244.c index 967a696..25d5d62 100644 --- a/arch/arm/mach-iop32x/iq31244.c +++ b/arch/arm/mach-iop32x/iq31244.c @@ -97,7 +97,7 @@ iq31244_pci_map_irq(struct pci_dev *dev, static struct hw_pci iq31244_pci __initdata = { .swizzle= pci_std_swizzle, - .nr_controllers = 1, + .nr_controllers = 0, .setup = iop3xx_pci_setup, .preinit= iop3xx_pci_preinit, .scan = iop3xx_pci_scan_bus, @@ -106,6 +106,9 @@ static struct hw_pci iq31244_pci __initd static int __init iq31244_pci_init(void) { + if (iop3xx_get_init_atu() == IOP3XX_INIT_ATU_ENABLE) + iq31244_pci.nr_controllers = 1; + if (machine_is_iq31244()) pci_common_init(&iq31244_pci); diff --git a/arch/arm/mach-iop32x/iq80321.c b/arch/arm/mach-iop32x/iq80321.c index ef4388c..cdd2265 100644 --- a/arch/arm/mach-iop32x/iq80321.c +++ b/arch/arm/mach-iop32x/iq80321.c @@ -97,7 +97,7 @@ iq80321_pci_map_irq(struct pci_dev *dev, static struct hw_pci iq80321_pci __initdata = { .swizzle= pci_std_swizzle, - .nr_controllers = 1, + .nr_controllers = 0, .setup = iop3xx_pci_setup, .preinit= iop3xx_pci_preinit, .scan = iop3xx_pci_scan_bus, @@ -106,6 +106,9 @@ static struct hw_pci iq80321_pci __initd static int __init iq80321_pci_init(void) { + if (iop3xx_get_init_atu() 
== IOP3XX_INIT_ATU_ENABLE) + iq80321_pci.nr_controllers = 1; + if (machine_is_iq80321()) pci_common_init(&iq80321_pci); diff --git a/arch/arm/mach-iop33x/Kconfig b/arch/arm/mach-iop33x/Kconfig index 9aa016b..45598e0 100644 --- a/arch/arm/mach-iop33x/Kconfig +++ b/arch/arm/mach-iop33x/Kconfig @@ -16,6 +16,14 @@ config MACH_IQ80332 Say Y here if you want to run your kernel on the Intel IQ80332 evaluation kit for the IOP332 chipset. +config IOP3XX_ATU + bool "Enabl
[PATCH 08/19] dmaengine: enable multiple clients and operations
From: Dan Williams <[EMAIL PROTECTED]> Enable the dmaengine interface to allow multiple clients to share a channel, and enable clients to request channels based on an operations capability mask. This prepares the interface for use with the RAID5 client and the future RAID6 client. Multi-client support is achieved by modifying channels to maintain a list of peer clients. Multi-operation support is achieved by modifying clients to maintain lists of channel references. Channel references in a given request list satisfy a client specified capability mask. Changelog: * make the dmaengine api EXPORT_SYMBOL_GPL * zero sum support should be standalone, not integrated into xor Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 357 - drivers/dma/ioatdma.c | 12 +- include/linux/dmaengine.h | 164 ++--- net/core/dev.c| 21 +-- net/ipv4/tcp.c|4 - 5 files changed, 443 insertions(+), 115 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1527804..e10f19d 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -37,8 +37,13 @@ * Each device has a channels list, which runs unlocked but is never modified * once the device is registered, it's just setup by the driver. * - * Each client has a channels list, it's only modified under the client->lock - * and in an RCU callback, so it's safe to read under rcu_read_lock(). + * Each client has 'n' lists of channel references where + * n == DMA_MAX_CHAN_TYPE_REQ. These lists are only modified under the + * client->lock and in an RCU callback, so they are safe to read under + * rcu_read_lock(). + * + * Each channel has a list of peer clients, it's only modified under the + * chan->lock. This allows a channel to be shared amongst several clients * * Each device has a kref, which is initialized to 1 when the device is * registered. A kref_put is done for each class_device registered. 
When the @@ -85,6 +90,18 @@ static ssize_t show_memcpy_count(struct return sprintf(buf, "%lu\n", count); } +static ssize_t show_xor_count(struct class_device *cd, char *buf) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + unsigned long count = 0; + int i; + + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->xor_count; + + return sprintf(buf, "%lu\n", count); +} + static ssize_t show_bytes_transferred(struct class_device *cd, char *buf) { struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); @@ -97,16 +114,37 @@ static ssize_t show_bytes_transferred(st return sprintf(buf, "%lu\n", count); } +static ssize_t show_bytes_xor(struct class_device *cd, char *buf) +{ + struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); + unsigned long count = 0; + int i; + + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->bytes_xor; + + return sprintf(buf, "%lu\n", count); +} + static ssize_t show_in_use(struct class_device *cd, char *buf) { + unsigned int clients = 0; + struct list_head *peer; struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev); - return sprintf(buf, "%d\n", (chan->client ? 1 : 0)); + rcu_read_lock(); + list_for_each_rcu(peer, &chan->peers) + clients++; + rcu_read_unlock(); + + return sprintf(buf, "%d\n", clients); } static struct class_device_attribute dma_class_attrs[] = { __ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL), + __ATTR(xor_count, S_IRUGO, show_xor_count, NULL), __ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL), + __ATTR(bytes_xor, S_IRUGO, show_bytes_xor, NULL), __ATTR(in_use, S_IRUGO, show_in_use, NULL), __ATTR_NULL }; @@ -130,34 +168,79 @@ static struct class dma_devclass = { /** * dma_client_chan_alloc - try to allocate a channel to a client * @client: &dma_client + * @req: request descriptor * * Called with dma_list_mutex held. 
*/ -static struct dma_chan *dma_client_chan_alloc(struct dma_client *client) +static struct dma_chan *dma_client_chan_alloc(struct dma_client *client, + struct dma_req *req) { struct dma_device *device; struct dma_chan *chan; + struct dma_client_chan_peer *peer; + struct dma_chan_client_ref *chan_ref; unsigned long flags; int desc; /* allocated descriptor count */ + int allocated; /* flag re-allocations */ - /* Find a channel, any DMA engine will do */ + /* Find a channel */ list_for_each_entry(device, &dma_device_list, global_node) { + if ((req->cap_m
[PATCH 03/19] raid5: move check parity operations to a workqueue
From: Dan Williams <[EMAIL PROTECTED]> Enable handle_stripe5 to pass off check parity operations to raid5_do_soft_block_ops formerly handled by compute_parity5. Changelog: * removed handle_check_operations5. All logic moved into handle_stripe5 so that we do not need to go through the initiation logic to end the operation. * clear the uptodate bit on the parity block * hold off check operations if a parity dependent operation is in flight like a write Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 60 1 files changed, 42 insertions(+), 18 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e39d248..24ed4d8 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2121,35 +2121,59 @@ #endif locked += handle_write_operations5(sh, rcw); } - /* maybe we need to check and possibly fix the parity for this stripe -* Any reads will already have been scheduled, so we just see if enough data -* is available + /* 1/ Maybe we need to check and possibly fix the parity for this stripe. +*Any reads will already have been scheduled, so we just see if enough data +*is available. 
+* 2/ Hold off parity checks while parity dependent operations are in flight +*(RCW and RMW are protected by 'locked') */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state)) { + if ((syncing && locked == 0 && + !test_bit(STRIPE_INSYNC, &sh->state)) || + test_bit(STRIPE_OP_CHECK, &sh->state)) { + set_bit(STRIPE_HANDLE, &sh->state); + /* Take one of the following actions: +* 1/ start a check parity operation if (uptodate == disks) +* 2/ finish a check parity operation and act on the result +*/ if (failed == 0) { - BUG_ON(uptodate != disks); - compute_parity5(sh, CHECK_PARITY); - uptodate--; - if (page_is_zero(sh->dev[sh->pd_idx].page)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ + if (!test_bit(STRIPE_OP_CHECK, &sh->state)) { + BUG_ON(uptodate != disks); + set_bit(STRIPE_OP_CHECK, &sh->state); + set_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.pending++; + uptodate--; + } else if (test_and_clear_bit(STRIPE_OP_CHECK_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_CHECK, &sh->state); + + if (test_and_clear_bit(STRIPE_OP_CHECK_IsZero, + &sh->ops.state)) + /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); else { - compute_block(sh, sh->pd_idx); - uptodate++; + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + compute_block(sh, sh->pd_idx); + uptodate++; + } } } } - if (!test_bit(STRIPE_INSYNC, &sh->state)) { + + /* Wait for check parity operations to complete +* before write-back +*/ + if (!test_bit(STRIPE_INSYNC, &sh->state) && + !test_bit(STRIPE_OP_CHECK, &sh->state)) { + /* either
[PATCH 05/19] raid5: move read completion copies to a workqueue
From: Dan Williams <[EMAIL PROTECTED]> Enable handle_stripe5 to hand off the memory copy operations that satisfy read requests to raid5_do_soft_blocks_ops, formerly this was handled in line within handle_stripe5. It adds a 'read' (past tense) pointer to the r5dev structure to to track reads that have been offloaded to the workqueue. When the copy operation is complete the 'read' pointer is reused as the return_bi for the bi_end_io() call. Changelog: * dev->read only holds reads that have been satisfied, previously it doubled as a request queue to the operations routine * added R5_ReadReq to mark the blocks that belong to a given bio fill operation * requested reads no longer count towards the 'to_read' count, 'to_fill' tracks the number of requested reads Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 67 +--- 1 files changed, 38 insertions(+), 29 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0c39203..1a8dfd2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -240,11 +240,11 @@ static void init_stripe(struct stripe_he for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -1749,7 +1749,7 @@ static void handle_stripe5(struct stripe struct bio *bi; int i; int syncing, expanding, expanded; - int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; + int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0, to_fill=0; int compute=0, non_overwrite=0, write_complete=0; int failed_num=0; struct r5dev *dev; @@ -1765,44 +1765,47 @@ static void handle_stripe5(struct stripe syncing = 
test_bit(STRIPE_SYNCING, &sh->state); expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ + if (test_bit(STRIPE_OP_BIOFILL, &sh->state) && + test_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state)) { + clear_bit(STRIPE_OP_BIOFILL, &sh->state); + clear_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state); + } + + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); + PRINTK("check %d: state 0x%lx toread %p read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->read, dev->towrite, dev->written); + + /* maybe we can acknowledge completion of a biofill operation */ + if (test_bit(R5_ReadReq, &dev->flags) && !dev->toread) + clear_bit(R5_ReadReq, &dev->flags); + /* maybe we can reply to a read */ + if (dev->read && !test_bit(R5_ReadReq, &dev->flags) && + !test_bit(STRIPE_OP_BIOFILL, &sh->state)) { + return_bi = dev->read; + dev->read = NULL; + } + + /* maybe we can start a biofill operation */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && r
[PATCH 02/19] raid5: move write operations to a workqueue
From: Dan Williams <[EMAIL PROTECTED]> Enable handle_stripe5 to pass off write operations to raid5_do_soft_blocks_ops (which can be run as a workqueue). The operations moved are reconstruct-writes and read-modify-writes formerly handled by compute_parity5. Changelog: * moved raid5_do_soft_block_ops changes into a separate patch * changed handle_write_operations5 to only initiate write operations, which prevents new writes from being requested while the current one is in flight * all blocks undergoing a write are now marked locked and !uptodate at the beginning of the write operation * blocks undergoing a read-modify-write need a request flag to distinguish them from blocks that are locked for reading. Reconstruct-writes still use the R5_LOCKED bit to select blocks for the operation * integrated the work queue Kconfig option Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/Kconfig | 21 + drivers/md/raid5.c | 192 ++-- include/linux/raid/raid5.h |3 + 3 files changed, 190 insertions(+), 26 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index bf869ed..2a16b3b 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -162,6 +162,27 @@ config MD_RAID5_RESHAPE There should be enough spares already present to make the new array workable. +config MD_RAID456_WORKQUEUE + depends on MD_RAID456 + bool "Offload raid work to a workqueue from raid5d" + ---help--- + This option enables raid work (block copy and xor operations) + to run in a workqueue. If your platform has a high context + switch penalty say N. If you are using hardware offload or + are running on an SMP platform say Y. + + If unsure say, Y. + +config MD_RAID456_WORKQUEUE_MULTITHREAD + depends on MD_RAID456_WORKQUEUE && SMP + bool "Enable multi-threaded raid processing" + default y + ---help--- + This option controls whether the raid workqueue will be multi- + threaded or single threaded. + + If unsure say, Y. 
+ config MD_MULTIPATH tristate "Multipath I/O support" depends on BLK_DEV_MD diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8fde62b..e39d248 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -222,6 +222,8 @@ static void init_stripe(struct stripe_he BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); + BUG_ON(sh->ops.state); + BUG_ON(sh->ops.pending); CHECK_DEVLOCK(); PRINTK("init_stripe called, stripe %llu\n", @@ -331,6 +333,9 @@ static int grow_one_stripe(raid5_conf_t memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); + #ifdef CONFIG_MD_RAID456_WORKQUEUE + INIT_WORK(&sh->ops.work, conf->do_block_ops, sh); + #endif if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -1266,7 +1271,72 @@ static void compute_block_2(struct strip } } +static int handle_write_operations5(struct stripe_head *sh, int rcw) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked=0; + + if (rcw == 0) { + /* skip the drain operation on an expand */ + if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) { + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state); + for (i=disks ; i-- ;) { + set_bit(R5_LOCKED, &sh->dev[i].flags); + locked++; + } + } else { /* enter stage 1 of reconstruct write operation */ + set_bit(STRIPE_OP_RCW, &sh->state); + set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state); + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } + } else { + /* enter stage 1 of read modify write operation */ + BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); + + set_bit(STRIPE_OP_RMW, &sh->state); + set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state); + for (i=disks ; i-- ;) { +
[PATCH 07/19] raid5: remove compute_block and compute_parity5
From: Dan Williams <[EMAIL PROTECTED]> replaced by the workqueue implementation Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 123 1 files changed, 0 insertions(+), 123 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a07b52b..ad6883b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -964,129 +964,6 @@ #define check_xor() do { \ } while(0) -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *p; - - PRINTK("compute_block, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - ptr[0] = page_address(sh->dev[dd_idx].page); - memset(ptr[0], 0, STRIPE_SIZE); - count = 1; - for (i = disks ; i--; ) { - if (i == dd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk(KERN_ERR "compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); -} - -static void compute_parity5(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS]; - struct bio *chosen; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - count = 1; - ptr[0] = page_address(sh->dev[pd_idx].page); - switch(method) { - case READ_MODIFY_WRITE: - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); - for (i=disks ; i-- ;) { - if (i==pd_idx) - continue; - if (sh->dev[i].towrite && - test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - ptr[count++] = page_address(sh->dev[i].page); - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - 
sh->dev[i].written = chosen; - check_xor(); - } - } - break; - case RECONSTRUCT_WRITE: - memset(ptr[0], 0, STRIPE_SIZE); - for (i= disks; i-- ;) - if (i!=pd_idx && sh->dev[i].towrite) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - break; - } - if (count>1) { - xor_block(count, STRIPE_SIZE, ptr); - count = 1; - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); - } - break; -
Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction
On 9/11/06, Jeff Garzik <[EMAIL PROTECTED]> wrote: Dan Williams wrote: > Neil, > > The following patches implement hardware accelerated raid5 for the Intel > Xscale(r) series of I/O Processors. The MD changes allow stripe > operations to run outside the spin lock in a work queue. Hardware > acceleration is achieved by using a dma-engine-aware work queue routine > instead of the default software only routine. > > Since the last release of the raid5 changes many bug fixes and other > improvements have been made as a result of stress testing. See the per > patch change logs for more information about what was fixed. This > release is the first release of the full dma implementation. > > The patches touch 3 areas, the md-raid5 driver, the generic dmaengine > interface, and a platform device driver for IOPs. The raid5 changes > follow your comments concerning making the acceleration implementation > similar to how the stripe cache handles I/O requests. The dmaengine > changes are the second release of this code. They expand the interface > to handle more than memcpy operations, and add a generic raid5-dma > client. The iop-adma driver supports dma memcpy, xor, xor zero sum, and > memset across all IOP architectures (32x, 33x, and 13xx). > > Concerning the context switching performance concerns raised at the > previous release, I have observed the following. For the hardware > accelerated case it appears that performance is always better with the > work queue than without since it allows multiple stripes to be operated > on simultaneously. I expect the same for an SMP platform, but so far my > testing has been limited to IOPs. For a single-processor > non-accelerated configuration I have not observed performance > degradation with work queue support enabled, but in the Kconfig option > help text I recommend disabling it (CONFIG_MD_RAID456_WORKQUEUE). > > Please consider the patches for -mm. 
> > -Dan > > [PATCH 01/19] raid5: raid5_do_soft_block_ops > [PATCH 02/19] raid5: move write operations to a workqueue > [PATCH 03/19] raid5: move check parity operations to a workqueue > [PATCH 04/19] raid5: move compute block operations to a workqueue > [PATCH 05/19] raid5: move read completion copies to a workqueue > [PATCH 06/19] raid5: move the reconstruct write expansion operation to a workqueue > [PATCH 07/19] raid5: remove compute_block and compute_parity5 > [PATCH 08/19] dmaengine: enable multiple clients and operations > [PATCH 09/19] dmaengine: reduce backend address permutations > [PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to clients > [PATCH 11/19] dmaengine: add memset as an asynchronous dma operation > [PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not support memcpy > [PATCH 13/19] dmaengine: add support for dma xor zero sum operations > [PATCH 14/19] dmaengine: add dma_sync_wait > [PATCH 15/19] dmaengine: raid5 dma client > [PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID engines > [PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs > [PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization > [PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver Can devices like drivers/scsi/sata_sx4.c or drivers/scsi/sata_promise.c take advantage of this? Promise silicon supports RAID5 XOR offload. If so, how? If not, why not? :) This is a frequently asked question, Alan Cox had the same one at OLS. The answer is "probably." The only complication I currently see is where/how the stripe cache is maintained. With the IOPs its easy because the DMA engines operate directly on kernel memory. With the Promise card I believe they have memory on the card and it's not clear to me if the XOR engines on the card can deal with host memory. 
Also, MD would need to be modified to handle a stripe cache located on a device, or somehow synchronize its local cache with card in a manner that is still able to beat software only MD. Jeff Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/19] dmaengine: enable multiple clients and operations
On 9/11/06, Jeff Garzik <[EMAIL PROTECTED]> wrote: Dan Williams wrote: > @@ -759,8 +755,10 @@ #endif > device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf; > device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg; > device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg; > - device->common.device_memcpy_complete = ioat_dma_is_complete; > - device->common.device_memcpy_issue_pending = ioat_dma_memcpy_issue_pending; > + device->common.device_operation_complete = ioat_dma_is_complete; > + device->common.device_xor_pgs_to_pg = dma_async_xor_pgs_to_pg_err; > + device->common.device_issue_pending = ioat_dma_memcpy_issue_pending; > + device->common.capabilities = DMA_MEMCPY; Are we really going to add a set of hooks for each DMA engine whizbang feature? What's the alternative? But, also see patch 9 "dmaengine: reduce backend address permutations" it relieves some of this pain. That will get ugly when DMA engines support memcpy, xor, crc32, sha1, aes, and a dozen other transforms. 
> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h > index c94d8f1..3599472 100644 > --- a/include/linux/dmaengine.h > +++ b/include/linux/dmaengine.h > @@ -20,7 +20,7 @@ > */ > #ifndef DMAENGINE_H > #define DMAENGINE_H > - > +#include > #ifdef CONFIG_DMA_ENGINE > > #include > @@ -65,6 +65,27 @@ enum dma_status { > }; > > /** > + * enum dma_capabilities - DMA operational capabilities > + * @DMA_MEMCPY: src to dest copy > + * @DMA_XOR: src*n to dest xor > + * @DMA_DUAL_XOR: src*n to dest_diag and dest_horiz xor > + * @DMA_PQ_XOR: src*n to dest_q and dest_p gf/xor > + * @DMA_MEMCPY_CRC32C: src to dest copy and crc-32c sum > + * @DMA_SHARE: multiple clients can use this channel > + */ > +enum dma_capabilities { > + DMA_MEMCPY = 0x1, > + DMA_XOR = 0x2, > + DMA_PQ_XOR = 0x4, > + DMA_DUAL_XOR= 0x8, > + DMA_PQ_UPDATE = 0x10, > + DMA_ZERO_SUM= 0x20, > + DMA_PQ_ZERO_SUM = 0x40, > + DMA_MEMSET = 0x80, > + DMA_MEMCPY_CRC32C = 0x100, Please use the more readable style that explicitly lists bits: DMA_MEMCPY = (1 << 0), DMA_XOR = (1 << 1), ... I prefer this as well, although at one point I was told (not by you) the absolute number was preferred when I was making changes to drivers/scsi/sata_vsc.c. In any event I'll change it... > +/** > * struct dma_chan_percpu - the per-CPU part of struct dma_chan > * @refcount: local_t used for open-coded "bigref" counting > * @memcpy_count: transaction counter > @@ -75,27 +96,32 @@ struct dma_chan_percpu { > local_t refcount; > /* stats */ > unsigned long memcpy_count; > + unsigned long xor_count; > unsigned long bytes_transferred; > + unsigned long bytes_xor; Clearly, each operation needs to be more compartmentalized. This just isn't scalable, when you consider all the possible transforms. Ok, one set of counters per op is probably overkill what about lumping operations into groups and just tracking at the group level? i.e. 
memcpy, memset -> string_count, string_bytes_transferred crc, sha1, aes -> hash_count, hash_transferred xor, pq_xor -> sum_count, sum_transferred Jeff Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction
On 9/11/06, Jeff Garzik <[EMAIL PROTECTED]> wrote: Dan Williams wrote: > This is a frequently asked question, Alan Cox had the same one at OLS. > The answer is "probably." The only complication I currently see is > where/how the stripe cache is maintained. With the IOPs its easy > because the DMA engines operate directly on kernel memory. With the > Promise card I believe they have memory on the card and it's not clear > to me if the XOR engines on the card can deal with host memory. Also, > MD would need to be modified to handle a stripe cache located on a > device, or somehow synchronize its local cache with card in a manner > that is still able to beat software only MD. sata_sx4 operates through [standard PC] memory on the card, and you use a DMA engine to copy memory to/from the card. [select chipsets supported by] sata_promise operates directly on host memory. So, while sata_sx4 is farther away from your direct-host-memory model, it also has much more potential for RAID acceleration: ideally, RAID1 just copies data to the card once, then copies the data to multiple drives from there. Similarly with RAID5, you can eliminate copies and offload XOR, presuming the drives are all connected to the same card. In the sata_promise case its straight forward, all that is needed is dmaengine drivers for the xor and memcpy engines. This would be similar to the current I/OAT model where dma resources are provided by a PCI function. The sata_sx4 case would need a different flavor of the dma_do_raid5_block_ops routine, one that understands where the cache is located. MD would also need the capability to bypass the block layer since the data will have already been transferred to the card by a stripe cache operation. The RAID1 case gives me pause because it seems any work along these lines requires that the implementation work for both MD and DM, which then eventually leads to being tasked with merging the two. 
Jeff Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 08/19] dmaengine: enable multiple clients and operations
On 9/11/06, Roland Dreier <[EMAIL PROTECTED]> wrote: Jeff> Are we really going to add a set of hooks for each DMA Jeff> engine whizbang feature? ...ok, but at some level we are going to need a file that has: EXPORT_SYMBOL_GPL(dma_whizbang_op1) . . . EXPORT_SYMBOL_GPL(dma_whizbang_opX) correct? Dan> What's the alternative? But, also see patch 9 "dmaengine: Dan> reduce backend address permutations" it relieves some of this Dan> pain. I guess you can pass an opcode into a common "start operation" function. But then we still have the problem of being able to request a memory copy operation of a channel that only understands xor, a la Jeff's comment to patch 12: "Further illustration of how this API growth is going wrong. You should create an API such that it is impossible for an XOR transform to ever call non-XOR-transform hooks." With all the memcpy / xor / crypto / etc. hardware out there already, we definitely have to get this interface right. - R. I understand what you are saying Jeff, the implementation can be made better, but something I think is valuable is the ability to write clients once like NET_DMA and RAID5_DMA and have them run without modification on any platform that can provide the engine interface rather than needing a client per architecture IOP_RAID5_DMA...FOO_X_RAID5_DMA. Or is this an example of where "Do What You Must, And No More" comes in, i.e. don't worry about making a generic RAID5_DMA while there is only one implementation in existence? I also want to pose the question of whether the dmaengine interface should handle cryptographic transforms? We already have Acrypto: http://tservice.net.ru/~s0mbre/blog/devel/acrypto/index.html. At the same time since IOPs can do Galois Field multiplication and XOR it would be nice to take advantage of that for crypto acceleration, but this does not fit the model of a device that Acrypto supports. 
Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction
On 9/13/06, Jakob Oestergaard <[EMAIL PROTECTED]> wrote: On Mon, Sep 11, 2006 at 04:00:32PM -0700, Dan Williams wrote: > Neil, > ... > > Concerning the context switching performance concerns raised at the > previous release, I have observed the following. For the hardware > accelerated case it appears that performance is always better with the > work queue than without since it allows multiple stripes to be operated > on simultaneously. I expect the same for an SMP platform, but so far my > testing has been limited to IOPs. For a single-processor > non-accelerated configuration I have not observed performance > degradation with work queue support enabled, but in the Kconfig option > help text I recommend disabling it (CONFIG_MD_RAID456_WORKQUEUE). Out of curiosity; how does accelerated compare to non-accelerated? One quick example: 4-disk SATA array rebuild on iop321 without acceleration - 'top' reports md0_resync and md0_raid5 dueling for the CPU each at ~50% utilization. With acceleration - 'top' reports md0_resync cpu utilization at ~90% with the rest split between md0_raid5 and md0_raid5_ops. The sync speed reported by /proc/mdstat is ~40% higher in the accelerated case. That being said, array resync is a special case, so your mileage may vary with other applications. I will put together some data from bonnie++, iozone, maybe contest, and post it on SourceForge. / jakob Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] dmaengine: clean up and abstract function types (was Re: [PATCH 08/19] dmaengine: enable multiple clients and operations)
On 9/15/06, Olof Johansson <[EMAIL PROTECTED]> wrote: On Fri, 15 Sep 2006 11:38:17 -0500 Olof Johansson <[EMAIL PROTECTED]> wrote: > On Mon, 11 Sep 2006 19:44:16 -0400 Jeff Garzik <[EMAIL PROTECTED]> wrote: > > Are we really going to add a set of hooks for each DMA engine whizbang > > feature? > > > > That will get ugly when DMA engines support memcpy, xor, crc32, sha1, > > aes, and a dozen other transforms. > > > Yes, it will be unmaintainable. We need some sort of multiplexing with > per-function registrations. > > Here's a first cut at it, just very quick. It could be improved further > but it shows that we could exorcise most of the hardcoded things pretty > easily. Ok, that was obviously a naive and not so nice first attempt, but I figured it was worth it to show how it can be done. This is a little more proper: Specify at client registration time what the function the client will use is, and make the channel use it. This way most of the error checking per call can be removed too. Chris/Dan: Please consider picking this up as a base for the added functionality and cleanups. Thanks for this Olof it has sparked some ideas about how to redo support for multiple operations. Clean up dmaengine a bit. Make the client registration specify which channel functions ("type") the client will use. Also, make devices register which functions they will provide. Also exorcise most of the memcpy-specific references from the generic dma engine code. There's still some left in the iov stuff. I think we should keep the operation type in the function name but drop all the [buf|pg|dma]_to_[buf|pg|dma] permutations. The buffer type can be handled generically across all operation types. Something like the following for a pg_to_buf memcpy. 
struct dma_async_op_memcpy *op; struct page *pg; void *buf; size_t len; dma_async_op_init_src_pg(op, pg); dma_async_op_init_dest_buf(op, buf); dma_async_memcpy(chan, op, len); -Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction
On 10/8/06, Neil Brown <[EMAIL PROTECTED]> wrote: On Monday September 11, [EMAIL PROTECTED] wrote: > Neil, > > The following patches implement hardware accelerated raid5 for the Intel > Xscale(r) series of I/O Processors. The MD changes allow stripe > operations to run outside the spin lock in a work queue. Hardware > acceleration is achieved by using a dma-engine-aware work queue routine > instead of the default software only routine. Hi Dan, Sorry for the delay in replying. I've looked through these patches at last (mostly the raid-specific bits) and while there is clearly a lot of good stuff here, it doesn't 'feel' right - it just seems too complex. The particular issues that stand out to me are: - 33 new STRIPE_OP_* flags. I'm sure there doesn't need to be that many new flags. - the "raid5 dma client" patch moves far too much internal knowledge about raid5 into drivers/dma. Clearly there are some complex issues being dealt with and some complexity is to be expected, but I feel there must be room for some serious simplification. A valid criticism. There was definitely a push to just get it functional, so I can now see how the complexity crept into the implementation. The primary cause was the choice to explicitly handle channel switching in raid5-dma. However, relieving "client" code from this responsibility is something I am taking care of in the async api changes. Let me try to describe how I envisage it might work. As you know, the theory-of-operation of handle_stripe is that it assesses the state of a stripe deciding what actions to perform and then performs them. Synchronous actions (e.g. current parity calcs) are performed 'in-line'. Async actions (reads, writes) and actions that cannot be performed under a spinlock (->b_end_io) are recorded as being needed and then are initiated at the end of handle_stripe outside of the sh->lock. The proposal is to bring the parity and other bulk-memory operations out of the spinlock and make them optionally asynchronous.
The set of tasks that might be needed to be performed on a stripe are: Clear a target cache block pre-xor various cache blocks into a target copy data out of bios into cache blocks. (drain) post-xor various cache blocks into a target copy data into bios out of cache blocks (fill) test if a cache block is all zeros start a read on a cache block start a write on a cache block (There is also a memcpy when expanding raid5. I think I would try to simply avoid that copy and move pointers around instead). Some of these steps require sequencing. e.g. clear, pre-xor, copy, post-xor, write for a rmw cycle. We could require handle_stripe to be called again for each step. i.e. first call just clears the target and flags it as clear. Next call initiates the pre-xor and flags that as done. Etc. However I think that would make the non-offloaded case too slow, or at least too clumsy. So instead we set flags to say what needs to be done and have a workqueue system that does it. (so far this is all quite similar to what you have done.)
So handle_stripe would set various flags and other things (like identify which block was the 'target' block) and run the following in a workqueue: raid5_do_stuff(struct stripe_head *sh) { raid5_conf_t *conf = sh->raid_conf; if (test_bit(CLEAR_TARGET, &sh->ops.pending)) { struct page *p = sh->dev[sh->ops.target].page; rv = async_memset(p, 0, 0, PAGE_SIZE, ops_done, sh); if (rv != BUSY) clear_bit(CLEAR_TARGET, &sh->ops.pending); if (rv != COMPLETE) goto out; } while (test_bit(PRE_XOR, &sh->ops.pending)) { struct page *plist[XOR_MAX]; int offset[XOR_MAX]; int pos = 0; int d; for (d = sh->ops.nextdev; d < conf->raid_disks && pos < XOR_MAX ; d++) { if (sh->ops.nextdev == sh->ops.target) continue; if (!test_bit(R5_WantPreXor, &sh->dev[d].flags)) continue; plist[pos] = sh->dev[d].page; offset[pos++] = 0; } if (pos) { struct page *p = sh->dev[sh->ops.target].page; rv = async_xor(p, 0, plist, offset, pos, PAGE_SIZE, ops_done, sh); if (rv != BUSY) sh->ops.nextdev = d; if (rv != COMPLETE) goto out; } else { clear_bit(PRE_XOR, &sh->ops.pending); sh->ops.nextdev = 0; }
Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction
On 9/14/06, Jakob Oestergaard <[EMAIL PROTECTED]> wrote: On Wed, Sep 13, 2006 at 12:17:55PM -0700, Dan Williams wrote: ... > >Out of curiosity; how does accelerated compare to non-accelerated? > > One quick example: > 4-disk SATA array rebuild on iop321 without acceleration - 'top' > reports md0_resync and md0_raid5 dueling for the CPU each at ~50% > utilization. > > With acceleration - 'top' reports md0_resync cpu utilization at ~90% > with the rest split between md0_raid5 and md0_raid5_ops. > > The sync speed reported by /proc/mdstat is ~40% higher in the accelerated > case. Ok, nice :) > > That being said, array resync is a special case, so your mileage may > vary with other applications. Every-day usage I/O performance data would be nice indeed :) > I will put together some data from bonnie++, iozone, maybe contest, > and post it on SourceForge. Great! I have posted some Iozone data and graphs showing the performance impact of the patches across the three iop processors iop321, iop331, and iop341. The general take away from the data is that using dma engines extends the region that Iozone calls the "buffer cache effect". Write performance benefited the most as expected, but read performance showed some modest gains as well. There are some regions (smaller file size and record length) that show a performance disadvantage but it is typically less than 5%. The graphs map the relative performance multiplier that the raid patches generate ('2.6.18-rc6 performance' x 'performance multiplier' = '2.6.18-rc6-raid performance') . A value of '1' designates equal performance. The large cliff that drops to zero is a "not measured" region, i.e. the record length is larger than the file size. Iozone outputs to Excel, but I have also made pdf's of the graphs available. Note: Openoffice-calc can view the data but it does not support the 3D surface graphs that Iozone uses. 
Excel: http://prdownloads.sourceforge.net/xscaleiop/iozone_raid_accel.xls?download PDF Graphs: http://prdownloads.sourceforge.net/xscaleiop/iop-iozone-graphs-20061010.tar.bz2?download Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 00/12] md raid acceleration and the async_tx api
Here is the latest version of the raid acceleration patch set. Since the last release I have created the async_tx api to address the concerns raised by Neil and Jeff. With this api in place the raid5 asynchronous and synchronous paths are no longer separated, i.e. there are no hardware specific concerns in the raid code. The async_tx api is proposed as a special dmaengine management client that allows offload engines to be used for bulk memory transfers/transforms, and fallback to synchronous routines when an engine is not present. This implementation has been tested on iop13xx and iop33x platforms in both the synchronous case and the asynchronous case with the iop-adma driver. The changes to the ioatdma driver have only been compile tested, and testing NET_DMA with iop-adma is pending. Please consider for -mm. These patches are against 2.6.19. Dan Williams: dmaengine: add base support for the async_tx api dmaengine: add the async_tx api dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines md: add raid5_run_ops and support routines md: workqueue for raid5 operations md: move write operations to raid5_run_ops md: move raid5 compute block operations to raid5_run_ops md: move raid5 parity checks to raid5_run_ops md: satisfy raid5 read requests via raid5_run_ops md: use async_tx and raid5_run_ops for raid5 expansion operations md: raid5 io requests to raid5_run_ops md: remove raid5 compute_block and compute_parity5 Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 09/12] md: satisfy raid5 read requests via raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> Use raid5_run_ops to carry out the memory copies for a raid5 read request. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 57 +--- 1 files changed, 32 insertions(+), 25 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1764fbb..3c793dc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2020,7 +2020,7 @@ static void handle_stripe5(struct stripe int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int compute=0, req_compute=0, non_overwrite=0; + int to_fill=0, compute=0, req_compute=0, non_overwrite=0; int failed_num=0; struct r5dev *dev; @@ -2035,42 +2035,45 @@ static void handle_stripe5(struct stripe syncing = test_bit(STRIPE_SYNCING, &sh->state); expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); - /* Now to look around and see what can be done */ + /* clear completed biofills */ + if (test_and_clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) { + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack); + clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending); + } + + /* Now to look around and see what can be done */ rcu_read_lock(); for (i=disks; i--; ) { mdk_rdev_t *rdev; dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); + PRINTK("check %d: state 0x%lx toread %p read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->read, dev->towrite, dev->written); + + /* maybe we can acknowledge completion of a biofill operation */ + if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread) + clear_bit(R5_Wantfill, &dev->flags); + /* maybe we can reply to a read */ + if (dev->read && !test_bit(R5_Wantfill, &dev->flags) && + !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) { + return_bi = dev->read; + dev->read = NULL; + } + + /* maybe we can start a 
biofill operation */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } + to_read--; + if (!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + set_bit(R5_Wantfill, &dev->flags); } /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_Wantfill, &dev->flags)) to_fill++; if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 1); if (dev->toread) to_read++; @@ -2094,9 +2097,13 @@ static void handle_stripe5(struct stripe set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); + + if (to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + sh->ops.count++; + PRINTK("locked=%d uptodate=%d to_read=%d" - &quo
[PATCH 10/12] md: use async_tx and raid5_run_ops for raid5 expansion operations
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_POSTXOR without setting STRIPE_OP_BIODRAIN to carry out the postxor operation required by the expansion process. This distinction is needed since all blocks will need to be written back to disk even though none of the blocks will have their 'written' pointer set. The bulk copy operation to the new stripe is handled by async_tx. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 48 1 files changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3c793dc..8b36611 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2582,18 +2582,32 @@ #endif } } - if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { - /* Need to write out all blocks after computing parity */ - sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); + /* Finish postxor operations initiated by the expansion +* process +*/ + if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && + !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { + + clear_bit(STRIPE_EXPANDING, &sh->state); + + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (expanded) { + } + + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); + locked += handle_write_operations5(sh, 0, 1); + } else if (expanded && !test_bit(STRIPE_OP_POSTXOR, 
&sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2604,6 +2618,7 @@ #endif /* We have read all the blocks in this stripe and now we need to * copy some of them into a target stripe for expand. */ + struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i=0; i< sh->disks; i++) if (i != sh->pd_idx) { @@ -2627,9 +2642,12 @@ #endif release_stripe(sh2); continue; } - memcpy(page_address(sh2->dev[dd_idx].page), - page_address(sh->dev[i].page), - STRIPE_SIZE); + + /* place all the copies on one channel */ + tx = async_memcpy(sh2->dev[dd_idx].page, + sh->dev[i].page, 0, 0, STRIPE_SIZE, + ASYNC_TX_DEP_ACK, tx, NULL, NULL); + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j=0; jraid_disks; j++) @@ -2641,6 +2659,12 @@ #endif set_bit(STRIPE_HANDLE, &sh2->state); } release_stripe(sh2); + + /* done submitting copies, wait for them to complete */ + if (i + 1 >= sh->disks) { + async_tx_ack(tx); + dma_wait_for_async_tx(tx); + } } } - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/12] dmaengine: add the async_tx api
From: Dan Williams <[EMAIL PROTECTED]> async_tx is an api to describe a series of bulk memory transfers/transforms. When possible these transactions are carried out by asynchrounous dma engines. The api handles inter-transaction dependencies and hides dma channel management from the client. When a dma engine is not present the transaction is carried out via synchronous software routines. Xor operations are handled by async_tx, to this end xor.c is moved into drivers/dma and is changed to take an explicit destination address and a series of sources to match the hardware engine implementation. When CONFIG_DMA_ENGINE is not set the asynchrounous path is compiled away. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/Makefile |3 drivers/dma/Kconfig | 16 + drivers/dma/Makefile |1 drivers/dma/async_tx.c | 921 ++ drivers/dma/xor.c| 153 drivers/md/Kconfig |2 drivers/md/Makefile |6 drivers/md/raid5.c | 18 - drivers/md/xor.c | 154 include/linux/async_tx.h | 181 + include/linux/raid/xor.h |5 11 files changed, 1287 insertions(+), 173 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 4ac14da..8b2460d 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -60,7 +60,6 @@ obj-$(CONFIG_I2C) += i2c/ obj-$(CONFIG_W1) += w1/ obj-$(CONFIG_HWMON)+= hwmon/ obj-$(CONFIG_PHONE)+= telephony/ -obj-$(CONFIG_MD) += md/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_ISDN) += isdn/ obj-$(CONFIG_EDAC) += edac/ @@ -77,3 +76,5 @@ obj-$(CONFIG_CRYPTO) += crypto/ obj-$(CONFIG_SUPERH) += sh/ obj-$(CONFIG_GENERIC_TIME) += clocksource/ obj-$(CONFIG_DMA_ENGINE) += dma/ +obj-$(CONFIG_ASYNC_TX_DMA) += dma/ +obj-$(CONFIG_MD)+= md/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 30d021d..c82ed5f 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -7,8 +7,8 @@ menu "DMA Engine support" config DMA_ENGINE bool "Support for DMA engines" ---help--- - DMA engines offload copy operations from the CPU to dedicated - hardware, allowing the copies to happen 
asynchronously. + DMA engines offload bulk memory operations from the CPU to dedicated + hardware, allowing the operations to happen asynchronously. comment "DMA Clients" @@ -22,6 +22,17 @@ config NET_DMA Since this is the main user of the DMA engine, it should be enabled; say Y here. +config ASYNC_TX_DMA + tristate "Asynchronous Bulk Memory Transfers/Transforms API" + default y + ---help--- + This enables the async_tx management layer for dma engines. + Subsystems coded to this API will use offload engines for bulk + memory operations where present. Software implementations are + called when a dma engine is not present or fails to allocate + memory to carry out the transaction. + Current subsystems ported to async_tx: MD_RAID4,5 + comment "DMA Devices" config INTEL_IOATDMA @@ -30,5 +41,4 @@ config INTEL_IOATDMA default m ---help--- Enable support for the Intel(R) I/OAT DMA engine. - endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index bdcfdbd..6a99341 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c new file mode 100644 index 000..00f72c0 --- /dev/null +++ b/drivers/dma/async_tx.c @@ -0,0 +1,921 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#include +#include +#include +#include +#include + +#define ASYNC_TX_DEBUG 0 +#define PRINTK(x...) ((void)(ASYNC_TX_D
[PATCH 01/12] dmaengine: add base support for the async_tx api
From: Dan Williams <[EMAIL PROTECTED]> * introduce struct dma_async_tx_descriptor as a common field for all dmaengine software descriptors * convert the device_memcpy_* methods into separate prep, set src/dest, and submit stages * support capabilities beyond memcpy (xor, memset, xor zero sum, completion interrupts) * convert ioatdma to the new semantics Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 44 ++-- drivers/dma/ioatdma.c | 256 ++-- drivers/dma/ioatdma.h |8 + include/linux/dmaengine.h | 263 ++--- 4 files changed, 394 insertions(+), 177 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1527804..8d203ad 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -210,7 +210,8 @@ static void dma_chans_rebalance(void) mutex_lock(&dma_list_mutex); list_for_each_entry(client, &dma_client_list, global_node) { - while (client->chans_desired > client->chan_count) { + while (client->chans_desired < 0 || + client->chans_desired > client->chan_count) { chan = dma_client_chan_alloc(client); if (!chan) break; @@ -219,7 +220,8 @@ static void dma_chans_rebalance(void) chan, DMA_RESOURCE_ADDED); } - while (client->chans_desired < client->chan_count) { + while (client->chans_desired >= 0 && + client->chans_desired < client->chan_count) { spin_lock_irqsave(&client->lock, flags); chan = list_entry(client->channels.next, struct dma_chan, @@ -294,12 +296,12 @@ void dma_async_client_unregister(struct * @number: count of DMA channels requested * * Clients call dma_async_client_chan_request() to specify how many - * DMA channels they need, 0 to free all currently allocated. + * DMA channels they need, 0 to free all currently allocated. A request + * < 0 indicates the client wants to handle all engines in the system. * The resulting allocations/frees are indicated to the client via the * event callback. 
*/ -void dma_async_client_chan_request(struct dma_client *client, - unsigned int number) +void dma_async_client_chan_request(struct dma_client *client, int number) { client->chans_desired = number; dma_chans_rebalance(); @@ -318,6 +320,31 @@ int dma_async_device_register(struct dma if (!device) return -ENODEV; + /* validate device routines */ + BUG_ON(test_bit(DMA_MEMCPY, &device->capabilities) && + !device->device_prep_dma_memcpy); + BUG_ON(test_bit(DMA_XOR, &device->capabilities) && + !device->device_prep_dma_xor); + BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) && + !device->device_prep_dma_zero_sum); + BUG_ON(test_bit(DMA_MEMSET, &device->capabilities) && + !device->device_prep_dma_memset); + BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) && + !device->device_prep_dma_interrupt); + + BUG_ON(!device->device_alloc_chan_resources); + BUG_ON(!device->device_free_chan_resources); + BUG_ON(!device->device_tx_submit); + BUG_ON(!device->device_set_dest); + BUG_ON(!device->device_set_src); + BUG_ON(!device->device_dependency_added); + BUG_ON(!device->device_is_tx_complete); + BUG_ON(!device->map_page); + BUG_ON(!device->map_single); + BUG_ON(!device->unmap_page); + BUG_ON(!device->unmap_single); + BUG_ON(!device->device_issue_pending); + init_completion(&device->done); kref_init(&device->refcount); device->dev_id = id++; @@ -402,11 +429,8 @@ subsys_initcall(dma_bus_init); EXPORT_SYMBOL(dma_async_client_register); EXPORT_SYMBOL(dma_async_client_unregister); EXPORT_SYMBOL(dma_async_client_chan_request); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_complete); -EXPORT_SYMBOL(dma_async_memcpy_issue_pending); +EXPORT_SYMBOL(dma_async_is_tx_complete); +EXPORT_SYMBOL(dma_async_issue_pending); EXPORT_SYMBOL(dma_async_device_register); EXPORT_SYMBOL(dma_async_device_unregister); EXPORT_SYMBOL(dma_chan_cleanup); diff --git a/drivers/dma/ioatdma.c 
b/drivers/dma/ioatdma.c index 0358419..ff7377d 100644 --- a/drivers/dma/ioat
[PATCH 04/12] md: add raid5_run_ops and support routines
From: Dan Williams <[EMAIL PROTECTED]> Prepare the raid5 implementation to use async_tx and a workqueue for running stripe operations: * biofill (copy data into request buffers to satisfy a read request) * compute block (generate a missing block in the cache from the other blocks) * prexor (subtract existing data as part of the read-modify-write process) * biodrain (copy data out of request buffers to satisfy a write request) * postxor (recalculate parity for new data that has entered the cache) * check (verify that the parity is correct) * io (submit i/o to the member disks) Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 560 include/linux/raid/raid5.h | 67 + 2 files changed, 619 insertions(+), 8 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 0c8ada5..232f525 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -52,6 +52,7 @@ #include #include "raid6.h" #include +#include /* * Stripe cache @@ -222,7 +223,8 @@ static void init_stripe(struct stripe_he BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - + BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + CHECK_DEVLOCK(); PRINTK("init_stripe called, stripe %llu\n", (unsigned long long)sh->sector); @@ -238,11 +240,11 @@ static void init_stripe(struct stripe_he for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -322,6 +324,556 @@ static struct stripe_head *get_active_st return sh; } +static int +raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error); +static int +raid5_end_write_request 
(struct bio *bi, unsigned int bytes_done, int error); + +static void ops_run_io(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int i; + + might_sleep(); + + for (i = sh->disks; i-- ;) { + int rw; + struct bio *bi; + mdk_rdev_t *rdev; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) + rw = 1; + else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + rw = 0; + else + continue; + + bi = &sh->dev[i].req; + + bi->bi_rw = rw; + if (rw) + bi->bi_end_io = raid5_end_write_request; + else + bi->bi_end_io = raid5_end_read_request; + + rcu_read_lock(); + rdev = rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (rdev) { + if (test_bit(STRIPE_SYNCING, &sh->state) || + test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || + test_bit(STRIPE_EXPAND_READY, &sh->state)) + md_sync_acct(rdev->bdev, STRIPE_SECTORS); + + bi->bi_bdev = rdev->bdev; + PRINTK("%s: stripe %llu schedule op %ld on disc %d\n", + __FUNCTION__, + (unsigned long long)sh->sector, bi->bi_rw, i); + atomic_inc(&sh->count); + bi->bi_sector = sh->sector + rdev->data_offset; + bi->bi_flags = 1 << BIO_UPTODATE; + bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; + bi->bi_idx = 0; + bi->bi_io_vec = &sh->dev[i].vec; + bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_offset = 0; + bi->bi_size
[PATCH 05/12] md: workqueue for raid5 operations
From: Dan Williams <[EMAIL PROTECTED]> Each raid5 device gets its own queue, and each stripe has its own work_struct. The goal is to have a free running raid5d thread, i.e. reduce the time the stripe lock is held by removing bulk memory operations, and removing the sleeping path in generic_make_request. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 37 + include/linux/raid/raid5.h |6 ++ 2 files changed, 39 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 232f525..c2312d1 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -126,6 +126,7 @@ static void __release_stripe(raid5_conf_ } md_wakeup_thread(conf->mddev->thread); } else { + BUG_ON(sh->ops.pending); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -324,6 +325,15 @@ static struct stripe_head *get_active_st return sh; } +static inline void issue_raid_ops(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + + atomic_inc(&sh->count); + conf->workqueue_stripes++; + queue_work(sh->raid_conf->workqueue, &sh->ops.work); +} + static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error); static int @@ -868,6 +878,10 @@ static void raid5_run_ops(void *stripe_h } else if (sh->ops.count < 0) BUG(); + /* we kick off work to the engines in batches */ + if (--(conf->workqueue_stripes) == 0) + async_tx_issue_pending_all(); + spin_unlock(&sh->lock); set_bit(STRIPE_HANDLE, &sh->state); @@ -883,6 +897,7 @@ static int grow_one_stripe(raid5_conf_t memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); sh->raid_conf = conf; spin_lock_init(&sh->lock); + INIT_WORK(&sh->ops.work, raid5_run_ops, sh); if (grow_buffers(sh, conf->raid_disks)) { shrink_buffers(sh, conf->raid_disks); @@ -1923,7 +1938,6 @@ static int stripe_to_pdidx(sector_t stri *schedule a write of some buffers *return 
confirmation of parity correctness * - * Parity calculations are done inside the stripe lock * buffers are taken off read_list or write_list, and bh_cache buffers * get BH_Lock set before the stripe lock is released. * @@ -1942,9 +1956,9 @@ static void handle_stripe5(struct stripe int failed_num=0; struct r5dev *dev; - PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", - (unsigned long long)sh->sector, atomic_read(&sh->count), - sh->pd_idx); + PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d ops=%lx:%lx:%lx\n", + (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), + sh->pd_idx, sh->ops.pending, sh->ops.ack, sh->ops.complete); spin_lock(&sh->lock); clear_bit(STRIPE_HANDLE, &sh->state); @@ -2409,6 +2423,10 @@ #endif } } + if (sh->ops.count && !test_and_set_bit(STRIPE_OPSQUEUE_ACTIVE, &sh->state)) { + issue_raid_ops(sh); + } + spin_unlock(&sh->lock); while ((bi=return_bi)) { @@ -3717,6 +3735,13 @@ static int run(mddev_t *mddev) if (!conf->spare_page) goto abort; } + + sprintf(conf->workqueue_name, "%s_raid5_ops", + mddev->gendisk->disk_name); + + if ((conf->workqueue = create_workqueue(conf->workqueue_name)) == NULL) + goto abort; + spin_lock_init(&conf->device_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); @@ -3726,6 +3751,7 @@ static int run(mddev_t *mddev) INIT_LIST_HEAD(&conf->inactive_list); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); + conf->workqueue_stripes = 0; PRINTK("raid5: run(%s) called.\n", mdname(mddev)); @@ -3879,6 +3905,8 @@ abort: safe_put_page(conf->spare_page); kfree(conf->disks); kfree(conf->stripe_hashtbl); + if (conf->workqueue) + destroy_workqueue(conf->workqueue); kfree(conf); } mddev->private = NULL; @@ -3899,6 +3927,7 @@ static int stop(mdd
[PATCH 08/12] md: move raid5 parity checks to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_CHECK to request a check operation in raid5_run_ops. If raid5_run_ops is able to perform the check with a dma engine the parity will be preserved and not re-read from disk. Check operations re-use the compute block facility to repair the parity. However since repairing the parity implies a write-back to disk the STRIPE_OP_MOD_REPAIR_PD flag is added to distinguish it from other compute block operations. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 81 1 files changed, 62 insertions(+), 19 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8510183..1764fbb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2465,32 +2465,75 @@ #endif locked += handle_write_operations5(sh, rcw, 0); } - /* maybe we need to check and possibly fix the parity for this stripe -* Any reads will already have been scheduled, so we just see if enough data -* is available + /* 1/ Maybe we need to check and possibly fix the parity for this stripe. +*Any reads will already have been scheduled, so we just see if enough data +*is available. 
+* 2/ Hold off parity checks while parity dependent operations are in flight +*(conflicting writes are protected by the 'locked' variable) */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state)) { + if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_INSYNC, &sh->state)) || + test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + set_bit(STRIPE_HANDLE, &sh->state); - if (failed == 0) { - BUG_ON(uptodate != disks); - compute_parity5(sh, CHECK_PARITY); - uptodate--; - if (page_is_zero(sh->dev[sh->pd_idx].page)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ + /* Take one of the following actions: +* 1/ start a check parity operation if (uptodate == disks) +* 2/ finish a check parity operation and act on the result +* 3/ skip to the writeback section if we previously +*initiated a recovery operation +*/ + if (failed == 0 && !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + uptodate--; + } else if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); else { - compute_block(sh, sh->pd_idx); - uptodate++; + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! 
*/ + set_bit(STRIPE_INSYNC, &sh->state); + else { + BUG_ON(test_and_set_bit( + STRIPE_OP_COMPUTE_BLK, + &sh->ops.pending)); + set_bit(STRIPE_OP_MOD_REPAIR_PD, +
[PATCH 03/12] dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines
From: Dan Williams <[EMAIL PROTECTED]> This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor, pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy operations. Changelog: * fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few slots to be requested eventually leading to data corruption * enabled the slot allocation routine to attempt to free slots before returning -ENOMEM * switched the cleanup routine to solely use the software chain and the status register to determine if a descriptor is complete. This is necessary to support other IOP engines that do not have status writeback capability * make the driver iop generic * modified the allocation routines to understand allocating a group of slots for a single operation * added a null xor initialization operation for the xor only channel on iop3xx * support xor operations on buffers larger than the hardware maximum * split the do_* routines into separate prep, src/dest set, submit stages * added async_tx support (dependent operations initiation at cleanup time) * simplified group handling * added interrupt support (callbacks via tasklets) Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/Kconfig |8 drivers/dma/Makefile|1 drivers/dma/iop-adma.c | 1522 +++ include/asm-arm/hardware/iop_adma.h | 116 +++ 4 files changed, 1647 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index c82ed5f..d61e3e5 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -41,4 +41,12 @@ config INTEL_IOATDMA default m ---help--- Enable support for the Intel(R) I/OAT DMA engine. + +config INTEL_IOP_ADMA +tristate "Intel IOP ADMA support" +depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX) +default m +---help--- + Enable support for the Intel(R) IOP Series RAID engines. 
+ endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 6a99341..8ebf10d 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c new file mode 100644 index 000..18fd7e3 --- /dev/null +++ b/drivers/dma/iop-adma.c @@ -0,0 +1,1522 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ + +/* + * This driver supports the asynchrounous DMA copy and RAID engines available + * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common) +#define to_iop_adma_device(dev) container_of(dev, struct iop_adma_device, common) +#define to_iop_adma_slot(lh) container_of(lh, struct iop_adma_desc_slot, slot_node) +#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, async_tx) + +#define IOP_ADMA_DEBUG 0 +#define PRINTK(x...) ((void)(IOP_ADMA_DEBUG && printk(x))) + +/* software zero sum implemenation bits for iop32x */ +#ifdef CONFIG_ARCH_IOP32X +char iop32x_zero_result_buffer[PAGE_SIZE] __attribute__((aligned(256))); +u32 *iop32x_zero_sum_output; +#endif + +/** + * iop_adma_free_slots - flags descriptor slots for reuse + * @slot: Slot to free + * Caller must hold &iop_chan->lock while calling this function + */ +static inline void iop_adma_free_slots(struct iop_adma_desc_slot *slot) +{ + int stride = slot->stride; + + while (stride--) { + slot->stride = 0; + slot = list_entry(slot->slot_node.next, + struct iop_adma_desc_slot, + slot_node); + } +} + +static inline dma_cookie_t +iop_adma_run_tx_complete_a
[PATCH 12/12] md: remove raid5 compute_block and compute_parity5
From: Dan Williams <[EMAIL PROTECTED]> replaced by raid5_run_ops Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 124 1 files changed, 0 insertions(+), 124 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7d75fbe..478741e 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1522,130 +1522,6 @@ #define check_xor() do { \ } \ } while(0) - -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *p; - - PRINTK("compute_block, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - ptr[0] = page_address(sh->dev[dd_idx].page); - memset(ptr[0], 0, STRIPE_SIZE); - count = 1; - for (i = disks ; i--; ) { - if (i == dd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk(KERN_ERR "compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count != 1) - xor_block(count, STRIPE_SIZE, ptr[0], &ptr[1]); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); -} - -static void compute_parity5(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS]; - struct bio *chosen; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - count = 1; - ptr[0] = page_address(sh->dev[pd_idx].page); - switch(method) { - case READ_MODIFY_WRITE: - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); - for (i=disks ; i-- ;) { - if (i==pd_idx) - continue; - if (sh->dev[i].towrite && - test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - ptr[count++] = page_address(sh->dev[i].page); - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - 
sh->dev[i].written = chosen; - check_xor(); - } - } - break; - case RECONSTRUCT_WRITE: - memset(ptr[0], 0, STRIPE_SIZE); - for (i= disks; i-- ;) - if (i!=pd_idx && sh->dev[i].towrite) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - break; - } - if (count>1) { - xor_block(count, STRIPE_SIZE, ptr[0], &ptr[1]); - count = 1; - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - che
[PATCH 07/12] md: move raid5 compute block operations to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_COMPUTE_BLK to request servicing from raid5_run_ops. It also sets a flag for the block being computed to let other parts of handle_stripe submit dependent operations. raid5_run_ops guarantees that the compute operation completes before any dependent operation starts. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 139 +--- 1 files changed, 100 insertions(+), 39 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 74516ef..8510183 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2020,7 +2020,7 @@ static void handle_stripe5(struct stripe int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; + int compute=0, req_compute=0, non_overwrite=0; int failed_num=0; struct r5dev *dev; @@ -2071,8 +2071,8 @@ static void handle_stripe5(struct stripe /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 1); - if (dev->toread) to_read++; if (dev->towrite) { to_write++; @@ -2227,40 +2227,91 @@ static void handle_stripe5(struct stripe * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || -(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || -syncing || -expanding || -(failed && (sh->dev[failed_num].toread || -(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags - ) - ) { - /* we would like to get this block, possibly -* by computing it, but we might not be able to + if (to_read || non_overwrite || (syncing && (uptodate + compute < disks)) || expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { + + /* Clear completed compute operations. Parity recovery +* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled +* later on in this routine +*/ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute +* is already in flight, or if the stripe contents are in the +* midst of changing due to a write +*/ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i=disks; i--;) { + dev = &sh->dev[i]; + + /* don't schedule compute operations or reads on +* the parity block while a check is in flight */ - if (uptodate == disks-1) { - PRINTK("Computing block %d\n", i); - compute_block(sh, i); - uptodate++; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); + if ((i == sh->pd_idx) && test_bit(STRIPE_OP_CHECK, &sh->ops.pending)) +
[PATCH 06/12] md: move write operations to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_PREXOR, STRIPE_OP_BIODRAIN, STRIPE_OP_POSTXOR to request a write to the stripe cache. raid5_run_ops is triggerred to run and executes the request outside the stripe lock. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 152 +--- 1 files changed, 131 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c2312d1..74516ef 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1830,7 +1830,75 @@ static void compute_block_2(struct strip } } +static int handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked=0; + + if (rcw == 0) { + /* skip the drain operation on an expand */ + if (!expand) { + BUG_ON(test_and_set_bit(STRIPE_OP_BIODRAIN, + &sh->ops.pending)); + sh->ops.count++; + } + + BUG_ON(test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)); + sh->ops.count++; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } else { + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + + BUG_ON(test_and_set_bit(STRIPE_OP_PREXOR, &sh->ops.pending) || + test_and_set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) || + test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)); + + sh->ops.count += 3; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (i==pd_idx) + continue; + /* For a read-modify write there may be blocks that are +* locked for reading while others are ready to be written +* so we distinguish these blocks by the R5_Wantprexor bit +*/ + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantprexor, &dev->flags); + set_bit(R5_LOCKED, &dev->flags); + clear_bit(R5_UPTODATE, 
&dev->flags); + locked++; + } + } + } + + /* keep the parity disk locked while asynchronous operations +* are in flight +*/ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + locked++; + + PRINTK("%s: stripe %llu locked: %d pending: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, sh->ops.pending); + + return locked; +} /* * Each stripe/dev can have one or more bion attached. @@ -2199,8 +2267,67 @@ #endif set_bit(STRIPE_HANDLE, &sh->state); } - /* now to consider writing and what else, if anything should be read */ - if (to_write) { + /* Now we check to see if any write operations have recently +* completed +*/ + + /* leave prexor set until postxor is done, allows us to distinguish +* a rmw from a rcw during biodrain +*/ + if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + + for (i=disks; i--;) + clear_bit(R5_Wantprexor, &sh->dev[i].flags); + } + + /* if only POSTXOR is set then this is an 'expand' postxor */ + if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete); + clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack); +
[PATCH 11/12] md: raid5 io requests to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> generic_make_request may sleep, moving io to raid5_run_ops allows raid5d to run freely. Since raid5_run_ops is a workqueue other cpus can make forward progress on other stripes. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 68 1 files changed, 10 insertions(+), 58 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8b36611..7d75fbe 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2431,6 +2431,8 @@ #endif PRINTK("Read_old block %d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2451,6 +2453,8 @@ #endif PRINTK("Read_old block %d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2550,6 +2554,8 @@ #endif set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); @@ -2571,12 +2577,16 @@ #endif dev = &sh->dev[failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); locked++; } @@ -2682,64 +2692,6 @@ #endif bi->bi_size = 0; bi->bi_end_io(bi, bytes, 0); } - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = 1; - else if 
(test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = 0; - else - continue; - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (syncing || expanding || expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", - (unsigned long long)sh->sector, bi->bi_rw, i); - atomic_inc(&sh->count); - bi->bi_sector = sh->sector + rdev->d
Re: [PATCH 02/12] dmaengine: add the async_tx api
+static inline void +do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, + unsigned int src_cnt, size_t len, enum async_tx_flags flags, + struct dma_async_tx_descriptor *depend_tx, + dma_async_tx_callback callback, void *callback_param) +{ + void *_dest; + int start_idx, i; + + printk("%s: len: %u\n", __FUNCTION__, len); Sorry, this should be PRINTK. Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Odd (slow) RAID performance
On 12/1/06, Bill Davidsen <[EMAIL PROTECTED]> wrote: Thank you so much for verifying this. I do keep enough room on my drives to run tests by creating any kind of whatever I need, but the point is clear: with N drives striped the transfer rate is N x base rate of one drive; with RAID-5 it is about the speed of one drive, suggesting that the md code serializes writes. If true, BOO, HISS! Can you explain and educate us, Neal? This look like terrible performance. Just curious what is your stripe_cache_size setting in sysfs? Neil, please include me in the education if what follows is incorrect: Read performance in kernels up to and including 2.6.19 is hindered by needing to go through the stripe cache. This situation should improve with the stripe-cache-bypass patches currently in -mm. As Raz reported in some cases the performance increase of this approach is 30% which is roughly equivalent to the performance difference I see of a 4-disk raid5 versus a 3-disk raid0. For the write case I can say that MD does not serialize writes. If by serialize you mean that there is 1:1 correlation between writes to the parity disk and writes to a data disk. To illustrate I instrumented MD to count how many times it issued a write to the parity disk and compared that to how many writes it performed to the member disks for the workload "dd if=/dev/zero of=/dev/md0 bs=1024k count=100". I recorded 8544 parity writes and 25600 member disk writes which is about 3 member disk writes per parity write, or pretty close to optimal for a 4-disk array. So, serialization is not the cause, performing sub-stripe width writes is not the cause as >98% of the writes happened without needing to read old data from the disks. However, I see the same performance on my system, about equal to a single disk. Here is where I step into supposition territory. Perhaps the discrepancy is related to the size of the requests going to the block layer. 
raid5 always makes page sized requests with the expectation that they will coalesce into larger requests in the block layer. Maybe we are missing coalescing opportunities in raid5 compared to what happens in the raid0 case? Are there any io scheduler knobs to turn along these lines? Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: raid5 software vs hardware: parity calculations?
On 1/12/07, James Ralston <[EMAIL PROTECTED]> wrote: On 2007-01-12 at 09:39-08 dean gaudet <[EMAIL PROTECTED]> wrote: > On Thu, 11 Jan 2007, James Ralston wrote: > > > I'm having a discussion with a coworker concerning the cost of > > md's raid5 implementation versus hardware raid5 implementations. > > > > Specifically, he states: > > > > > The performance [of raid5 in hardware] is so much better with > > > the write-back caching on the card and the offload of the > > > parity, it seems to me that the minor increase in work of having > > > to upgrade the firmware if there's a buggy one is a highly > > > acceptable trade-off to the increased performance. The md > > > driver still commits you to longer run queues since IO calls to > > > disk, parity calculator and the subsequent kflushd operations > > > are non-interruptible in the CPU. A RAID card with write-back > > > cache releases the IO operation virtually instantaneously. > > > > It would seem that his comments have merit, as there appears to be > > work underway to move stripe operations outside of the spinlock: > > > > http://lwn.net/Articles/184102/ > > > > What I'm curious about is this: for real-world situations, how > > much does this matter? In other words, how hard do you have to > > push md raid5 before doing dedicated hardware raid5 becomes a real > > win? > > hardware with battery backed write cache is going to beat the > software at small write traffic latency essentially all the time but > it's got nothing to do with the parity computation. I'm not convinced that's true. No, it's true. md implements a write-through cache to ensure that data reaches the disk. What my coworker is arguing is that md raid5 code spinlocks while it is performing this sequence of operations: 1. executing the write not performed under the lock 2. reading the blocks necessary for recalculating the parity not performed under the lock 3. recalculating the parity 4. 
updating the parity block My [admittedly cursory] read of the code, coupled with the link above, leads me to believe that my coworker is correct, which is why I was for trolling for [informed] opinions about how much of a performance hit the spinlock causes. The spinlock is not a source of performance loss, the reason for moving parity calculations outside the lock is to maximize the benefit of using asynchronous xor+copy engines. The hardware vs software raid trade-offs are well documented here: http://linux.yyz.us/why-software-raid.html Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 00/12] md raid acceleration and the async_tx api
On 1/18/07, Yuri Tikhonov <[EMAIL PROTECTED]> wrote: Hello, Dan. Hello. It seems there is a bug in your 06.11.30 raid acceleration patch-set. I tried to run the Linux s/w RAID-5 driver patched with your 06.11.30 patch-set and found that it fails during write operations when the RAID-5 array consists of 6 or more number of drives (I tested up to 8 drives). For 5 and less number of drives everything works as expected. There are no such problems with your 06.09.12 set of patches. Do you have any assumptions about the reasons of this fault? Yes, sorry, there were bugs in the synchronous path around handling > MAX_XOR_BLOCKS that I have fixed for the next rev of the patches. I'll be releasing them shortly, but attached is a patch to address the issue you are seeing. The kernel I used was 2.6.19, your 06.11.30 patch-set was applied without any warnings/errors. Here is the kernel Oops report: Oops: kernel access of bad area, sig: 11 [#1] NIP: C014F980 LR: C014FD0C CTR: 0080 REGS: eee49d40 TRAP: 0300 Not tainted (2.6.19-g0726acdc-dirty) MSR: 00029000 CR: 44002042 XER: 2000 DAR: 17970004, DSISR: TASK = eed5a7d0[280] 'md0_raid5_ops/0' THREAD: eee48000 GPR00: 007F EEE49DF0 EED5A7D0 0080 EEDFC000 19D7 1787 GPR08: 1000 C02B EEDFC000 C014F950 EEDFC000 3000 C015B8D8 1797 GPR16: C08AC180 C02B EEE0CB48 003A 000C 1000 0001 GPR24: C015B8D8 0004 003A EEDFC000 0004 19D7 1787 NIP [C014F980] xor_32regs_4+0x30/0x158 LR [C014FD0C] xor_block+0xc4/0x12c Call Trace: [EEE49E40] [EEE49E58] 0xeee49e58 [EEE49E50] [C014EFAC] async_xor+0x134/0x200 [EEE49EB0] [C015A960] ops_run_postxor+0xf8/0x198 [EEE49F00] [C0162458] raid5_run_ops+0x8dc/0x994 [EEE49F50] [C0029F7C] run_workqueue+0xa4/0x118 [EEE49F70] [C002A198] worker_thread+0xf8/0x13c [EEE49FC0] [C002E20C] kthread+0xf8/0x100 [EEE49FF0] [C0003DA0] kernel_thread+0x44/0x60 Instruction dump: 5463d97e 7c601b78 3400 9421ffb0 bde1000c 7c6903a6 7c8c2378 7caf2b78 7cde3378 7cff3b78 41800124 80ac <82ef0004> 82cf0008 82af000c 828f0010 Regards, Yuri. 
Thanks for testing the patches. Regards, Dan diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c index d918cc3..eee208d 100644 --- a/drivers/dma/async_tx.c +++ b/drivers/dma/async_tx.c @@ -324,9 +324,6 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, } #endif -#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common) -#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, async_tx) - static inline void async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx, enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx, @@ -423,17 +420,12 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, dma_async_tx_callback callback, void *callback_param) { void *_dest; - int start_idx, i; + int i; PRINTK("%s: len: %u\n", __FUNCTION__, len); /* reuse the 'src_list' array to convert to buffer pointers */ - if (flags & ASYNC_TX_XOR_DROP_DST) - start_idx = 1; - else - start_idx = 0; - - for (i = start_idx; i < src_cnt; i++) + for (i = 0; i < src_cnt; i++) src_list[i] = (struct page *) (page_address(src_list[i]) + offset); @@ -443,8 +435,8 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset, if (flags & ASYNC_TX_XOR_ZERO_DST) memset(_dest, 0, len); - xor_block(src_cnt - start_idx, len, _dest, - (void **) &src_list[start_idx]); + xor_block(src_cnt, len, _dest, + (void **) src_list); sync_epilog(flags, depend_tx, callback, callback_param); } @@ -514,7 +506,15 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset, goto xor_sync; } else { /* run the xor synchronously */ xor_sync: - /* process up to 'max_xor_blocks' sources */ + /* in the sync case the dest is an implied source + * (assumes the dest is at the src_off index) + */ + if (flags & ASYNC_TX_XOR_DROP_DST) { +src_cnt--; +src_off++; + } + + /* process up to 'MAX_XOR_BLOCKS' sources */ xor_src_cnt = min(src_cnt, (unsigned int) MAX_XOR_BLOCKS); /* if we are 
submitting additional xors @@ -540,9 +540,9 @@ xor_sync: __FUNCTION__); } - do_sync_xor(dest, &src_list[src_off], offset, src_cnt, -len, local_flags, depend_tx, _callback, -_callback_param); + do_sync_xor(dest, &src_list[src_off], offset, +xor_src_cnt, len, local_flags, depend_tx, +_callback, _callback_param); } /* the previous tx is hidden from the client, @@ -556,13 +556,15 @@ xor_sync: if (src_cnt > xor_src_cnt) { /* drop completed sources */ src_cnt -= xor_src_cnt; + src_off += xor_src_cnt; /* unconditionally preserve the destination */ flags &= ~ASYNC_TX_XOR_ZERO_DST; - /* use the intermediate result a source */ - src_off = xor_src_cnt - 1; - src_list[src_off] = dest; + /* use the i
Re: What is the exact meaning of Striped_Cache_Size?
On 1/21/07, Liang Yang <[EMAIL PROTECTED]> wrote: Hello, I have tried to increase the Striped_Cache_Size from 256 (default for my MD-RAID5 array) to 8192, it does improve the MD-RAID5 Write performance which varies with the size of I/O packet. However, I'm still not very clean the meaning and the potential performance impact of this Striped_Cache_Size? Is the unit for this parameter Byte or KiloByte? Could anyone here explain with a little bit more details? stripe_cache_size is the number of stripes in the cache. Each stripe (strip) is composed one PAGE_SIZE block per disk. If your page size is 4k and you have 4 disks in your array then a stripe_cache_size of 256 is 4k * 4 * 256 = 4MB. Increasing this number increases the chances that a write to the array will not generate reads to satisfy the parity calculation. Thanks, Liang Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: What is the exact meaning of Striped_Cache_Size?
On 1/21/07, Liang Yang <[EMAIL PROTECTED]> wrote: Dan, Thanks for your reply. Still get two questions left. Suppose I have a MD-RAID5 array which consists of 8 disks. 1. Do we need to consider the chunk size of the RAID array when we set the value of Striped_Cache_Size? For example, if the chunk size is changed from 64k to 256k, do we need to adjust the Striped_Cache_Size accordingly? stripe_cache_size and the chunk size are completely independent settings. The chunk size only determines how much data will be accessed from one disk before proceeding to the next disk. 2. The performance improvement of large size I/O packets (128k, 256k) is larger than small size I/O packets (512B, 1KB) when I change the Striped_Cache_Size. How do you explain the difference here? With smaller I/Os the chances that you are staying within one stripe are higher so you would see less benefit of having more stripes in the cache. Liang Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Kernel 2.6.19.2 New RAID 5 Bug (oops when writing Samba -> RAID5)
On 1/22/07, Neil Brown <[EMAIL PROTECTED]> wrote: On Monday January 22, [EMAIL PROTECTED] wrote: > Justin Piszcz wrote: > > My .config is attached, please let me know if any other information is > > needed and please CC (lkml) as I am not on the list, thanks! > > > > Running Kernel 2.6.19.2 on a MD RAID5 volume. Copying files over Samba to > > the RAID5 running XFS. > > > > Any idea what happened here? > > > Without digging too deeply, I'd say you've hit the same bug Sami Farin > and others > have reported starting with 2.6.19: pages mapped with kmap_atomic() > become unmapped > during memcpy() or similar operations. Try disabling preempt -- that > seems to be the > common factor. That is exactly the conclusion I had just come to (a kmap_atomic page must be being unmapped during memcpy). I wasn't aware that others had reported it - thanks for that. Turning off CONFIG_PREEMPT certainly seems like a good idea. Coming from an ARM background I am not yet versed in the inner workings of kmap_atomic, but if you have time for a question I am curious as to why spin_lock(&sh->lock) is not sufficient pre-emption protection for copy_data() in this case? NeilBrown Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 01/12] dmaengine: add base support for the async_tx api
From: Dan Williams <[EMAIL PROTECTED]> * introduce struct dma_async_tx_descriptor as a common field for all dmaengine software descriptors * convert the device_memcpy_* methods into separate prep, set src/dest, and submit stages * support capabilities beyond memcpy (xor, memset, xor zero sum, completion interrupts) * convert ioatdma to the new semantics Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 44 ++-- drivers/dma/ioatdma.c | 256 ++-- drivers/dma/ioatdma.h |8 + include/linux/dmaengine.h | 263 ++--- 4 files changed, 394 insertions(+), 177 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1527804..8d203ad 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -210,7 +210,8 @@ static void dma_chans_rebalance(void) mutex_lock(&dma_list_mutex); list_for_each_entry(client, &dma_client_list, global_node) { - while (client->chans_desired > client->chan_count) { + while (client->chans_desired < 0 || + client->chans_desired > client->chan_count) { chan = dma_client_chan_alloc(client); if (!chan) break; @@ -219,7 +220,8 @@ static void dma_chans_rebalance(void) chan, DMA_RESOURCE_ADDED); } - while (client->chans_desired < client->chan_count) { + while (client->chans_desired >= 0 && + client->chans_desired < client->chan_count) { spin_lock_irqsave(&client->lock, flags); chan = list_entry(client->channels.next, struct dma_chan, @@ -294,12 +296,12 @@ void dma_async_client_unregister(struct dma_client *client) * @number: count of DMA channels requested * * Clients call dma_async_client_chan_request() to specify how many - * DMA channels they need, 0 to free all currently allocated. + * DMA channels they need, 0 to free all currently allocated. A request + * < 0 indicates the client wants to handle all engines in the system. * The resulting allocations/frees are indicated to the client via the * event callback. 
*/ -void dma_async_client_chan_request(struct dma_client *client, - unsigned int number) +void dma_async_client_chan_request(struct dma_client *client, int number) { client->chans_desired = number; dma_chans_rebalance(); @@ -318,6 +320,31 @@ int dma_async_device_register(struct dma_device *device) if (!device) return -ENODEV; + /* validate device routines */ + BUG_ON(test_bit(DMA_MEMCPY, &device->capabilities) && + !device->device_prep_dma_memcpy); + BUG_ON(test_bit(DMA_XOR, &device->capabilities) && + !device->device_prep_dma_xor); + BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) && + !device->device_prep_dma_zero_sum); + BUG_ON(test_bit(DMA_MEMSET, &device->capabilities) && + !device->device_prep_dma_memset); + BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) && + !device->device_prep_dma_interrupt); + + BUG_ON(!device->device_alloc_chan_resources); + BUG_ON(!device->device_free_chan_resources); + BUG_ON(!device->device_tx_submit); + BUG_ON(!device->device_set_dest); + BUG_ON(!device->device_set_src); + BUG_ON(!device->device_dependency_added); + BUG_ON(!device->device_is_tx_complete); + BUG_ON(!device->map_page); + BUG_ON(!device->map_single); + BUG_ON(!device->unmap_page); + BUG_ON(!device->unmap_single); + BUG_ON(!device->device_issue_pending); + init_completion(&device->done); kref_init(&device->refcount); device->dev_id = id++; @@ -402,11 +429,8 @@ subsys_initcall(dma_bus_init); EXPORT_SYMBOL(dma_async_client_register); EXPORT_SYMBOL(dma_async_client_unregister); EXPORT_SYMBOL(dma_async_client_chan_request); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_complete); -EXPORT_SYMBOL(dma_async_memcpy_issue_pending); +EXPORT_SYMBOL(dma_async_is_tx_complete); +EXPORT_SYMBOL(dma_async_issue_pending); EXPORT_SYMBOL(dma_async_device_register); EXPORT_SYMBOL(dma_async_device_unregister); EXPORT_SYMBOL(dma_chan_cleanup); diff --git 
a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 8e87261..70b
[PATCH 08/12] md: satisfy raid5 read requests via raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> Use raid5_run_ops to carry out the memory copies for a raid5 read request. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 40 +++- 1 files changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2422253..db8925f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh) int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int compute=0, req_compute=0, non_overwrite=0; + int to_fill=0, compute=0, req_compute=0, non_overwrite=0; int failed_num=0; struct r5dev *dev; unsigned long pending=0; @@ -2004,34 +2004,20 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ + PRINTK("check %d: state 0x%lx toread %p read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->read, dev->towrite, dev->written); + + /* maybe we can start a biofill operation */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } + to_read--; + if (!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + set_bit(R5_Wantfill, &dev->flags); } /* now count 
some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_Wantfill, &dev->flags)) to_fill++; if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 1); if (dev->toread) to_read++; @@ -2055,9 +2041,13 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); + + if (to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + sh->ops.count++; + PRINTK("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d\n", - locked, uptodate, to_read, to_write, failed, failed_num); + " to_write=%d to_fill=%d failed=%d failed_num=%d\n", + locked, uptodate, to_read, to_write, to_fill, failed, failed_num); /* check if the array has lost two devices and, if so, some requests might * need to be failed */ - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 03/12] md: add raid5_run_ops and support routines
From: Dan Williams <[EMAIL PROTECTED]> Prepare the raid5 implementation to use async_tx for running stripe operations: * biofill (copy data into request buffers to satisfy a read request) * compute block (generate a missing block in the cache from the other blocks) * prexor (subtract existing data as part of the read-modify-write process) * biodrain (copy data out of request buffers to satisfy a write request) * postxor (recalculate parity for new data that has entered the cache) * check (verify that the parity is correct) * io (submit i/o to the member disks) Changelog: * removed ops_complete_biodrain in favor of ops_complete_postxor and ops_complete_write. * removed the workqueue * call bi_end_io for reads in ops_complete_biofill Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 520 include/linux/raid/raid5.h | 63 + 2 files changed, 580 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 68b6fea..e70ee17 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -52,6 +52,7 @@ #include "raid6.h" #include +#include /* * Stripe cache @@ -324,6 +325,525 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } +static int +raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error); +static int +raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error); + +static void ops_run_io(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, disks = sh->disks; + + might_sleep(); + + for (i=disks; i-- ;) { + int rw; + struct bio *bi; + mdk_rdev_t *rdev; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) + rw = WRITE; + else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + rw = READ; + else + continue; + + bi = &sh->dev[i].req; + + bi->bi_rw = rw; + if (rw == WRITE) + bi->bi_end_io = raid5_end_write_request; + else + bi->bi_end_io = raid5_end_read_request; + + rcu_read_lock(); + rdev = 
rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (rdev) { + if (test_bit(STRIPE_SYNCING, &sh->state) || + test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || + test_bit(STRIPE_EXPAND_READY, &sh->state)) + md_sync_acct(rdev->bdev, STRIPE_SECTORS); + + bi->bi_bdev = rdev->bdev; + PRINTK("%s: for %llu schedule op %ld on disc %d\n", + __FUNCTION__, (unsigned long long)sh->sector, + bi->bi_rw, i); + atomic_inc(&sh->count); + bi->bi_sector = sh->sector + rdev->data_offset; + bi->bi_flags = 1 << BIO_UPTODATE; + bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; + bi->bi_idx = 0; + bi->bi_io_vec = &sh->dev[i].vec; + bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_offset = 0; + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + generic_make_request(bi); + } else { + if (rw == WRITE) + set_bit(STRIPE_DEGRADED, &sh->state); + PRINTK("skip op %ld on disc %d for sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static struct dma_async_tx_descriptor * +async_copy_data(int frombio, struct bio *bio, struct page *page, sector_t sector, + struct dma_async_tx_descriptor *tx) +{ + struct bio_vec *bvl; + struct page *bio_page; + int i; + int page_
[PATCH 04/12] md: use raid5_run_ops for stripe cache operations
From: Dan Williams <[EMAIL PROTECTED]> Each stripe has three flag variables to reflect the state of operations (pending, ack, and complete). -pending: set to request servicing in raid5_run_ops -ack: set to reflect that raid5_runs_ops has seen this request -complete: set when the operation is complete and it is ok for handle_stripe5 to clear 'pending' and 'ack'. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 65 +--- 1 files changed, 56 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e70ee17..2c74f9b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -126,6 +126,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { + BUG_ON(sh->ops.pending); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -225,7 +226,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - + BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + CHECK_DEVLOCK(); PRINTK("init_stripe called, stripe %llu\n", (unsigned long long)sh->sector); @@ -241,11 +243,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -325,6 +327,43 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } +/* 
check_op() ensures that we only dequeue an operation once */ +#define check_op(op) do {\ + if (test_bit(op, &sh->ops.pending) &&\ + !test_bit(op, &sh->ops.complete)) {\ + if (test_and_set_bit(op, &sh->ops.ack))\ + clear_bit(op, &pending);\ + else\ + ack++;\ + } else\ + clear_bit(op, &pending);\ +} while(0) + +/* find new work to run, do not resubmit work that is already + * in flight + */ +static unsigned long get_stripe_work(struct stripe_head *sh) +{ + unsigned long pending; + int ack = 0; + + pending = sh->ops.pending; + + check_op(STRIPE_OP_BIOFILL); + check_op(STRIPE_OP_COMPUTE_BLK); + check_op(STRIPE_OP_PREXOR); + check_op(STRIPE_OP_BIODRAIN); + check_op(STRIPE_OP_POSTXOR); + check_op(STRIPE_OP_CHECK); + if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) + ack++; + + sh->ops.count -= ack; + BUG_ON(sh->ops.count < 0); + + return pending; +} + static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error); static int @@ -1859,7 +1898,6 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) *schedule a write of some buffers *return confirmation of parity correctness * - * Parity calculations are done inside the stripe lock * buffers are taken off read_list or write_list, and bh_cache buffers * get BH_Lock set before the stripe lock is released. * @@ -1877,10 +1915,11 @@ static void handle_stripe5(struct stripe_head *sh) int non_overwrite = 0; int failed_num=0; struct r5dev *dev; + unsigned long pending=0; - PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", - (unsigned long long)sh->sector, atomic_read(&sh->count), - sh->pd_idx); + PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d ops=%lx:%lx:%lx\n", + (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), + sh->pd_id
[PATCH 12/12] dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines
From: Dan Williams <[EMAIL PROTECTED]> This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor, pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy operations. Changelog: * fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few slots to be requested eventually leading to data corruption * enabled the slot allocation routine to attempt to free slots before returning -ENOMEM * switched the cleanup routine to solely use the software chain and the status register to determine if a descriptor is complete. This is necessary to support other IOP engines that do not have status writeback capability * make the driver iop generic * modified the allocation routines to understand allocating a group of slots for a single operation * added a null xor initialization operation for the xor only channel on iop3xx * support xor operations on buffers larger than the hardware maximum * split the do_* routines into separate prep, src/dest set, submit stages * added async_tx support (dependent operations initiation at cleanup time) * simplified group handling * added interrupt support (callbacks via tasklets) * brought the pending depth inline with ioat (i.e. 4 descriptors) Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/Kconfig |8 drivers/dma/Makefile|1 drivers/dma/iop-adma.c | 1511 +++ include/asm-arm/hardware/iop_adma.h | 116 +++ 4 files changed, 1636 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index c82ed5f..d61e3e5 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -41,4 +41,12 @@ config INTEL_IOATDMA default m ---help--- Enable support for the Intel(R) I/OAT DMA engine. + +config INTEL_IOP_ADMA +tristate "Intel IOP ADMA support" +depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX) +default m +---help--- + Enable support for the Intel(R) IOP Series RAID engines. 
+ endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 6a99341..8ebf10d 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c new file mode 100644 index 000..77f859e --- /dev/null +++ b/drivers/dma/iop-adma.c @@ -0,0 +1,1511 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ + +/* + * This driver supports the asynchrounous DMA copy and RAID engines available + * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common) +#define to_iop_adma_device(dev) container_of(dev, struct iop_adma_device, common) +#define to_iop_adma_slot(lh) container_of(lh, struct iop_adma_desc_slot, slot_node) +#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, async_tx) + +#define IOP_ADMA_DEBUG 0 +#define PRINTK(x...) ((void)(IOP_ADMA_DEBUG && printk(x))) + +/** + * iop_adma_free_slots - flags descriptor slots for reuse + * @slot: Slot to free + * Caller must hold &iop_chan->lock while calling this function + */ +static inline void iop_adma_free_slots(struct iop_adma_desc_slot *slot) +{ + int stride = slot->stride; + + while (stride--) { + slot->stride = 0; + slot = list_entry(slot->slot_node.next, + struct iop_adma_desc_slot, + slot_node); + } +} + +static inline dma_cookie_t +iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *iop_chan, dma_cookie_t cookie) +{ + BUG_ON(
[PATCH 07/12] md: move raid5 parity checks to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_CHECK to request a check operation in raid5_run_ops. If raid5_run_ops is able to perform the check with a dma engine the parity will be preserved in memory removing the need to re-read it from disk, as is necessary in the synchronous case. 'Repair' operations re-use the same logic as compute block, with the caveat that the results of the compute block are immediately written back to the parity disk. To differentiate these operations the STRIPE_OP_MOD_REPAIR_PD flag is added. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 81 1 files changed, 62 insertions(+), 19 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 279a30c..2422253 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2411,32 +2411,75 @@ static void handle_stripe5(struct stripe_head *sh) locked += handle_write_operations5(sh, rcw, 0); } - /* maybe we need to check and possibly fix the parity for this stripe -* Any reads will already have been scheduled, so we just see if enough data -* is available + /* 1/ Maybe we need to check and possibly fix the parity for this stripe. +*Any reads will already have been scheduled, so we just see if enough data +*is available. 
+* 2/ Hold off parity checks while parity dependent operations are in flight +*(conflicting writes are protected by the 'locked' variable) */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state)) { + if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_INSYNC, &sh->state)) || + test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + set_bit(STRIPE_HANDLE, &sh->state); - if (failed == 0) { - BUG_ON(uptodate != disks); - compute_parity5(sh, CHECK_PARITY); - uptodate--; - if (page_is_zero(sh->dev[sh->pd_idx].page)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ + /* Take one of the following actions: +* 1/ start a check parity operation if (uptodate == disks) +* 2/ finish a check parity operation and act on the result +* 3/ skip to the writeback section if we previously +*initiated a recovery operation +*/ + if (failed == 0 && !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + uptodate--; + } else if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); else { - compute_block(sh, sh->pd_idx); - uptodate++; + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + BUG_ON(test_and_set_bit( + STRIPE_OP_COMPUTE_BLK, + &sh->ops.pe
[PATCH 11/12] md: remove raid5 compute_block and compute_parity5
From: Dan Williams <[EMAIL PROTECTED]> replaced by raid5_run_ops Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 124 1 files changed, 0 insertions(+), 124 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8af084f..a981c35 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1480,130 +1480,6 @@ static void copy_data(int frombio, struct bio *bio, } \ } while(0) - -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *dest, *p; - - PRINTK("compute_block, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - dest = page_address(sh->dev[dd_idx].page); - memset(dest, 0, STRIPE_SIZE); - count = 0; - for (i = disks ; i--; ) { - if (i == dd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk(KERN_ERR "compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count) - xor_block(count, STRIPE_SIZE, dest, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); -} - -static void compute_parity5(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS], *dest; - struct bio *chosen; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - count = 0; - dest = page_address(sh->dev[pd_idx].page); - switch(method) { - case READ_MODIFY_WRITE: - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); - for (i=disks ; i-- ;) { - if (i==pd_idx) - continue; - if (sh->dev[i].towrite && - test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - ptr[count++] = page_address(sh->dev[i].page); - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - 
BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - check_xor(); - } - } - break; - case RECONSTRUCT_WRITE: - memset(dest, 0, STRIPE_SIZE); - for (i= disks; i-- ;) - if (i!=pd_idx && sh->dev[i].towrite) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - break; - } - if (count) { - xor_block(count, STRIPE_SIZE, dest, ptr); - count = 0; - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); -
[PATCH 10/12] md: move raid5 io requests to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe now only updates the state of stripes. All execution of operations is moved to raid5_run_ops. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 68 1 files changed, 10 insertions(+), 58 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1956b3c..8af084f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2360,6 +2360,8 @@ static void handle_stripe5(struct stripe_head *sh) PRINTK("Read_old block %d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2380,6 +2382,8 @@ static void handle_stripe5(struct stripe_head *sh) PRINTK("Read_old block %d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2479,6 +2483,8 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); @@ -2500,12 +2506,16 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); locked++; } @@ -2615,64 +2625,6 @@ static void handle_stripe5(struct stripe_head *sh) test_bit(BIO_UPTODATE, &bi->bi_flags) ? 
0 : -EIO); } - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (syncing || expanding || expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", -
[PATCH 09/12] md: use async_tx and raid5_run_ops for raid5 expansion operations
From: Dan Williams <[EMAIL PROTECTED]> The parity calculation for an expansion operation is the same as the calculation performed at the end of a write with the caveat that all blocks in the stripe are scheduled to be written. An expansion operation is identified as a stripe with the POSTXOR flag set and the BIODRAIN flag not set. The bulk copy operation to the new stripe is handled inline by async_tx. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 48 1 files changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index db8925f..1956b3c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2511,18 +2511,32 @@ static void handle_stripe5(struct stripe_head *sh) } } - if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { - /* Need to write out all blocks after computing parity */ - sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); + /* Finish postxor operations initiated by the expansion +* process +*/ + if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && + !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { + + clear_bit(STRIPE_EXPANDING, &sh->state); + + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (expanded) { + } + + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); + locked += handle_write_operations5(sh, 0, 1); + } else if 
(expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2533,6 +2547,7 @@ static void handle_stripe5(struct stripe_head *sh) /* We have read all the blocks in this stripe and now we need to * copy some of them into a target stripe for expand. */ + struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i=0; i< sh->disks; i++) if (i != sh->pd_idx) { @@ -2556,9 +2571,12 @@ static void handle_stripe5(struct stripe_head *sh) release_stripe(sh2); continue; } - memcpy(page_address(sh2->dev[dd_idx].page), - page_address(sh->dev[i].page), - STRIPE_SIZE); + + /* place all the copies on one channel */ + tx = async_memcpy(sh2->dev[dd_idx].page, + sh->dev[i].page, 0, 0, STRIPE_SIZE, + ASYNC_TX_DEP_ACK, tx, NULL, NULL); + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j=0; j < conf->raid_disks; j++) @@ -2570,6 +2588,12 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(STRIPE_HANDLE, &sh2->state); } release_stripe(sh2); + + /* done submitting copies, wait for them to complete */ + if (i + 1 >= sh->disks) { + async_tx_ack(tx); + dma_wait_for_async_tx(tx); + } } } - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 02/12] dmaengine: add the async_tx api
From: Dan Williams <[EMAIL PROTECTED]> async_tx is an api to describe a series of bulk memory transfers/transforms. When possible these transactions are carried out by asynchronous dma engines. The api handles inter-transaction dependencies and hides dma channel management from the client. When a dma engine is not present the transaction is carried out via synchronous software routines. Xor operations are handled by async_tx, to this end xor.c is moved into drivers/dma and is changed to take an explicit destination address and a series of sources to match the hardware engine implementation. When CONFIG_DMA_ENGINE is not set the asynchronous path is compiled away. Changelog: * fixed a leftover debug print * don't allow callbacks in async_interrupt_cond * fixed xor_block changes * fixed usage of ASYNC_TX_XOR_DROP_DEST Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/Makefile |1 drivers/dma/Kconfig | 16 + drivers/dma/Makefile |1 drivers/dma/async_tx.c | 910 ++ drivers/dma/xor.c| 153 drivers/md/Kconfig |2 drivers/md/Makefile |6 drivers/md/raid5.c | 52 +-- drivers/md/xor.c | 154 include/linux/async_tx.h | 180 + include/linux/raid/xor.h |5 11 files changed, 1291 insertions(+), 189 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 0dd96d1..7d55837 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_I2C) += i2c/ obj-$(CONFIG_W1) += w1/ obj-$(CONFIG_HWMON)+= hwmon/ obj-$(CONFIG_PHONE)+= telephony/ +obj-$(CONFIG_ASYNC_TX_DMA) += dma/ obj-$(CONFIG_MD) += md/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_ISDN) += isdn/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 30d021d..c82ed5f 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -7,8 +7,8 @@ menu "DMA Engine support" config DMA_ENGINE bool "Support for DMA engines" ---help--- - DMA engines offload copy operations from the CPU to dedicated - hardware, allowing the copies to happen asynchronously. 
+ DMA engines offload bulk memory operations from the CPU to dedicated + hardware, allowing the operations to happen asynchronously. comment "DMA Clients" @@ -22,6 +22,17 @@ config NET_DMA Since this is the main user of the DMA engine, it should be enabled; say Y here. +config ASYNC_TX_DMA + tristate "Asynchronous Bulk Memory Transfers/Transforms API" + default y + ---help--- + This enables the async_tx management layer for dma engines. + Subsystems coded to this API will use offload engines for bulk + memory operations where present. Software implementations are + called when a dma engine is not present or fails to allocate + memory to carry out the transaction. + Current subsystems ported to async_tx: MD_RAID4,5 + comment "DMA Devices" config INTEL_IOATDMA @@ -30,5 +41,4 @@ config INTEL_IOATDMA default m ---help--- Enable support for the Intel(R) I/OAT DMA engine. - endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index bdcfdbd..6a99341 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c new file mode 100644 index 000..eee208d --- /dev/null +++ b/drivers/dma/async_tx.c @@ -0,0 +1,910 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#include +#include +#include +#include +#include + +#define ASYNC_TX_DEBUG 0 +#define PRINTK(x...) ((void)(ASYNC_TX_DEBUG && printk(x))) + +#ifdef CONFIG_DMA_ENGINE +static struct dma_client *a
[PATCH 06/12] md: move raid5 compute block operations to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_COMPUTE_BLK to request servicing from raid5_run_ops. It also sets a flag for the block being computed to let other parts of handle_stripe submit dependent operations. raid5_run_ops guarantees that the compute operation completes before any dependent operation starts. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 125 +++- 1 files changed, 93 insertions(+), 32 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2390657..279a30c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh) int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; + int compute=0, req_compute=0, non_overwrite=0; int failed_num=0; struct r5dev *dev; unsigned long pending=0; @@ -2032,8 +2032,8 @@ static void handle_stripe5(struct stripe_head *sh) /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 1); - if (dev->toread) to_read++; if (dev->towrite) { to_write++; @@ -2188,31 +2188,82 @@ static void handle_stripe5(struct stripe_head *sh) * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || -(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || -syncing || -expanding || -(failed && (sh->dev[failed_num].toread || -(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags - ) - ) { - /* we would like to get this block, possibly -* by computing it, but we might not be able to + if (to_read || non_overwrite || (syncing && (uptodate + compute < disks)) || expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { + + /* Clear completed compute operations. Parity recovery +* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled +* later on in this routine +*/ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute +* is already in flight, or if the stripe contents are in the +* midst of changing due to a write +*/ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i=disks; i--;) { + dev = &sh->dev[i]; + + /* don't schedule compute operations or reads on +* the parity block while a check is in flight */ - if (uptodate == disks-1) { - PRINTK("Computing block %d\n", i); - compute_block(sh, i); - uptodate++; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; -
[PATCH 05/12] md: move write operations to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_PREXOR, STRIPE_OP_BIODRAIN, STRIPE_OP_POSTXOR to request a write to the stripe cache. raid5_run_ops is triggered to run and executes the request outside the stripe lock. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 152 +--- 1 files changed, 131 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2c74f9b..2390657 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1788,7 +1788,75 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } +static int handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked=0; + + if (rcw == 0) { + /* skip the drain operation on an expand */ + if (!expand) { + BUG_ON(test_and_set_bit(STRIPE_OP_BIODRAIN, + &sh->ops.pending)); + sh->ops.count++; + } + + BUG_ON(test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)); + sh->ops.count++; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } else { + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + + BUG_ON(test_and_set_bit(STRIPE_OP_PREXOR, &sh->ops.pending) || + test_and_set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) || + test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)); + + sh->ops.count += 3; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (i==pd_idx) + continue; + /* For a read-modify write there may be blocks that are +* locked for reading while others are ready to be written +* so we distinguish these blocks by the R5_Wantprexor bit +*/ + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantprexor, &dev->flags); + set_bit(R5_LOCKED, 
&dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } + + /* keep the parity disk locked while asynchronous operations +* are in flight +*/ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + locked++; + + PRINTK("%s: stripe %llu locked: %d pending: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, sh->ops.pending); + + return locked; +} /* * Each stripe/dev can have one or more bion attached. @@ -2151,8 +2219,67 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(STRIPE_HANDLE, &sh->state); } - /* now to consider writing and what else, if anything should be read */ - if (to_write) { + /* Now we check to see if any write operations have recently +* completed +*/ + + /* leave prexor set until postxor is done, allows us to distinguish +* a rmw from a rcw during biodrain +*/ + if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + + for (i=disks; i--;) + clear_bit(R5_Wantprexor, &sh->dev[i].flags); + } + + /* if only POSTXOR is set then this is an 'expand' postxor */ + if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_BIODRAIN
[PATCH 2.6.20-rc5 01/12] dmaengine: add base support for the async_tx api
From: Dan Williams <[EMAIL PROTECTED]> * introduce struct dma_async_tx_descriptor as a common field for all dmaengine software descriptors * convert the device_memcpy_* methods into separate prep, set src/dest, and submit stages * support capabilities beyond memcpy (xor, memset, xor zero sum, completion interrupts) * convert ioatdma to the new semantics Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/dmaengine.c | 44 ++-- drivers/dma/ioatdma.c | 256 ++-- drivers/dma/ioatdma.h |8 + include/linux/dmaengine.h | 263 ++--- 4 files changed, 394 insertions(+), 177 deletions(-) diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 1527804..8d203ad 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -210,7 +210,8 @@ static void dma_chans_rebalance(void) mutex_lock(&dma_list_mutex); list_for_each_entry(client, &dma_client_list, global_node) { - while (client->chans_desired > client->chan_count) { + while (client->chans_desired < 0 || + client->chans_desired > client->chan_count) { chan = dma_client_chan_alloc(client); if (!chan) break; @@ -219,7 +220,8 @@ static void dma_chans_rebalance(void) chan, DMA_RESOURCE_ADDED); } - while (client->chans_desired < client->chan_count) { + while (client->chans_desired >= 0 && + client->chans_desired < client->chan_count) { spin_lock_irqsave(&client->lock, flags); chan = list_entry(client->channels.next, struct dma_chan, @@ -294,12 +296,12 @@ void dma_async_client_unregister(struct dma_client *client) * @number: count of DMA channels requested * * Clients call dma_async_client_chan_request() to specify how many - * DMA channels they need, 0 to free all currently allocated. + * DMA channels they need, 0 to free all currently allocated. A request + * < 0 indicates the client wants to handle all engines in the system. * The resulting allocations/frees are indicated to the client via the * event callback. 
*/ -void dma_async_client_chan_request(struct dma_client *client, - unsigned int number) +void dma_async_client_chan_request(struct dma_client *client, int number) { client->chans_desired = number; dma_chans_rebalance(); @@ -318,6 +320,31 @@ int dma_async_device_register(struct dma_device *device) if (!device) return -ENODEV; + /* validate device routines */ + BUG_ON(test_bit(DMA_MEMCPY, &device->capabilities) && + !device->device_prep_dma_memcpy); + BUG_ON(test_bit(DMA_XOR, &device->capabilities) && + !device->device_prep_dma_xor); + BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) && + !device->device_prep_dma_zero_sum); + BUG_ON(test_bit(DMA_MEMSET, &device->capabilities) && + !device->device_prep_dma_memset); + BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) && + !device->device_prep_dma_interrupt); + + BUG_ON(!device->device_alloc_chan_resources); + BUG_ON(!device->device_free_chan_resources); + BUG_ON(!device->device_tx_submit); + BUG_ON(!device->device_set_dest); + BUG_ON(!device->device_set_src); + BUG_ON(!device->device_dependency_added); + BUG_ON(!device->device_is_tx_complete); + BUG_ON(!device->map_page); + BUG_ON(!device->map_single); + BUG_ON(!device->unmap_page); + BUG_ON(!device->unmap_single); + BUG_ON(!device->device_issue_pending); + init_completion(&device->done); kref_init(&device->refcount); device->dev_id = id++; @@ -402,11 +429,8 @@ subsys_initcall(dma_bus_init); EXPORT_SYMBOL(dma_async_client_register); EXPORT_SYMBOL(dma_async_client_unregister); EXPORT_SYMBOL(dma_async_client_chan_request); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf); -EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg); -EXPORT_SYMBOL(dma_async_memcpy_complete); -EXPORT_SYMBOL(dma_async_memcpy_issue_pending); +EXPORT_SYMBOL(dma_async_is_tx_complete); +EXPORT_SYMBOL(dma_async_issue_pending); EXPORT_SYMBOL(dma_async_device_register); EXPORT_SYMBOL(dma_async_device_unregister); EXPORT_SYMBOL(dma_chan_cleanup); diff --git 
a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c index 8e87261..70b
[PATCH 2.6.20-rc5 02/12] dmaengine: add the async_tx api
From: Dan Williams <[EMAIL PROTECTED]> async_tx is an api to describe a series of bulk memory transfers/transforms. When possible these transactions are carried out by asynchronous dma engines. The api handles inter-transaction dependencies and hides dma channel management from the client. When a dma engine is not present the transaction is carried out via synchronous software routines. Xor operations are handled by async_tx, to this end xor.c is moved into drivers/dma and is changed to take an explicit destination address and a series of sources to match the hardware engine implementation. When CONFIG_DMA_ENGINE is not set the asynchronous path is compiled away. Changelog: * fixed a leftover debug print * don't allow callbacks in async_interrupt_cond * fixed xor_block changes * fixed usage of ASYNC_TX_XOR_DROP_DEST Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/Makefile |1 drivers/dma/Kconfig | 16 + drivers/dma/Makefile |1 drivers/dma/async_tx.c | 910 ++ drivers/dma/xor.c| 153 drivers/md/Kconfig |2 drivers/md/Makefile |6 drivers/md/raid5.c | 52 +-- drivers/md/xor.c | 154 include/linux/async_tx.h | 180 + include/linux/raid/xor.h |5 11 files changed, 1291 insertions(+), 189 deletions(-) diff --git a/drivers/Makefile b/drivers/Makefile index 0dd96d1..7d55837 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -61,6 +61,7 @@ obj-$(CONFIG_I2C) += i2c/ obj-$(CONFIG_W1) += w1/ obj-$(CONFIG_HWMON)+= hwmon/ obj-$(CONFIG_PHONE)+= telephony/ +obj-$(CONFIG_ASYNC_TX_DMA) += dma/ obj-$(CONFIG_MD) += md/ obj-$(CONFIG_BT) += bluetooth/ obj-$(CONFIG_ISDN) += isdn/ diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 30d021d..c82ed5f 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -7,8 +7,8 @@ menu "DMA Engine support" config DMA_ENGINE bool "Support for DMA engines" ---help--- - DMA engines offload copy operations from the CPU to dedicated - hardware, allowing the copies to happen asynchronously. 
+ DMA engines offload bulk memory operations from the CPU to dedicated + hardware, allowing the operations to happen asynchronously. comment "DMA Clients" @@ -22,6 +22,17 @@ config NET_DMA Since this is the main user of the DMA engine, it should be enabled; say Y here. +config ASYNC_TX_DMA + tristate "Asynchronous Bulk Memory Transfers/Transforms API" + default y + ---help--- + This enables the async_tx management layer for dma engines. + Subsystems coded to this API will use offload engines for bulk + memory operations where present. Software implementations are + called when a dma engine is not present or fails to allocate + memory to carry out the transaction. + Current subsystems ported to async_tx: MD_RAID4,5 + comment "DMA Devices" config INTEL_IOATDMA @@ -30,5 +41,4 @@ config INTEL_IOATDMA default m ---help--- Enable support for the Intel(R) I/OAT DMA engine. - endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index bdcfdbd..6a99341 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c new file mode 100644 index 000..eee208d --- /dev/null +++ b/drivers/dma/async_tx.c @@ -0,0 +1,910 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ +#include +#include +#include +#include +#include + +#define ASYNC_TX_DEBUG 0 +#define PRINTK(x...) ((void)(ASYNC_TX_DEBUG && printk(x))) + +#ifdef CONFIG_DMA_ENGINE +static struct dma_client *a
[PATCH 2.6.20-rc5 06/12] md: move raid5 compute block operations to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_COMPUTE_BLK to request servicing from raid5_run_ops. It also sets a flag for the block being computed to let other parts of handle_stripe submit dependent operations. raid5_run_ops guarantees that the compute operation completes before any dependent operation starts. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 125 +++- 1 files changed, 93 insertions(+), 32 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2390657..279a30c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh) int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int non_overwrite = 0; + int compute=0, req_compute=0, non_overwrite=0; int failed_num=0; struct r5dev *dev; unsigned long pending=0; @@ -2032,8 +2032,8 @@ static void handle_stripe5(struct stripe_head *sh) /* now count some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 1); - if (dev->toread) to_read++; if (dev->towrite) { to_write++; @@ -2188,31 +2188,82 @@ static void handle_stripe5(struct stripe_head *sh) * parity, or to satisfy requests * or to load a block that is being partially written. 
*/ - if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) { - for (i=disks; i--;) { - dev = &sh->dev[i]; - if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) && - (dev->toread || -(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || -syncing || -expanding || -(failed && (sh->dev[failed_num].toread || -(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags - ) - ) { - /* we would like to get this block, possibly -* by computing it, but we might not be able to + if (to_read || non_overwrite || (syncing && (uptodate + compute < disks)) || expanding || + test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) { + + /* Clear completed compute operations. Parity recovery +* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is handled +* later on in this routine +*/ + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && + !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); + } + + /* look for blocks to read/compute, skip this if a compute +* is already in flight, or if the stripe contents are in the +* midst of changing due to a write +*/ + if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + for (i=disks; i--;) { + dev = &sh->dev[i]; + + /* don't schedule compute operations or reads on +* the parity block while a check is in flight */ - if (uptodate == disks-1) { - PRINTK("Computing block %d\n", i); - compute_block(sh, i); - uptodate++; - } else if (test_bit(R5_Insync, &dev->flags)) { - set_bit(R5_LOCKED, &dev->flags); - set_bit(R5_Wantread, &dev->flags); - locked++; -
[PATCH 2.6.20-rc5 03/12] md: add raid5_run_ops and support routines
From: Dan Williams <[EMAIL PROTECTED]> Prepare the raid5 implementation to use async_tx for running stripe operations: * biofill (copy data into request buffers to satisfy a read request) * compute block (generate a missing block in the cache from the other blocks) * prexor (subtract existing data as part of the read-modify-write process) * biodrain (copy data out of request buffers to satisfy a write request) * postxor (recalculate parity for new data that has entered the cache) * check (verify that the parity is correct) * io (submit i/o to the member disks) Changelog: * removed ops_complete_biodrain in favor of ops_complete_postxor and ops_complete_write. * removed the workqueue * call bi_end_io for reads in ops_complete_biofill Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 520 include/linux/raid/raid5.h | 63 + 2 files changed, 580 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 68b6fea..e70ee17 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -52,6 +52,7 @@ #include "raid6.h" #include +#include /* * Stripe cache @@ -324,6 +325,525 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } +static int +raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error); +static int +raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error); + +static void ops_run_io(struct stripe_head *sh) +{ + raid5_conf_t *conf = sh->raid_conf; + int i, disks = sh->disks; + + might_sleep(); + + for (i=disks; i-- ;) { + int rw; + struct bio *bi; + mdk_rdev_t *rdev; + if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) + rw = WRITE; + else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) + rw = READ; + else + continue; + + bi = &sh->dev[i].req; + + bi->bi_rw = rw; + if (rw == WRITE) + bi->bi_end_io = raid5_end_write_request; + else + bi->bi_end_io = raid5_end_read_request; + + rcu_read_lock(); + rdev = 
rcu_dereference(conf->disks[i].rdev); + if (rdev && test_bit(Faulty, &rdev->flags)) + rdev = NULL; + if (rdev) + atomic_inc(&rdev->nr_pending); + rcu_read_unlock(); + + if (rdev) { + if (test_bit(STRIPE_SYNCING, &sh->state) || + test_bit(STRIPE_EXPAND_SOURCE, &sh->state) || + test_bit(STRIPE_EXPAND_READY, &sh->state)) + md_sync_acct(rdev->bdev, STRIPE_SECTORS); + + bi->bi_bdev = rdev->bdev; + PRINTK("%s: for %llu schedule op %ld on disc %d\n", + __FUNCTION__, (unsigned long long)sh->sector, + bi->bi_rw, i); + atomic_inc(&sh->count); + bi->bi_sector = sh->sector + rdev->data_offset; + bi->bi_flags = 1 << BIO_UPTODATE; + bi->bi_vcnt = 1; + bi->bi_max_vecs = 1; + bi->bi_idx = 0; + bi->bi_io_vec = &sh->dev[i].vec; + bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_offset = 0; + bi->bi_size = STRIPE_SIZE; + bi->bi_next = NULL; + if (rw == WRITE && + test_bit(R5_ReWrite, &sh->dev[i].flags)) + atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + generic_make_request(bi); + } else { + if (rw == WRITE) + set_bit(STRIPE_DEGRADED, &sh->state); + PRINTK("skip op %ld on disc %d for sector %llu\n", + bi->bi_rw, i, (unsigned long long)sh->sector); + clear_bit(R5_LOCKED, &sh->dev[i].flags); + set_bit(STRIPE_HANDLE, &sh->state); + } + } +} + +static struct dma_async_tx_descriptor * +async_copy_data(int frombio, struct bio *bio, struct page *page, sector_t sector, + struct dma_async_tx_descriptor *tx) +{ + struct bio_vec *bvl; + struct page *bio_page; + int i; + int page_
[PATCH 2.6.20-rc5 08/12] md: satisfy raid5 read requests via raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> Use raid5_run_ops to carry out the memory copies for a raid5 read request. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 40 +++- 1 files changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2422253..db8925f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh) int i; int syncing, expanding, expanded; int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0; - int compute=0, req_compute=0, non_overwrite=0; + int to_fill=0, compute=0, req_compute=0, non_overwrite=0; int failed_num=0; struct r5dev *dev; unsigned long pending=0; @@ -2004,34 +2004,20 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[i]; clear_bit(R5_Insync, &dev->flags); - PRINTK("check %d: state 0x%lx read %p write %p written %p\n", - i, dev->flags, dev->toread, dev->towrite, dev->written); - /* maybe we can reply to a read */ + PRINTK("check %d: state 0x%lx toread %p read %p write %p written %p\n", + i, dev->flags, dev->toread, dev->read, dev->towrite, dev->written); + + /* maybe we can start a biofill operation */ if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) { - struct bio *rbi, *rbi2; - PRINTK("Return read for disc %d\n", i); - spin_lock_irq(&conf->device_lock); - rbi = dev->toread; - dev->toread = NULL; - if (test_and_clear_bit(R5_Overlap, &dev->flags)) - wake_up(&conf->wait_for_overlap); - spin_unlock_irq(&conf->device_lock); - while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) { - copy_data(0, rbi, dev->page, dev->sector); - rbi2 = r5_next_bio(rbi, dev->sector); - spin_lock_irq(&conf->device_lock); - if (--rbi->bi_phys_segments == 0) { - rbi->bi_next = return_bi; - return_bi = rbi; - } - spin_unlock_irq(&conf->device_lock); - rbi = rbi2; - } + to_read--; + if (!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + set_bit(R5_Wantfill, &dev->flags); } /* now count 
some things */ if (test_bit(R5_LOCKED, &dev->flags)) locked++; if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++; + if (test_bit(R5_Wantfill, &dev->flags)) to_fill++; if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 1); if (dev->toread) to_read++; @@ -2055,9 +2041,13 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(R5_Insync, &dev->flags); } rcu_read_unlock(); + + if (to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) + sh->ops.count++; + PRINTK("locked=%d uptodate=%d to_read=%d" - " to_write=%d failed=%d failed_num=%d\n", - locked, uptodate, to_read, to_write, failed, failed_num); + " to_write=%d to_fill=%d failed=%d failed_num=%d\n", + locked, uptodate, to_read, to_write, to_fill, failed, failed_num); /* check if the array has lost two devices and, if so, some requests might * need to be failed */ - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2.6.20-rc5 10/12] md: move raid5 io requests to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe now only updates the state of stripes. All execution of operations is moved to raid5_run_ops. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 68 1 files changed, 10 insertions(+), 58 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 1956b3c..8af084f 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2360,6 +2360,8 @@ static void handle_stripe5(struct stripe_head *sh) PRINTK("Read_old block %d for r-m-w\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2380,6 +2382,8 @@ static void handle_stripe5(struct stripe_head *sh) PRINTK("Read_old block %d for Reconstruct\n", i); set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; locked++; } else { set_bit(STRIPE_DELAYED, &sh->state); @@ -2479,6 +2483,8 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; clear_bit(STRIPE_DEGRADED, &sh->state); locked++; set_bit(STRIPE_INSYNC, &sh->state); @@ -2500,12 +2506,16 @@ static void handle_stripe5(struct stripe_head *sh) dev = &sh->dev[failed_num]; if (!test_bit(R5_ReWrite, &dev->flags)) { set_bit(R5_Wantwrite, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_ReWrite, &dev->flags); set_bit(R5_LOCKED, &dev->flags); locked++; } else { /* let's read it back */ set_bit(R5_Wantread, &dev->flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; set_bit(R5_LOCKED, &dev->flags); locked++; } @@ -2615,64 +2625,6 @@ static void handle_stripe5(struct stripe_head *sh) test_bit(BIO_UPTODATE, &bi->bi_flags) ? 
0 : -EIO); } - for (i=disks; i-- ;) { - int rw; - struct bio *bi; - mdk_rdev_t *rdev; - if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) - rw = WRITE; - else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) - rw = READ; - else - continue; - - bi = &sh->dev[i].req; - - bi->bi_rw = rw; - if (rw == WRITE) - bi->bi_end_io = raid5_end_write_request; - else - bi->bi_end_io = raid5_end_read_request; - - rcu_read_lock(); - rdev = rcu_dereference(conf->disks[i].rdev); - if (rdev && test_bit(Faulty, &rdev->flags)) - rdev = NULL; - if (rdev) - atomic_inc(&rdev->nr_pending); - rcu_read_unlock(); - - if (rdev) { - if (syncing || expanding || expanded) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); - - bi->bi_bdev = rdev->bdev; - PRINTK("for %llu schedule op %ld on disc %d\n", -
[PATCH 2.6.20-rc5 09/12] md: use async_tx and raid5_run_ops for raid5 expansion operations
From: Dan Williams <[EMAIL PROTECTED]> The parity calculation for an expansion operation is the same as the calculation performed at the end of a write with the caveat that all blocks in the stripe are scheduled to be written. An expansion operation is identified as a stripe with the POSTXOR flag set and the BIODRAIN flag not set. The bulk copy operation to the new stripe is handled inline by async_tx. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 48 1 files changed, 36 insertions(+), 12 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index db8925f..1956b3c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2511,18 +2511,32 @@ static void handle_stripe5(struct stripe_head *sh) } } - if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) { - /* Need to write out all blocks after computing parity */ - sh->disks = conf->raid_disks; - sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); - compute_parity5(sh, RECONSTRUCT_WRITE); + /* Finish postxor operations initiated by the expansion +* process +*/ + if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) && + !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) { + + clear_bit(STRIPE_EXPANDING, &sh->state); + + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete); + for (i= conf->raid_disks; i--;) { - set_bit(R5_LOCKED, &sh->dev[i].flags); - locked++; set_bit(R5_Wantwrite, &sh->dev[i].flags); + if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending)) + sh->ops.count++; } - clear_bit(STRIPE_EXPANDING, &sh->state); - } else if (expanded) { + } + + if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) && + !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { + /* Need to write out all blocks after computing parity */ + sh->disks = conf->raid_disks; + sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks); + locked += handle_write_operations5(sh, 0, 1); + } else if 
(expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) { clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); @@ -2533,6 +2547,7 @@ static void handle_stripe5(struct stripe_head *sh) /* We have read all the blocks in this stripe and now we need to * copy some of them into a target stripe for expand. */ + struct dma_async_tx_descriptor *tx = NULL; clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); for (i=0; i< sh->disks; i++) if (i != sh->pd_idx) { @@ -2556,9 +2571,12 @@ static void handle_stripe5(struct stripe_head *sh) release_stripe(sh2); continue; } - memcpy(page_address(sh2->dev[dd_idx].page), - page_address(sh->dev[i].page), - STRIPE_SIZE); + + /* place all the copies on one channel */ + tx = async_memcpy(sh2->dev[dd_idx].page, + sh->dev[i].page, 0, 0, STRIPE_SIZE, + ASYNC_TX_DEP_ACK, tx, NULL, NULL); + set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); for (j=0; jraid_disks; j++) @@ -2570,6 +2588,12 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(STRIPE_HANDLE, &sh2->state); } release_stripe(sh2); + + /* done submitting copies, wait for them to complete */ + if (i + 1 >= sh->disks) { + async_tx_ack(tx); + dma_wait_for_async_tx(tx); + } } } - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2.6.20-rc5 12/12] dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines
From: Dan Williams <[EMAIL PROTECTED]> This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor, pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy operations. Changelog: * fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few slots to be requested eventually leading to data corruption * enabled the slot allocation routine to attempt to free slots before returning -ENOMEM * switched the cleanup routine to solely use the software chain and the status register to determine if a descriptor is complete. This is necessary to support other IOP engines that do not have status writeback capability * make the driver iop generic * modified the allocation routines to understand allocating a group of slots for a single operation * added a null xor initialization operation for the xor only channel on iop3xx * support xor operations on buffers larger than the hardware maximum * split the do_* routines into separate prep, src/dest set, submit stages * added async_tx support (dependent operations initiation at cleanup time) * simplified group handling * added interrupt support (callbacks via tasklets) * brought the pending depth inline with ioat (i.e. 4 descriptors) Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/dma/Kconfig |8 drivers/dma/Makefile|1 drivers/dma/iop-adma.c | 1511 +++ include/asm-arm/hardware/iop_adma.h | 116 +++ 4 files changed, 1636 insertions(+), 0 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index c82ed5f..d61e3e5 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -41,4 +41,12 @@ config INTEL_IOATDMA default m ---help--- Enable support for the Intel(R) I/OAT DMA engine. + +config INTEL_IOP_ADMA +tristate "Intel IOP ADMA support" +depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX) +default m +---help--- + Enable support for the Intel(R) IOP Series RAID engines. 
+ endmenu diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 6a99341..8ebf10d 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o +obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c new file mode 100644 index 000..77f859e --- /dev/null +++ b/drivers/dma/iop-adma.c @@ -0,0 +1,1511 @@ +/* + * Copyright(c) 2006 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. 
+ */ + +/* + * This driver supports the asynchrounous DMA copy and RAID engines available + * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common) +#define to_iop_adma_device(dev) container_of(dev, struct iop_adma_device, common) +#define to_iop_adma_slot(lh) container_of(lh, struct iop_adma_desc_slot, slot_node) +#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, async_tx) + +#define IOP_ADMA_DEBUG 0 +#define PRINTK(x...) ((void)(IOP_ADMA_DEBUG && printk(x))) + +/** + * iop_adma_free_slots - flags descriptor slots for reuse + * @slot: Slot to free + * Caller must hold &iop_chan->lock while calling this function + */ +static inline void iop_adma_free_slots(struct iop_adma_desc_slot *slot) +{ + int stride = slot->stride; + + while (stride--) { + slot->stride = 0; + slot = list_entry(slot->slot_node.next, + struct iop_adma_desc_slot, + slot_node); + } +} + +static inline dma_cookie_t +iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, + struct iop_adma_chan *iop_chan, dma_cookie_t cookie) +{ + BUG_ON(
[PATCH 2.6.20-rc5 11/12] md: remove raid5 compute_block and compute_parity5
From: Dan Williams <[EMAIL PROTECTED]> replaced by raid5_run_ops Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 124 1 files changed, 0 insertions(+), 124 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8af084f..a981c35 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1480,130 +1480,6 @@ static void copy_data(int frombio, struct bio *bio, } \ } while(0) - -static void compute_block(struct stripe_head *sh, int dd_idx) -{ - int i, count, disks = sh->disks; - void *ptr[MAX_XOR_BLOCKS], *dest, *p; - - PRINTK("compute_block, stripe %llu, idx %d\n", - (unsigned long long)sh->sector, dd_idx); - - dest = page_address(sh->dev[dd_idx].page); - memset(dest, 0, STRIPE_SIZE); - count = 0; - for (i = disks ; i--; ) { - if (i == dd_idx) - continue; - p = page_address(sh->dev[i].page); - if (test_bit(R5_UPTODATE, &sh->dev[i].flags)) - ptr[count++] = p; - else - printk(KERN_ERR "compute_block() %d, stripe %llu, %d" - " not present\n", dd_idx, - (unsigned long long)sh->sector, i); - - check_xor(); - } - if (count) - xor_block(count, STRIPE_SIZE, dest, ptr); - set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags); -} - -static void compute_parity5(struct stripe_head *sh, int method) -{ - raid5_conf_t *conf = sh->raid_conf; - int i, pd_idx = sh->pd_idx, disks = sh->disks, count; - void *ptr[MAX_XOR_BLOCKS], *dest; - struct bio *chosen; - - PRINTK("compute_parity5, stripe %llu, method %d\n", - (unsigned long long)sh->sector, method); - - count = 0; - dest = page_address(sh->dev[pd_idx].page); - switch(method) { - case READ_MODIFY_WRITE: - BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags)); - for (i=disks ; i-- ;) { - if (i==pd_idx) - continue; - if (sh->dev[i].towrite && - test_bit(R5_UPTODATE, &sh->dev[i].flags)) { - ptr[count++] = page_address(sh->dev[i].page); - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - 
BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - check_xor(); - } - } - break; - case RECONSTRUCT_WRITE: - memset(dest, 0, STRIPE_SIZE); - for (i= disks; i-- ;) - if (i!=pd_idx && sh->dev[i].towrite) { - chosen = sh->dev[i].towrite; - sh->dev[i].towrite = NULL; - - if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) - wake_up(&conf->wait_for_overlap); - - BUG_ON(sh->dev[i].written); - sh->dev[i].written = chosen; - } - break; - case CHECK_PARITY: - break; - } - if (count) { - xor_block(count, STRIPE_SIZE, dest, ptr); - count = 0; - } - - for (i = disks; i--;) - if (sh->dev[i].written) { - sector_t sector = sh->dev[i].sector; - struct bio *wbi = sh->dev[i].written; - while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) { - copy_data(1, wbi, sh->dev[i].page, sector); - wbi = r5_next_bio(wbi, sector); - } - - set_bit(R5_LOCKED, &sh->dev[i].flags); - set_bit(R5_UPTODATE, &sh->dev[i].flags); - } - - switch(method) { - case RECONSTRUCT_WRITE: - case CHECK_PARITY: - for (i=disks; i--;) - if (i != pd_idx) { - ptr[count++] = page_address(sh->dev[i].page); - check_xor(); -
[PATCH 2.6.20-rc5 04/12] md: use raid5_run_ops for stripe cache operations
From: Dan Williams <[EMAIL PROTECTED]> Each stripe has three flag variables to reflect the state of operations (pending, ack, and complete). -pending: set to request servicing in raid5_run_ops -ack: set to reflect that raid5_runs_ops has seen this request -complete: set when the operation is complete and it is ok for handle_stripe5 to clear 'pending' and 'ack'. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 65 +--- 1 files changed, 56 insertions(+), 9 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e70ee17..2c74f9b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -126,6 +126,7 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) } md_wakeup_thread(conf->mddev->thread); } else { + BUG_ON(sh->ops.pending); if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { atomic_dec(&conf->preread_active_stripes); if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) @@ -225,7 +226,8 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int BUG_ON(atomic_read(&sh->count) != 0); BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); - + BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete); + CHECK_DEVLOCK(); PRINTK("init_stripe called, stripe %llu\n", (unsigned long long)sh->sector); @@ -241,11 +243,11 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int pd_idx, int for (i = sh->disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->toread || dev->towrite || dev->written || + if (dev->toread || dev->read || dev->towrite || dev->written || test_bit(R5_LOCKED, &dev->flags)) { - printk("sector=%llx i=%d %p %p %p %d\n", + printk("sector=%llx i=%d %p %p %p %p %d\n", (unsigned long long)sh->sector, i, dev->toread, - dev->towrite, dev->written, + dev->read, dev->towrite, dev->written, test_bit(R5_LOCKED, &dev->flags)); BUG(); } @@ -325,6 +327,43 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } +/* 
check_op() ensures that we only dequeue an operation once */ +#define check_op(op) do {\ + if (test_bit(op, &sh->ops.pending) &&\ + !test_bit(op, &sh->ops.complete)) {\ + if (test_and_set_bit(op, &sh->ops.ack))\ + clear_bit(op, &pending);\ + else\ + ack++;\ + } else\ + clear_bit(op, &pending);\ +} while(0) + +/* find new work to run, do not resubmit work that is already + * in flight + */ +static unsigned long get_stripe_work(struct stripe_head *sh) +{ + unsigned long pending; + int ack = 0; + + pending = sh->ops.pending; + + check_op(STRIPE_OP_BIOFILL); + check_op(STRIPE_OP_COMPUTE_BLK); + check_op(STRIPE_OP_PREXOR); + check_op(STRIPE_OP_BIODRAIN); + check_op(STRIPE_OP_POSTXOR); + check_op(STRIPE_OP_CHECK); + if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending)) + ack++; + + sh->ops.count -= ack; + BUG_ON(sh->ops.count < 0); + + return pending; +} + static int raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error); static int @@ -1859,7 +1898,6 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t *conf, int disks) *schedule a write of some buffers *return confirmation of parity correctness * - * Parity calculations are done inside the stripe lock * buffers are taken off read_list or write_list, and bh_cache buffers * get BH_Lock set before the stripe lock is released. * @@ -1877,10 +1915,11 @@ static void handle_stripe5(struct stripe_head *sh) int non_overwrite = 0; int failed_num=0; struct r5dev *dev; + unsigned long pending=0; - PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n", - (unsigned long long)sh->sector, atomic_read(&sh->count), - sh->pd_idx); + PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d ops=%lx:%lx:%lx\n", + (unsigned long long)sh->sector, sh->state, atomic_read(&sh->count), + sh->pd_id
[PATCH 2.6.20-rc5 05/12] md: move write operations to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_PREXOR, STRIPE_OP_BIODRAIN, STRIPE_OP_POSTXOR to request a write to the stripe cache. raid5_run_ops is triggerred to run and executes the request outside the stripe lock. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 152 +--- 1 files changed, 131 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2c74f9b..2390657 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -1788,7 +1788,75 @@ static void compute_block_2(struct stripe_head *sh, int dd_idx1, int dd_idx2) } } +static int handle_write_operations5(struct stripe_head *sh, int rcw, int expand) +{ + int i, pd_idx = sh->pd_idx, disks = sh->disks; + int locked=0; + + if (rcw == 0) { + /* skip the drain operation on an expand */ + if (!expand) { + BUG_ON(test_and_set_bit(STRIPE_OP_BIODRAIN, + &sh->ops.pending)); + sh->ops.count++; + } + + BUG_ON(test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)); + sh->ops.count++; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + + if (dev->towrite) { + set_bit(R5_LOCKED, &dev->flags); + if (!expand) + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } else { + BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || + test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); + + BUG_ON(test_and_set_bit(STRIPE_OP_PREXOR, &sh->ops.pending) || + test_and_set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) || + test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)); + + sh->ops.count += 3; + + for (i=disks ; i-- ;) { + struct r5dev *dev = &sh->dev[i]; + if (i==pd_idx) + continue; + /* For a read-modify write there may be blocks that are +* locked for reading while others are ready to be written +* so we distinguish these blocks by the R5_Wantprexor bit +*/ + if (dev->towrite && + (test_bit(R5_UPTODATE, &dev->flags) || + test_bit(R5_Wantcompute, &dev->flags))) { + set_bit(R5_Wantprexor, &dev->flags); + set_bit(R5_LOCKED, 
&dev->flags); + clear_bit(R5_UPTODATE, &dev->flags); + locked++; + } + } + } + + /* keep the parity disk locked while asynchronous operations +* are in flight +*/ + set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); + clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); + locked++; + + PRINTK("%s: stripe %llu locked: %d pending: %lx\n", + __FUNCTION__, (unsigned long long)sh->sector, + locked, sh->ops.pending); + + return locked; +} /* * Each stripe/dev can have one or more bion attached. @@ -2151,8 +2219,67 @@ static void handle_stripe5(struct stripe_head *sh) set_bit(STRIPE_HANDLE, &sh->state); } - /* now to consider writing and what else, if anything should be read */ - if (to_write) { + /* Now we check to see if any write operations have recently +* completed +*/ + + /* leave prexor set until postxor is done, allows us to distinguish +* a rmw from a rcw during biodrain +*/ + if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack); + clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending); + + for (i=disks; i--;) + clear_bit(R5_Wantprexor, &sh->dev[i].flags); + } + + /* if only POSTXOR is set then this is an 'expand' postxor */ + if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) && + test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) { + + clear_bit(STRIPE_OP_BIODRAIN
[PATCH 2.6.20-rc5 07/12] md: move raid5 parity checks to raid5_run_ops
From: Dan Williams <[EMAIL PROTECTED]> handle_stripe sets STRIPE_OP_CHECK to request a check operation in raid5_run_ops. If raid5_run_ops is able to perform the check with a dma engine the parity will be preserved in memory removing the need to re-read it from disk, as is necessary in the synchronous case. 'Repair' operations re-use the same logic as compute block, with the caveat that the results of the compute block are immediately written back to the parity disk. To differentiate these operations the STRIPE_OP_MOD_REPAIR_PD flag is added. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 81 1 files changed, 62 insertions(+), 19 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 279a30c..2422253 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2411,32 +2411,75 @@ static void handle_stripe5(struct stripe_head *sh) locked += handle_write_operations5(sh, rcw, 0); } - /* maybe we need to check and possibly fix the parity for this stripe -* Any reads will already have been scheduled, so we just see if enough data -* is available + /* 1/ Maybe we need to check and possibly fix the parity for this stripe. +*Any reads will already have been scheduled, so we just see if enough data +*is available. 
+* 2/ Hold off parity checks while parity dependent operations are in flight +*(conflicting writes are protected by the 'locked' variable) */ - if (syncing && locked == 0 && - !test_bit(STRIPE_INSYNC, &sh->state)) { + if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) && + !test_bit(STRIPE_INSYNC, &sh->state)) || + test_bit(STRIPE_OP_CHECK, &sh->ops.pending) || + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + set_bit(STRIPE_HANDLE, &sh->state); - if (failed == 0) { - BUG_ON(uptodate != disks); - compute_parity5(sh, CHECK_PARITY); - uptodate--; - if (page_is_zero(sh->dev[sh->pd_idx].page)) { - /* parity is correct (on disc, not in buffer any more) */ - set_bit(STRIPE_INSYNC, &sh->state); - } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; - if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) - /* don't try to repair!! */ + /* Take one of the following actions: +* 1/ start a check parity operation if (uptodate == disks) +* 2/ finish a check parity operation and act on the result +* 3/ skip to the writeback section if we previously +*initiated a recovery operation +*/ + if (failed == 0 && !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { + if (!test_and_set_bit(STRIPE_OP_CHECK, &sh->ops.pending)) { + BUG_ON(uptodate != disks); + clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); + sh->ops.count++; + uptodate--; + } else if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); + + if (sh->ops.zero_sum_result == 0) + /* parity is correct (on disc, not in buffer any more) */ set_bit(STRIPE_INSYNC, &sh->state); else { - compute_block(sh, sh->pd_idx); - uptodate++; + conf->mddev->resync_mismatches += STRIPE_SECTORS; + if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) + /* don't try to repair!! */ + set_bit(STRIPE_INSYNC, &sh->state); + else { + BUG_ON(test_and_set_bit( + STRIPE_OP_COMPUTE_BLK, + &sh->ops.pe
Re: [RFC][PATCH 00/12] md raid acceleration and performance analysis
On 2/6/07, Leech, Christopher <[EMAIL PROTECTED]> wrote: Hi Dan, I've been looking over how your patches change the ioatdma driver. I like the idea of removing the multiple entry points for virtual address vs. page struct arguments, and just using dma_addr_t for the driver interfaces. But, I don't think having both ioatdma and iop-adma implement map_page, map_single, unmap_page, and unmap_single entry points is much better. Do you see a reason why it wouldn't work to expose the generic device for a DMA channel, and replace instances of dma_device->map_single(dma_chan, src, len, DMA_TO_DEVICE) with dma_map_single(dma_device->dev, src, len, DMA_TO_DEVICE) I was initially concerned about a case where dma_map_single was not equivalent to pci_map_single. Looking now, it appears that case would be a bug, so I will integrate this change. I am a little concerned about having the DMA mapping happen outside of the driver, but the unmapping is still in the driver cleanup routine. I'm not sure if it's really a problem, or how I'd change it though. - Chris Thanks, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.20: stripe_cache_size goes boom with 32mb
On 2/23/07, Justin Piszcz <[EMAIL PROTECTED]> wrote: I have 2GB On this machine. For me, 8192 seems to be the sweet spot, I will probably keep it at 8mb. Just a note stripe_cache_size = 8192 = 192MB with six disks. The calculation is: stripe_cache_size * num_disks * PAGE_SIZE = stripe_cache_size_bytes -- Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: DMRAID feature direction?
On 2/27/07, Gaston, Jason D <[EMAIL PROTECTED]> wrote: Hello, Can someone point me to where I can search a linux-raid mailing list archive? I use: http://marc.theaimsgroup.com/?l=linux-raid&r=1&w=2 I am looking for information about where things are going with DMRAID features and any discussion on where things stand in regards to the possibility of merging MD and DMRAID. My guess, from what I have found in google, is that this was a heated discussion. Mainly I am looking at what direction people think is correct for getting more functionality to support "fakeraid" volumes. Also interesting is the discussion surrounding the proposed EMD solution. I appreciate any information! Thanks, Jason - Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC, PATCH] raid456: replace the handle_list with a multi-threaded workqueue
Currently raid456 queues up work to a single raid5d thread per array. Since there are no dependencies between operations on different stripes I believed a speed up could be obtained by spreading the handle_stripe load across all available CPU's. However I am not seeing a speed up, as measured by tiobench. I think the reason is that multi-processor effects will only show up when data is already in the cache. In this case the work is already spread out per client thread. Also work submitted to workqueues is sticky to the CPU where queue_work() was called, not load balanced amongst the available CPUs. I'm posting it anyway to see if I am overlooking a case where it would be helpful, and from a cosmetic standpoint it separates raid5d housekeeping work from handle_stripe work. Signed-off-by: Dan Williams <[EMAIL PROTECTED]> --- drivers/md/raid5.c | 108 ++-- include/linux/raid/raid5.h |6 ++ 2 files changed, 68 insertions(+), 46 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 11c3d7b..e54310c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -121,7 +121,10 @@ static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) blk_plug_device(conf->mddev->queue); } else { clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); + conf->workqueue_stripes++; + atomic_inc(&sh->count); + BUG_ON(queue_work(conf->workqueue, + &sh->work) == 0); } md_wakeup_thread(conf->mddev->thread); } else { @@ -310,6 +313,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); if (list_empty(&sh->lru) && + !work_pending(&sh->work) && !test_bit(STRIPE_EXPANDING, &sh->state)) BUG(); list_del_init(&sh->lru); @@ -324,6 +328,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t *conf, sector_t sector return sh; } +static void raid456_workqueue(struct work_struct *work); static int grow_one_stripe(raid5_conf_t *conf) 
{ struct stripe_head *sh; @@ -343,6 +348,7 @@ static int grow_one_stripe(raid5_conf_t *conf) /* we just created an active stripe so... */ atomic_set(&sh->count, 1); atomic_inc(&conf->active_stripes); + INIT_WORK(&sh->work, raid456_workqueue); INIT_LIST_HEAD(&sh->lru); release_stripe(sh); return 1; @@ -2448,7 +2454,9 @@ static void raid5_activate_delayed(raid5_conf_t *conf) clear_bit(STRIPE_DELAYED, &sh->state); if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - list_add_tail(&sh->lru, &conf->handle_list); + conf->workqueue_stripes++; + atomic_inc(&sh->count); + BUG_ON(queue_work(conf->workqueue, &sh->work) == 0); } } } @@ -3181,7 +3189,6 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) } - /* * This is our raid5 kernel thread. * @@ -3191,9 +3198,9 @@ static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) */ static void raid5d (mddev_t *mddev) { - struct stripe_head *sh; raid5_conf_t *conf = mddev_to_conf(mddev); int handled; + struct bio *bio; PRINTK("+++ raid5d active\n"); @@ -3201,51 +3208,30 @@ static void raid5d (mddev_t *mddev) handled = 0; spin_lock_irq(&conf->device_lock); - while (1) { - struct list_head *first; - struct bio *bio; - - if (conf->seq_flush != conf->seq_write) { - int seq = conf->seq_flush; - spin_unlock_irq(&conf->device_lock); - bitmap_unplug(mddev->bitmap); - spin_lock_irq(&conf->device_lock); - conf->seq_write = seq; - activate_bit_delay(conf); - } - - if (list_empty(&conf->handle_list) && -
Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.
On 3/15/07, Paul Mackerras <[EMAIL PROTECTED]> wrote: Wolfgang Denk writes: > This patch is based on and requires a set of patches posted to the > linux-raid mailing list by Dan Williams on 2007-01-23: Those patches don't seem to be upstream in Linus' tree. Are they in -mm, or is anyone pushing for them to be? They are in -mm (git-md-accel.patch). I'll review this driver and integrate it into my next push to Andrew, along with some further cleanups. Paul. Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.
On 3/16/07, Wolfgang Denk <[EMAIL PROTECTED]> wrote: In message <[EMAIL PROTECTED]> you wrote: > > They are in -mm (git-md-accel.patch). I'll review this driver and and > integrate it into my next push to Andrew, along with some further > cleanups. Thanks. We're doing some cleanup now based on the feedback we receive. What is easier for you to handle - a complete new patch, or an incremental one on top of what we submitted now? (I'd prefer incremental, but will do whatever works better for you). I can handle incremental, but I will probably fold everything together in the patch that goes to -mm. Check out Stacked GIT (http://www.procode.org/stgit/) when you get a chance, it handles this situation well. Best regards, Wolfgang Denk Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.
On 3/16/07, Benjamin Herrenschmidt <[EMAIL PROTECTED]> wrote: > + PRINTK("\tfree slot %x: %d stride: %d\n", desc->phys, desc->idx, desc->stride); Why don't you use the kernel's existing debugging facilities, like pr_debug, or dev_dbg if you have a proper struct device (which you should have with an arch/powerpc port hopefully using of_platform_device). This came from the iop-adma driver. I blindly copied it from drivers/md/raid5.c, but yes it should change to dev_dbg. > + spin_lock_bh(&spe_chan->lock); > + /* Allocate descriptor slots */ > + i = spe_chan->slots_allocated; > + if (spe_chan->device->id != PPC440SPE_XOR_ID) > + db_sz = sizeof (dma_cdb_t); > + else > + db_sz = sizeof (xor_cb_t); > + > + for (; i < (plat_data->pool_size/db_sz); i++) { > + slot = kzalloc(sizeof(struct spe_adma_desc_slot), GFP_KERNEL); GFP_KERNEL within spin_lock_bh is no good... This is an iop-adma wart... will fix. > diff --git a/include/asm-ppc/adma.h b/include/asm-ppc/adma.h > new file mode 100644 > index 000..0be88f1 > --- /dev/null > +++ b/include/asm-ppc/adma.h There's way too much code in this .h file, too big inline functions. It should mostly be moved to a .c file The iop-adma driver uses separate .h files because the driver is shared between iop3xx and iop13xx implementations and I did not want the overhead of another indirect-branch layer. In this case the hardware specific routines can be written inline since the driver is only supporting one architecture... other suggestions? Cheers, Ben. Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.
Here are some additional comments/nits: +/* + * Init DMA0/1 and XOR engines; allocate memory for DMAx FIFOs; set platform_device + * memory resources addresses + */ +static void ppc440spe_configure_raid_devices(void) Any reason not to move most of this function into spe_adma_probe? The "set resource address" section is the only piece that spe_adma_probe should not handle. +++ b/drivers/dma/spe-adma.c @@ -0,0 +1,1071 @@ +/* + * Copyright(c) 2006 DENX Engineering. All rights reserved. + * + * Author: Yuri Tikhonov <[EMAIL PROTECTED]> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This driver supports the asynchrounous DMA copy and RAID engines available + * on the AMCC PPC440SPe Processors. + * Based on the Intel Xscale(R) family of I/O Processors (SPE 32x, 33x, 134x) SPE should be IOP on this line. ../.. 
+static inline void +spe_adma_slot_cleanup(struct spe_adma_chan *spe_chan) +{ + spin_lock_bh(&spe_chan->lock); + __spe_adma_slot_cleanup(spe_chan); + spin_unlock_bh(&spe_chan->lock); +} + +static struct spe_adma_chan *spe_adma_chan_array[3]; +static void spe_adma0_task(unsigned long data) +{ + __spe_adma_slot_cleanup(spe_adma_chan_array[0]); +} + +static void spe_adma1_task(unsigned long data) +{ + __spe_adma_slot_cleanup(spe_adma_chan_array[1]); +} + +static void spe_adma2_task(unsigned long data) +{ + __spe_adma_slot_cleanup(spe_adma_chan_array[2]); +} + +DECLARE_TASKLET(spe_adma0_tasklet, spe_adma0_task, 0); +DECLARE_TASKLET(spe_adma1_tasklet, spe_adma1_task, 0); +DECLARE_TASKLET(spe_adma2_tasklet, spe_adma2_task, 0); +struct tasklet_struct *spe_adma_tasklet[] = { + &spe_adma0_tasklet, + &spe_adma1_tasklet, + &spe_adma2_tasklet, +}; + This is something I am cleaning up in iop-adma by adding a struct tasklet * to each channel. I'll post an incremental diff of my iop-adma changes so you can see what I have cleaned up since the 2.6.20-rc5 posting.
+static dma_addr_t spe_adma_map_page(struct dma_chan *chan, struct page *page, + unsigned long offset, size_t size, + int direction) +{ + struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan); + return dma_map_page(&spe_chan->device->pdev->dev, page, offset, size, + direction); +} + +static dma_addr_t spe_adma_map_single(struct dma_chan *chan, void *cpu_addr, + size_t size, int direction) +{ + struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan); + return dma_map_single(&spe_chan->device->pdev->dev, cpu_addr, size, + direction); +} + +static void spe_adma_unmap_page(struct dma_chan *chan, dma_addr_t handle, + size_t size, int direction) +{ + struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan); + dma_unmap_page(&spe_chan->device->pdev->dev, handle, size, direction); +} + +static void spe_adma_unmap_single(struct dma_chan *chan, dma_addr_t handle, + size_t size, int direction) +{ + struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan); + dma_unmap_single(&spe_chan->device->pdev->dev, handle, size, direction); +} + ...these are gone as well in the latest code. +static int __devinit spe_adma_probe(struct platform_device *pdev) ../.. + printk(KERN_INFO "Intel(R) SPE ADMA Engine found [%d]: " Intel(R)? :-) Regards, Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.
On 3/17/07, Stefan Roese <[EMAIL PROTECTED]> wrote: Dan, I just noticed that your patch "dmaengine: add the async_tx api": @@ -22,6 +22,17 @@ config NET_DMA Since this is the main user of the DMA engine, it should be enabled; say Y here. +config ASYNC_TX_DMA + tristate "Asynchronous Bulk Memory Transfers/Transforms API" + default y + ---help--- + This enables the async_tx management layer for dma engines. + Subsystems coded to this API will use offload engines for bulk + memory operations where present. Software implementations are + called when a dma engine is not present or fails to allocate + memory to carry out the transaction. + Current subsystems ported to async_tx: MD_RAID4,5 + adds ASYNC_TX_DMA unconditionally to _all_ platforms. You might what to bundle this with something like DMA_ENGINE. Yes, defaulting to 'y' is not necessary, but ASYNC_TX_DMA=y && DMA_ENGINE=n is an explicit feature of the interface. When DMA_ENGINE is not selected all the asynchronous paths in the API are compiled out. This allows subsytems, like md-raid5, to be written in an asynchronous fashion without regard for the architecture[1] or availability of offload engines. Best regards, Stefan Regards, Dan [1] The API implicitly handles channel switching depending on the offload engine architecture. Where an iop13xx engine can handle a copy+xor sequence on one channel, a 440sp or iop3xx platform will need to switch between copy and xor capable engines. Resolving operation dependencies and channel switching is handled behind the scenes. - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.
The current implementation builds on my embedded PPC4xx system without any disks the objects async_tx.o and xor.o into the kernel which I definitely don't need and want. And I get something like: async_tx: api initialized (sync-only) xor: measuring software checksumming speed 8regs : 145.000 MB/sec 8regs_prefetch: 115.000 MB/sec 32regs: 176.000 MB/sec 32regs_prefetch: 135.000 MB/sec xor: using function: 32regs (176.000 MB/sec) upon bootup. Understood. I'll change it so that xor.o and async_tx.o are off by default. Best regards, Stefan Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: 2.6.20.3 AMD64 oops in CFQ code
On 3/22/07, Neil Brown <[EMAIL PROTECTED]> wrote: On Thursday March 22, [EMAIL PROTECTED] wrote: > On Thu, Mar 22 2007, [EMAIL PROTECTED] wrote: > > > 3 (I think) separate instances of this, each involving raid5. Is your > > > array degraded or fully operational? > > > > Ding! A drive fell out the other day, which is why the problems only > > appeared recently. > > > > md5 : active raid5 sdf4[5] sdd4[3] sdc4[2] sdb4[1] sda4[0] > > 1719155200 blocks level 5, 64k chunk, algorithm 2 [6/5] [_U] > > bitmap: 149/164 pages [596KB], 1024KB chunk > > > > H'm... this means that my alarm scripts aren't working. Well, that's > > good to know. The drive is being re-integrated now. > > Heh, at least something good came out of this bug then :-) > But that's reaffirming. Neil, are you following this? It smells somewhat > fishy wrt raid5. Yes, I've been trying to pay attention. The evidence does seem to point to raid5 and degraded arrays being implicated. However I'm having trouble finding how the fact that an array is degraded would be visible down in the elevator except for having a slightly different distribution of reads and writes. One possible way is that if an array is degraded, then some read requests will go through the stripe cache rather than direct to the device. However I would more expect the direct-to-device path to have problems as it is much newer code. Going through the cache for reads is very well tested code - and reads come from the cache for most writes anyway, so the elevator will still see lots of single-page reads. It only ever sees single-page writes. There might be more pressure on the stripe cache when running degraded, so we might call the ->unplug_fn a little more often, but I doubt that would be noticeable. As you seem to suggest by the patch, it does look like some sort of unlocked access to the cfq_queue structure.
However apart from the comment before cfq_exit_single_io_context being in the wrong place (should be before __cfq_exit_single_io_context) I cannot see anything obviously wrong with the locking around that structure. So I'm afraid I'm stumped too. NeilBrown Not a cfq failure, but I have been able to reproduce a different oops at array stop time while i/o's were pending. I have not dug into it enough to suggest a patch, but I wonder if it is somehow related to the cfq failure since it involves congestion and drives going away: md: md0: recovery done. Unable to handle kernel NULL pointer dereference at virtual address 00bc pgd = 40004000 [00bc] *pgd= Internal error: Oops: 17 [#1] Modules linked in: CPU: 0 PC is at raid5_congested+0x14/0x5c LR is at sync_sb_inodes+0x278/0x2ec pc : [<402801cc>]lr : [<400a39e8>]Not tainted sp : 8a3e3ec4 ip : 8a3e3ed4 fp : 8a3e3ed0 r10: 40474878 r9 : 40474870 r8 : 40439710 r7 : 8a3e3f30 r6 : bfa76b78 r5 : 4161dc08 r4 : 40474800 r3 : 402801b8 r2 : 0004 r1 : 0001 r0 : Flags: nzCv IRQs on FIQs on Mode SVC_32 Segment kernel Control: 400397F Table: 7B7D4018 DAC: 0035 Process pdflush (pid: 1371, stack limit = 0x8a3e2250) Stack: (0x8a3e3ec4 to 0x8a3e4000) 3ec0: 8a3e3f04 8a3e3ed4 400a39e8 402801c4 8a3e3f24 000129f9 40474800 3ee0: 4047483c 40439a44 8a3e3f30 40439710 40438a48 4045ae68 8a3e3f24 8a3e3f08 3f00: 400a3ca0 400a377c 8a3e3f30 1162 00012bed 40438a48 8a3e3f78 8a3e3f28 3f20: 40069b58 400a3bfc 00011e41 8a3e3f38 8a3e3f28 0400 3f40: 0025 8a3e3f80 8a3e3f8c 3f60: 40439750 8a3e2000 40438a48 8a3e3fc0 8a3e3f7c 4006ab68 40069a8c 0001 3f80: bfae2ac0 40069a80 8a3e3f8c 8a3e3f8c 00012805 8a3e2000 3fa0: 9e7e1f1c 4006aa40 0001 fffc 8a3e3ff4 8a3e3fc4 4005461c 3fc0: 4006aa4c 0001 3fe0: 8a3e3ff8 40042320 40054520 Backtrace: [<402801b8>] (raid5_congested+0x0/0x5c) from [<400a39e8>] (sync_sb_inodes+0x278/0x2ec) [<400a3770>] (sync_sb_inodes+0x0/0x2ec) from [<400a3ca0>] (writeback_inodes+0xb0/0xb8) [<400a3bf0>] (writeback_inodes+0x0/0xb8) from [<40069b58>] 
(wb_kupdate+0xd8/0x160) r7 = 40438A48 r6 = 00012BED r5 = 1162 r4 = 8A3E3F30 [<40069a80>] (wb_kupdate+0x0/0x160) from [<4006ab68>] (pdflush+0x128/0x204) r8 = 40438A48 r7 = 8A3E2000 r6 = 40439750 r5 = 8A3E3F8C r4 = 8A3E3F80 [<4006aa40>] (pdflush+0x0/0x204) from [<4005461c>] (kthread+0x108/0x134) [<40054514>] (kthread+0x0/0x134) from [<40042320>] (do_exit+0x0/0x844) Code: e92dd800 e24cb004 e590 e3a01001 (e59030bc) md: md0 stopped. md: unbind md: export_rdev(sda) md: unbind md: export_rdev(sdd) md: unbind md: export_rdev(sdc) md: unbind md: export_rdev(sdb) 2.6.20-rc3-iop1 on an iop348 platform. SATA controller is sata_vsc. -- Dan - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/ma