[PATCH 005 of 006] raid5: Move expansion operations to a work queue

2006-06-28 Thread Dan Williams
This patch modifies handle_write_operations5() to handle the parity
calculation request made by the reshape code.  However this patch does
not move the copy operation associated with an expand to the work queue.
First, it was difficult to find a clean way to pass the parameters of
this operation to the queue.  Second, this section of code is a good
candidate for performing the copies with inline calls to the dma
routines.

This patch also cleans up the *_End flags, which as of this version
of the patch set are no longer needed.
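
For reference, the reshape path below relies on the following calling
convention (my summary of handle_write_operations5(), not part of the
patch itself):

        work_queued = handle_write_operations5(sh, 0, start_n);
        /* > 0  : an operation was queued, 'work_queued' blocks newly locked
         * == 0 : a previously queued operation has completed
         * < 0  : an operation is still in flight (-EBUSY)
         */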

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |   51 -
 include/linux/raid/raid5.h |   36 +++
 2 files changed, 54 insertions(+), 33 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-28 10:35:40.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-28 10:35:50.0 -0700
@@ -1250,16 +1250,25 @@
 */
if (locked == 0) {
if (rcw == 0) {
-   /* enter stage 1 of reconstruct write operation */
-   set_bit(STRIPE_OP_RCW, &sh->state);
-   set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
-   for (i=disks ; i-- ;) {
-   struct r5dev *dev = &sh->dev[i];
-
-   if (i!=pd_idx && dev->towrite) {
-   set_bit(R5_LOCKED, &dev->flags);
+   /* skip the drain operation on an expand */
+   if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) {
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   set_bit(R5_LOCKED, &sh->dev[i].flags);
locked++;
}
+   } else { /* enter stage 1 of reconstruct write operation */
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (i!=pd_idx && dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   locked++;
+   }
+   }
}
} else {
/* enter stage 1 of read modify write operation */
@@ -2213,16 +,24 @@
}
 
if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+   int work_queued, start_n=1;
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
-   compute_parity5(sh, RECONSTRUCT_WRITE);
-   for (i= conf->raid_disks; i--;) {
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   locked++;
-   set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   if (!(test_bit(STRIPE_OP_RCW, &sh->state) ||
+   test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state))) {
+   start_n = 0;
+   set_bit(STRIPE_OP_RCW_Expand, &sh->ops.state);
+   }
+   work_queued = handle_write_operations5(sh, 0, start_n);
+   if (work_queued == 0) {
+   for (i= conf->raid_disks; i--;)
+   set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   clear_bit(STRIPE_EXPANDING, &sh->state);
+   clear_bit(STRIPE_OP_RCW_Expand, &sh->ops.state);
+   } else if (work_queued > 0) {
+   locked += work_queued;
}
-   clear_bit(STRIPE_EXPANDING, &sh->state);
} else if (expanded) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
@@ -2257,9 +2274,15 @@
release_stripe(sh2);
continue;
}
+   /

[PATCH 002 of 006] raid5: Move check parity operations to a work queue

2006-06-28 Thread Dan Williams
This patch adds 'check parity' capabilities to the work queue and fixes
'queue_raid_work'.

Also, raid5_do_soft_block_ops now accesses the stripe state under the
lock to ensure that it is never out of sync with handle_stripe5.
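
For reference, the locking discipline amounts to taking a snapshot of both
state words under sh->lock before the work routine acts on them, roughly:

        spin_lock(&sh->lock);
        state = sh->state;
        ops_state_orig = ops_state = sh->ops.state;
        spin_unlock(&sh->lock);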

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |  123 ++---
 include/linux/raid/raid5.h |   25 ++---
 2 files changed, 113 insertions(+), 35 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-28 09:52:07.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-28 10:35:23.0 -0700
@@ -1289,7 +1289,7 @@
if (locked > 0) {
set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
-   sh->ops.queue_count++;
+   sh->ops.pending++;
} else if (locked == 0)
set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
 
@@ -1300,6 +1300,37 @@
return locked;
 }
 
+static int handle_check_operations5(struct stripe_head *sh, int start_n)
+{
+   int complete=0, work_queued = -EBUSY;
+
+   if (test_bit(STRIPE_OP_CHECK, &sh->state) &&
+   test_bit(STRIPE_OP_CHECK_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_CHECK, &sh->state);
+   clear_bit(STRIPE_OP_CHECK_Done, &sh->ops.state);
+   complete = 1;
+   }
+
+   if (start_n == 0) {
+   /* enter stage 1 of parity check operation */
+   set_bit(STRIPE_OP_CHECK, &sh->state);
+   set_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state);
+   work_queued = 1;
+   } else if (complete)
+   work_queued = 0;
+
+   if (work_queued > 0) {
+   clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+   sh->ops.pending++;
+   }
+
+   PRINTK("%s: stripe %llu start: %d complete: %d op_state: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   start_n == 0, complete, sh->ops.state);
+
+   return work_queued;
+}
+
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -1406,11 +1437,11 @@
 /* must be called under the stripe lock */
 static void queue_raid_work(struct stripe_head *sh)
 {
-   if (--sh->ops.queue_count == 0) {
+   if (!test_bit(STRIPE_OP_QUEUED, &sh->state) && sh->ops.pending) {
+   set_bit(STRIPE_OP_QUEUED, &sh->state);
atomic_inc(&sh->count);
queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work);
-   } else if (sh->ops.queue_count < 0)
-   sh->ops.queue_count = 0;
+   }
 }
 
 /*
@@ -1423,16 +1454,17 @@
int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1;
void *ptr[MAX_XOR_BLOCKS];
struct bio *chosen;
-   int overlap=0, new_work=0, written=0;
-   unsigned long state, ops_state;
+   int overlap=0, work=0, written=0;
+   unsigned long state, ops_state, ops_state_orig;
 
/* take a snapshot of what needs to be done at this point in time */
spin_lock(&sh->lock);
state = sh->state;
-   ops_state = sh->ops.state;
+   ops_state_orig = ops_state = sh->ops.state;
spin_unlock(&sh->lock);
 
if (test_bit(STRIPE_OP_RMW, &state)) {
+   BUG_ON(test_bit(STRIPE_OP_RCW, &state));
PRINTK("%s: stripe %llu STRIPE_OP_RMW op_state: %lx\n",
__FUNCTION__, (unsigned long long)sh->sector,
ops_state);
@@ -1483,14 +1515,14 @@
if (count != 1)
xor_block(count, STRIPE_SIZE, ptr);
 
-   /* signal completion and acknowledge the last state seen
-* by sh->ops.state
-*/
+   work++;
set_bit(STRIPE_OP_RMW_Done, &ops_state);
-   set_bit(STRIPE_OP_RMW_ParityPre, &ops_state);
}
 
-   } else if (test_bit(STRIPE_OP_RCW, &state)) {
+   }
+
+   if (test_bit(STRIPE_OP_RCW, &state)) {
+   BUG_ON(test_bit(STRIPE_OP_RMW, &state));
PRINTK("%s: stripe %llu STRIPE_OP_RCW op_state: %lx\n",
__FUNCTION__, (unsigned long long)sh->sector,
ops_state);
@@ -1527,20 +1559,47 @@
if (count != 1)
xor_

[PATCH 006 of 006] raid5: Remove compute_block and compute_parity

2006-06-28 Thread Dan Williams
compute_block and compute_parity5 are replaced by the work queue and the
handle_*_operations5 routines.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 raid5.c |  123 
 1 files changed, 123 deletions(-)

===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-27 16:16:31.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-27 16:19:13.0 -0700
@@ -918,129 +918,6 @@
} while(0)
 
 
-static void compute_block(struct stripe_head *sh, int dd_idx)
-{
-   int i, count, disks = sh->disks;
-   void *ptr[MAX_XOR_BLOCKS], *p;
-
-   PRINTK("compute_block, stripe %llu, idx %d\n", 
-   (unsigned long long)sh->sector, dd_idx);
-
-   ptr[0] = page_address(sh->dev[dd_idx].page);
-   memset(ptr[0], 0, STRIPE_SIZE);
-   count = 1;
-   for (i = disks ; i--; ) {
-   if (i == dd_idx)
-   continue;
-   p = page_address(sh->dev[i].page);
-   if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-   ptr[count++] = p;
-   else
-   printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
-   " not present\n", dd_idx,
-   (unsigned long long)sh->sector, i);
-
-   check_xor();
-   }
-   if (count != 1)
-   xor_block(count, STRIPE_SIZE, ptr);
-   set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-}
-
-static void compute_parity5(struct stripe_head *sh, int method)
-{
-   raid5_conf_t *conf = sh->raid_conf;
-   int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-   void *ptr[MAX_XOR_BLOCKS];
-   struct bio *chosen;
-
-   PRINTK("compute_parity5, stripe %llu, method %d\n",
-   (unsigned long long)sh->sector, method);
-
-   count = 1;
-   ptr[0] = page_address(sh->dev[pd_idx].page);
-   switch(method) {
-   case READ_MODIFY_WRITE:
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
-   for (i=disks ; i-- ;) {
-   if (i==pd_idx)
-   continue;
-   if (sh->dev[i].towrite &&
-   test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   check_xor();
-   }
-   }
-   break;
-   case RECONSTRUCT_WRITE:
-   memset(ptr[0], 0, STRIPE_SIZE);
-   for (i= disks; i-- ;)
-   if (i!=pd_idx && sh->dev[i].towrite) {
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   }
-   break;
-   case CHECK_PARITY:
-   break;
-   }
-   if (count>1) {
-   xor_block(count, STRIPE_SIZE, ptr);
-   count = 1;
-   }
-   
-   for (i = disks; i--;)
-   if (sh->dev[i].written) {
-   sector_t sector = sh->dev[i].sector;
-   struct bio *wbi = sh->dev[i].written;
-   while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
-   copy_data(1, wbi, sh->dev[i].page, sector);
-   wbi = r5_next_bio(wbi, sector);
-   }
-
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   set_bit(R5_UPTODATE, &sh->dev[i].flags);
-   }
-
-   switch(method) {
-   case RECONSTRUCT_WRITE:
-   case CHECK_PARITY:
-   for (i=disks; i--;)
-   if (i != pd_idx) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   check_xor();
-   }
-   break;
-   case

[PATCH 003 of 006] raid5: Move compute block operations to a work queue

2006-06-28 Thread Dan Williams
This patch adds 'compute block' capabilities to the work queue.

Here are a few notes about the new flags R5_ComputeReq and
STRIPE_OP_COMPUTE_Recover:

Previously, when handle_stripe5 found a block that needed to be computed
it updated it in the same step.  Now that these operations are separated
(across multiple calls to handle_stripe5), an R5_ComputeReq flag is
needed to tell other parts of handle_stripe5 to treat the block under
computation as if it were up to date.  The order of events in the work
queue ensures that the block is indeed up to date before performing
further operations.

STRIPE_OP_COMPUTE_Recover was added to track when the parity block is
being computed due to a failed parity check.  This allows the code in
handle_stripe5 that produces requests for check_parity and compute_block
operations to be separate from the code that consumes the result.
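
In practice this means that tests which previously checked only R5_UPTODATE
now accept either flag; a representative (simplified) form, matching the
hunks below:

        if (test_bit(R5_UPTODATE, &dev->flags) ||
            test_bit(R5_ComputeReq, &dev->flags)) {
                /* the block is up to date, or will be once the queued
                 * compute operation completes
                 */
        }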

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |  147 +
 include/linux/raid/raid5.h |7 +-
 2 files changed, 129 insertions(+), 25 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-28 10:47:43.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-28 11:06:06.0 -0700
@@ -1263,7 +1263,9 @@
}
} else {
/* enter stage 1 of read modify write operation */
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+   BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+   test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags)));
+
set_bit(STRIPE_OP_RMW, &sh->state);
set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
for (i=disks ; i-- ;) {
@@ -1272,7 +1274,8 @@
continue;
 
if (dev->towrite &&
-   test_bit(R5_UPTODATE, &dev->flags)) {
+   (test_bit(R5_UPTODATE, &dev->flags) ||
+   test_bit(R5_ComputeReq, &dev->flags))) {
set_bit(R5_LOCKED, &dev->flags);
locked++;
}
@@ -1331,6 +1334,30 @@
return work_queued;
 }
 
+static int handle_compute_operations5(struct stripe_head *sh, int dd_idx)
+{
+   int work_queued = -EBUSY;
+
+   if (test_bit(STRIPE_OP_COMPUTE, &sh->state) &&
+   test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_COMPUTE, &sh->state);
+   clear_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state);
+   clear_bit(R5_ComputeReq, &sh->dev[dd_idx].flags);
+   work_queued = 0;
+   } else if (!test_bit(STRIPE_OP_COMPUTE, &sh->state)) {
+   set_bit(STRIPE_OP_COMPUTE, &sh->state);
+   set_bit(STRIPE_OP_COMPUTE_Prep, &sh->ops.state);
+   set_bit(R5_ComputeReq, &sh->dev[dd_idx].flags);
+   work_queued = 1;
+   sh->ops.pending++;
+   }
+
+   PRINTK("%s: stripe %llu work_queued: %d op_state: %lx dev[%d].flags: 
%lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   work_queued, sh->ops.state, dd_idx, sh->dev[dd_idx].flags);
+
+   return work_queued;
+}
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -1454,7 +1481,7 @@
int i, pd_idx = sh->pd_idx, disks = sh->disks, count = 1;
void *ptr[MAX_XOR_BLOCKS];
struct bio *chosen;
-   int overlap=0, work=0, written=0;
+   int overlap=0, work=0, written=0, compute=0, dd_idx=0;
unsigned long state, ops_state, ops_state_orig;
 
/* take a snapshot of what needs to be done at this point in time */
@@ -1463,6 +1490,51 @@
ops_state_orig = ops_state = sh->ops.state;
spin_unlock(&sh->lock);
 
+   if (test_bit(STRIPE_OP_COMPUTE, &state)) {
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (test_bit(R5_ComputeReq, &dev->flags)) {
+   dd_idx = i;
+   i = -1;
+   break;
+   }
+   }
+   BUG_ON(i >= 0);
+   PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: 
%d\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+

[PATCH 004 of 006] raid5: Move read completion copies to a work queue

2006-06-28 Thread Dan Williams
This patch moves the data copying portion of satisfying read requests
into the work queue. It adds a 'read' (past tense) pointer to the r5dev
structure to track reads that have been offloaded to the work queue.
When the copy operation is complete the 'read' pointer is reused as the
return_bi for the bi_end_io() call.
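
The raid5.h change is not reproduced in the excerpt, but it boils down to
one more bio list pointer in struct r5dev (sketch, not the exact hunk):

        struct r5dev {
                ...
                struct bio      *toread, *read, *towrite, *written;
                ...
        };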

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |   94 -
 include/linux/raid/raid5.h |6 +-
 2 files changed, 71 insertions(+), 29 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-28 10:35:31.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-28 10:35:40.0 -0700
@@ -213,11 +213,11 @@
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
 
-   if (dev->toread || dev->towrite || dev->written ||
+   if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
-   printk("sector=%llx i=%d %p %p %p %d\n",
+   printk("sector=%llx i=%d %p %p %p %p %d\n",
   (unsigned long long)sh->sector, i, dev->toread,
-  dev->towrite, dev->written,
+  dev->read, dev->towrite, dev->written,
   test_bit(R5_LOCKED, &dev->flags));
BUG();
}
@@ -1490,6 +1490,35 @@
ops_state_orig = ops_state = sh->ops.state;
spin_unlock(&sh->lock);
 
+   if (test_bit(STRIPE_OP_BIOFILL, &state)) {
+   raid5_conf_t *conf = sh->raid_conf;
+   struct bio *return_bi=NULL;
+   PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL op_state: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   ops_state);
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   struct bio *rbi, *rbi2;
+   rbi = dev->read;
+   while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+   copy_data(0, rbi, dev->page, dev->sector);
+   rbi2 = r5_next_bio(rbi, dev->sector);
+   spin_lock_irq(&conf->device_lock);
+   if (--rbi->bi_phys_segments == 0) {
+   rbi->bi_next = return_bi;
+   return_bi = rbi;
+   }
+   spin_unlock_irq(&conf->device_lock);
+   rbi = rbi2;
+   dev->read = return_bi;
+   }
+   }
+
+   work++;
+   set_bit(STRIPE_OP_BIOFILL_Done, &ops_state);
+   }
+
if (test_bit(STRIPE_OP_COMPUTE, &state)) {
for (i=disks ; i-- ;) {
struct r5dev *dev = &sh->dev[i];
@@ -1725,6 +1754,7 @@
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+   int fill_complete=0, to_fill=0;
int non_overwrite = 0;
int failed_num=0;
struct r5dev *dev;
@@ -1740,45 +1770,49 @@
syncing = test_bit(STRIPE_SYNCING, &sh->state);
expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-   /* Now to look around and see what can be done */
+   if (test_bit(STRIPE_OP_BIOFILL, &sh->state) &&
+   test_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_BIOFILL, &sh->state);
+   clear_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state);
+   fill_complete++;
+   }
 
+   /* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
 
-   PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
-   i, dev->flags, dev->toread, dev->towrite, dev->written);
+   PRINTK("check %d: state 0x%lx toread %p read %p write %p 
written %p\n",
+   i, dev->flags, dev->toread, dev->

[PATCH 000 of 006] raid5: Offload RAID operations to a workqueue

2006-06-28 Thread Dan Williams
This patch set is a step towards enabling hardware offload in the
md-raid5 driver.  These patches are considered experimental and are not
yet suitable for production environments.

As mentioned, this patch set is the first step in that it moves work
from handle_stripe5 to a work queue.  The next step is to enable the
work queue to offload the operations to hardware copy/xor engines using
the dmaengine API (include/linux/dmaengine.h).  Initial testing shows
that about 60% of the array maintenance work previously performed by
raid5d has moved to the work queue.

These patches apply to the version of md as of commit 
266bee88699ddbde42ab303bbc426a105cc49809 in Linus' tree.

Regards,

Dan Williams

[PATCH 001 of 006] raid5: Move write operations to a work queue
[PATCH 002 of 006] raid5: Move check parity operations to a work queue
[PATCH 003 of 006] raid5: Move compute block operations to a work queue
[PATCH 004 of 006] raid5: Move read completion copies to a work queue
[PATCH 005 of 006] raid5: Move expansion operations to a work queue
[PATCH 006 of 006] raid5: Remove compute_block and compute_parity


[PATCH 001 of 006] raid5: Move write operations to a work queue

2006-06-28 Thread Dan Williams
This patch moves write (reconstruct and read-modify) operations to a
work queue.  Note that the next patch in this series fixes some incorrect
assumptions around having multiple operations in flight (i.e. ignore
this version of 'queue_raid_work').
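
A rough picture of a reconstruct-write under this scheme, pieced together
from the code below (illustrative summary, not new code):

        /* handle_stripe5: sets STRIPE_OP_RCW + STRIPE_OP_RCW_Drain and
         *                 locks the blocks to be written
         * work queue:     drains the bios into the stripe cache, xors the
         *                 new parity, sets STRIPE_OP_RCW_Done
         * handle_stripe5: sees RCW_Done, clears the flags, marks parity
         *                 R5_UPTODATE and issues the writes
         */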

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |  314 +
 include/linux/raid/raid5.h |   67 +
 2 files changed, 357 insertions(+), 24 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-28 08:44:11.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-28 09:52:07.0 -0700
@@ -305,6 +305,7 @@
memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
spin_lock_init(&sh->lock);
+   INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
 
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
@@ -1224,6 +1225,80 @@
}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw, int locked)
+{
+   int i, pd_idx = sh->pd_idx, disks = sh->disks;
+   int complete=0;
+
+   if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+   test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_RCW, &sh->state);
+   clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
+   complete++;
+   }
+
+   if (test_bit(STRIPE_OP_RMW, &sh->state) &&
+   test_bit(STRIPE_OP_RMW_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_RMW, &sh->state);
+   clear_bit(STRIPE_OP_RMW_Done, &sh->ops.state);
+   BUG_ON(++complete == 2);
+   }
+
+
+   /* If no operation is currently in process then use the rcw flag to
+* select an operation
+*/
+   if (locked == 0) {
+   if (rcw == 0) {
+   /* enter stage 1 of reconstruct write operation */
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (i!=pd_idx && dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   locked++;
+   }
+   }
+   } else {
+   /* enter stage 1 of read modify write operation */
+   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+   set_bit(STRIPE_OP_RMW, &sh->state);
+   set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (i==pd_idx)
+   continue;
+
+   if (dev->towrite &&
+   test_bit(R5_UPTODATE, &dev->flags)) {
+   set_bit(R5_LOCKED, &dev->flags);
+   locked++;
+   }
+   }
+   }
+   } else if (locked && complete == 0) /* the queue has an operation in flight */
+   locked = -EBUSY;
+   else if (complete)
+   locked = 0;
+
+   /* keep the parity disk locked while asynchronous operations
+* are in flight
+*/
+   if (locked > 0) {
+   set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+   clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+   sh->ops.queue_count++;
+   } else if (locked == 0)
+   set_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+
+   PRINTK("%s: stripe %llu locked: %d complete: %d op_state: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   locked, complete, sh->ops.state);
+
+   return locked;
+}
 
 
 /*
@@ -1320,6 +1395,174 @@
return pd_idx;
 }
 
+static inline void drain_bio(struct bio *wbi, sector_t sector, struct page *page)
+{
+   while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) {
+   copy_data(1, wbi, page, sector);
+   wbi = r5_next_bio(wbi, sector);
+   }

Re: [PATCH 000 of 006] raid5: Offload RAID operations to a workqueue

2006-06-29 Thread Dan Williams
> Hi,
> 
> since using work queues involve more context switches than doing things
> inline... have you measured the performance impact of your changes? If
> so... was there any impact that you could measure, and how big was that?
> 
> Greetings,
> Arjan van de Ven

Good point.  Especially on ARM extra context switching can be very
expensive.  In general more testing (and testers for that matter) is
needed.  To help determine whether a multi-threaded work queue performs
better or worse than an in-context implementation, here is a patch that
makes this configurable.

Thanks,

Dan


[PATCH] raid5: Configuration options to allow raid ops to run in raid5d context

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/Kconfig |   21 +
 drivers/md/raid5.c |   25 +
 include/linux/raid/raid5.h |6 ++
 3 files changed, 52 insertions(+)

Index: linux-2.6-raid/drivers/md/Kconfig
===
--- linux-2.6-raid.orig/drivers/md/Kconfig  2006-06-29 11:40:02.0 -0700
+++ linux-2.6-raid/drivers/md/Kconfig   2006-06-29 13:43:03.0 -0700
@@ -162,6 +162,27 @@
  There should be enough spares already present to make the new
  array workable.
 
+config MD_RAID456_WORKQUEUE
+   depends on MD_RAID456
+   bool "Offload raid work to a workqueue from raid5d"
+   ---help---
+ This option enables raid work (block copy and xor operations)
+ to run in a workqueue.  However this may come at the expense of
+ extra context switching.  Single processor systems may benefit
+ from keeping the work within the raid5d context.
+
If unsure, say Y.
+
+config MD_RAID456_WORKQUEUE_MULTITHREAD
+   depends on MD_RAID456_WORKQUEUE && SMP
+   bool "Enable multi-threaded raid processing"
+   default y
+   ---help---
+ This option controls whether the raid workqueue will be multi-
+ threaded or single threaded.
+
If unsure, say Y.
+
 config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-29 13:42:57.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-29 13:43:03.0 -0700
@@ -305,7 +305,9 @@
memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
spin_lock_init(&sh->lock);
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE
INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
+   #endif
 
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
@@ -1352,8 +1354,10 @@
 {
if (!test_bit(STRIPE_OP_QUEUED, &sh->state) && sh->ops.pending) {
set_bit(STRIPE_OP_QUEUED, &sh->state);
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE
atomic_inc(&sh->count);
queue_work(sh->raid_conf->block_ops_queue, &sh->ops.work);
+   #endif
}
 }
 
@@ -1614,7 +1618,9 @@
queue_raid_work(sh);
spin_unlock(&sh->lock);
 
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE
release_stripe(sh);
+   #endif
 }
 
 /*
@@ -2182,6 +2188,13 @@
 
spin_unlock(&sh->lock);
 
+   #ifndef CONFIG_MD_RAID456_WORKQUEUE
+   while (test_bit(STRIPE_OP_QUEUED, &sh->state)) {
+   PRINTK("run do_block_ops\n", __FUNCTION__);
+   conf->do_block_ops(sh);
+   }
+   #endif
+
while ((bi=return_bi)) {
int bytes = bi->bi_size;
 
@@ -3480,12 +3493,20 @@
goto abort;
}
 
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE
sprintf(conf->workqueue_name, "%s_raid5_ops",
mddev->gendisk->disk_name);
 
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE_MULTITHREAD
if ((conf->block_ops_queue = create_workqueue(conf->workqueue_name))
 == NULL)
goto abort;
+   #else
+   if ((conf->block_ops_queue = __create_workqueue(conf->workqueue_name, 1))
+== NULL)
+   goto abort;
+   #endif
+   #endif
 
/* To Do:
 * 1/ Offload to asynchronous copy / xor engines
@@ -3656,8 +3677,10 @@
safe_put_page(conf->spare_page);
kfree(conf->disks);
kfree(conf->stripe_hashtbl);
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE
if (conf->do_block_ops)
destroy_workqueue(conf->block_ops_queue);
+   #endif
 

Re: [PATCH 004 of 006] raid5: Move read completion copies to a work queue

2006-06-29 Thread Dan Williams
Minor refresh to make 'biofill' go through a test_and_clear_bit check
before performing the copy, which is important for the hardware offload
implementation where operations might need to be retried until DMA
resources are available.

-

This patch moves the data copying portion of satisfying read requests into
the workqueue. It adds a 'read' (past tense) pointer to the r5dev structure
to track reads that have been offloaded to the workqueue.  When the copy
operation is complete the 'read' pointer is reused as the return_bi for the
bi_end_io() call.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |   98 -
 include/linux/raid/raid5.h |7 ++-
 2 files changed, 76 insertions(+), 29 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-28 11:06:06.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-29 11:43:35.0 -0700
@@ -213,11 +213,11 @@
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
 
-   if (dev->toread || dev->towrite || dev->written ||
+   if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
-   printk("sector=%llx i=%d %p %p %p %d\n",
+   printk("sector=%llx i=%d %p %p %p %p %d\n",
   (unsigned long long)sh->sector, i, dev->toread,
-  dev->towrite, dev->written,
+  dev->read, dev->towrite, dev->written,
   test_bit(R5_LOCKED, &dev->flags));
BUG();
}
@@ -1490,6 +1490,38 @@
ops_state_orig = ops_state = sh->ops.state;
spin_unlock(&sh->lock);
 
+   if (test_bit(STRIPE_OP_BIOFILL, &state)) {
+   PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL op_state: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   ops_state);
+
+   if (test_and_clear_bit(STRIPE_OP_BIOFILL_Copy, &ops_state)) {
+   raid5_conf_t *conf = sh->raid_conf;
+   struct bio *return_bi=NULL;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   struct bio *rbi, *rbi2;
+   rbi = dev->read;
+   while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+   copy_data(0, rbi, dev->page, dev->sector);
+   rbi2 = r5_next_bio(rbi, dev->sector);
+   spin_lock_irq(&conf->device_lock);
+   if (--rbi->bi_phys_segments == 0) {
+   rbi->bi_next = return_bi;
+   return_bi = rbi;
+   }
+   spin_unlock_irq(&conf->device_lock);
+   rbi = rbi2;
+   dev->read = return_bi;
+   }
+   }
+
+   work++;
+   set_bit(STRIPE_OP_BIOFILL_Done, &ops_state);
+   }
+   }
+
if (test_bit(STRIPE_OP_COMPUTE, &state)) {
for (i=disks ; i-- ;) {
struct r5dev *dev = &sh->dev[i];
@@ -1725,6 +1757,7 @@
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+   int fill_complete=0, to_fill=0;
int non_overwrite = 0;
int failed_num=0;
struct r5dev *dev;
@@ -1740,45 +1773,49 @@
syncing = test_bit(STRIPE_SYNCING, &sh->state);
expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-   /* Now to look around and see what can be done */
+   if (test_bit(STRIPE_OP_BIOFILL, &sh->state) &&
+   test_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_BIOFILL, &sh->state);
+   clear_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state);
+   fill_complete++;
+   }
 
+   /* Now to look around and see what can be done */
rcu

Re: [PATCH 005 of 006] raid5: Move expansion operations to a work queue

2006-06-29 Thread Dan Williams
Refresh to apply on top the new version of [PATCH 004 of 006].

---

This patch modifies handle_write_operations5() to handle the parity
calculation request made by the reshape code.  However this patch does
not move the copy operation associated with an expand to the work queue.
First, it was difficult to find a clean way to pass the parameters of
this operation to the queue.  Second, this section of code is a good
candidate for performing the copies with inline calls to the dma
routines.

This patch also cleans up the *_End flags, which as of this version of
the patch set are no longer needed.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>

 drivers/md/raid5.c |   51 -
 include/linux/raid/raid5.h |   36 +++
 2 files changed, 54 insertions(+), 33 deletions(-)

===
Index: linux-2.6-raid/drivers/md/raid5.c
===
--- linux-2.6-raid.orig/drivers/md/raid5.c  2006-06-29 11:43:35.0 -0700
+++ linux-2.6-raid/drivers/md/raid5.c   2006-06-29 11:44:30.0 -0700
@@ -1250,16 +1250,25 @@
 */
if (locked == 0) {
if (rcw == 0) {
-   /* enter stage 1 of reconstruct write operation */
-   set_bit(STRIPE_OP_RCW, &sh->state);
-   set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
-   for (i=disks ; i-- ;) {
-   struct r5dev *dev = &sh->dev[i];
-
-   if (i!=pd_idx && dev->towrite) {
-   set_bit(R5_LOCKED, &dev->flags);
+   /* skip the drain operation on an expand */
+   if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) {
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   set_bit(R5_LOCKED, &sh->dev[i].flags);
locked++;
}
+   } else { /* enter stage 1 of reconstruct write operation */
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (i!=pd_idx && dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   locked++;
+   }
+   }
}
} else {
/* enter stage 1 of read modify write operation */
@@ -2217,16 +2226,24 @@
}
 
if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+   int work_queued, start_n=1;
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
-   compute_parity5(sh, RECONSTRUCT_WRITE);
-   for (i= conf->raid_disks; i--;) {
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   locked++;
-   set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   if (!(test_bit(STRIPE_OP_RCW, &sh->state) ||
+   test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state))) {
+   start_n = 0;
+   set_bit(STRIPE_OP_RCW_Expand, &sh->ops.state);
+   }
+   work_queued = handle_write_operations5(sh, 0, start_n);
+   if (work_queued == 0) {
+   for (i= conf->raid_disks; i--;)
+   set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   clear_bit(STRIPE_EXPANDING, &sh->state);
+   clear_bit(STRIPE_OP_RCW_Expand, &sh->ops.state);
+   } else if (work_queued > 0) {
+   locked += work_queued;
}
-   clear_bit(STRIPE_EXPANDING, &sh->state);
} else if (expanded) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
@@ -2261,9 +2278,15 @@
release_stripe(sh2);
 

Re: Hardware assisted parity computation - is it now worth it?

2006-07-13 Thread Dan Williams

On 7/13/06, Burn Alting <[EMAIL PROTECTED]> wrote:

Last year, there were discussions on this list about the possible
use of a 'co-processor' (Intel's IOP333) to compute raid 5/6's
parity data.

The MD patches have been posted for review, and the hardware offload
pieces are nearing completion.


We are about to see low cost, multi core cpu chips with very
high speed memory bandwidth. In light of this, is there any
effective benefit to such devices as the IOP333?

It is true that upcoming server platforms have an abundance of CPU
cycles, but what about the case where an IOP is the host processor?
This is the primary target of the current work.  Also, what about the
more expensive RAID6 conditions (2-failed disks) where there might be
benefits to having MD split its work over many CPUs?

Regards,

Dan


Re: In Trouble--Please Help! (was Re: Can't add disk to failed raid array)

2006-07-24 Thread Dan Williams

On 7/23/06, Paul Waldo <[EMAIL PROTECTED]> wrote:

Here is the dmesg output.  No log files are created with the FC5 rescue disk.
Thanks!

I ran into this as well; I believe at this point you want to set:

md-mod.start_dirty_degraded=1

as part of your boot options.  Understand you may see some filesystem
corruption as noted in the documentation.

See:
http://www.linux-m32r.org/lxr/http/source/Documentation/md.txt?v=2.6.17#L54
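
For example, with grub it ends up on the kernel command line along these
lines (the kernel image and root device below are placeholders, use
whatever your config already has):

        kernel /vmlinuz-2.6.17 ro root=/dev/md0 md-mod.start_dirty_degraded=1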

Regards,

Dan


Re: In Trouble--Please Help! (was Re: Can't add disk to failed raid array)

2006-07-24 Thread Dan Williams

I'll certainly give that a try later on, as I need physical access to
the box.

The corruption part is worrisome...  When you did this, did you
experience corruption?  I'm running RAID6 with 7 disks; presumably even
with two disks out of whack, I should be in good shape...???


I was running a 5-disk RAID-5 and did not detect any corruption.  Neil,
correct me if I am wrong, but I believe that since your failure
occurred without power loss, the chances for data corruption in
this case are small.

Dan


Re: [PATCH 001 of 006] raid5: Move write operations to a work queue

2006-07-27 Thread Dan Williams

On 7/27/06, Yuri Tikhonov <[EMAIL PROTECTED]> wrote:


 Hello, Dan.

 I've looked through your patches, and have some suggestions about write 
operations processing.


Thanks for reviewing the code.



 In the current implementation of the Raid5 driver the RMW operation won't 
begin until old blocks in the stripe cache,
which are to be rewritten, become UPTODATE.
 But if you have dedicated h/w DMA engines, then, while an IOC (input/output
controller) performs transmission of the old strip data from the disk to the
stripe cache, it may make sense to start a DMA engine, which will transmit new
strip data from the bio requested to write. So, when an IOC operation completes,
we'll already have all the necessary data to compute the new parity value.

1) For the current implementation:

 Trmw = Tioc1 + Txor1 + Tdma + Txor2 + Tioc2,
 where Tioc1 is the time it takes to update stripe cache with old data, Txor1 
is the time it takes to subtract
old data from old parity value, Tdma is the time it takes to update strip with 
new data, Txor2 is the time it takes
to compute new parity, and Tioc2 is the time it takes to transfer updated data 
to disks.
 So, Trmw = 2*Tioc + 2*Txor + Tdma

2) If copying old and new data to stripe cache is performed simultaneously, 
then time to complete the whole RMW
operation will take:

 T'rmw = max(Tioc1, Tdma) + 2*Txor + Tioc2,
 where Tioc1 is the time it takes to update stripe cache with old data, Tdma is 
the time it takes to update strip
with new data, 2*Txor is the time it takes to compute new parity, and Tioc2 is 
the time it takes to transfer updated
data to disks.
 So, T'rmw = 2*Tioc + 2*Txor.
(in any case, I think that Tioc > Tdma, because Tioc corresponds to the time 
spent reading from disk, and
Tdma corresponds to operations with SDRAM, which are faster).

 Also, 2*Txor for (2) is less than 2*Txor for (1), because in the (2) approach we 
have to prepare XOR engine descriptors only
once, but in the (1) approach - twice.

 Does it make sense to revise your Raid5 driver implementation to allow IOC and 
DMA to have separate destination buffers? That is, some kind of a stripe 
shadow. IOC will copy to the regular buffer in the stripe cache, DMA - to the 
shadow one.


The issue I see with this is that Tioc1 is orders of magnitude greater
than Tdma.  So while I agree there may be room to get some pre-work
done while the reads are in flight, I do not expect that the
performance increase would be significant, and definitely not worth
the design complexity of adding a "shadow buffer".


 Regards, Yuri.


Regards,

Dan


Re: raid 5 read performance

2006-08-05 Thread Dan Williams

Neil hello

Sorry for the delay. too many things to do.

I have implemented all said in :
http://www.spinics.net/lists/raid/msg11838.html

As always I have some questions:

1.  mergeable_bvec
 I did not understand it at first, I must admit.  Now I do not see how it
differs from the one of raid0, so I actually copied it and renamed it.

2. statistics.
I have added md statistics since the code does not reach the
statistics in make_request;
it returns from make_request before that.

3. I have added the new retry list called toread_aligned to raid5_conf_t.
Hope this is correct.

4.  Your instructions are to add a failed bio to sh, but they do not
say to handle it directly.
I have tried it and something is missing here: raid5d handles
stripes only if conf->handle_list is not empty, so I added a handle_stripe
and release_stripe of my own.
   This way I managed to get the
   "R5: read error corrected!!" message from the completion routine.
   (I have tested by failing a ram disk.)


5. I am going to test the non-common path heavily before submitting
the patch to you (on real disks, using several file systems and
several chunk sizes).
 It is quite a big patch, so I need to know which kernel you want me
to use.  I am using poor 2.6.15.

I thank you
--
Raz

Hi Raz,

What is the status of this patch?  Anything I can help out with, like
testing or forward porting to the latest kernel?

Thanks,

Dan


Re: raid 5 read performance

2006-08-05 Thread Dan Williams

On 8/5/06, Raz Ben-Jehuda(caro) <[EMAIL PROTECTED]> wrote:

patch is applied by Neil.
I do not know when he going to apply it.
i have applied it on my systems ( on 2.6.15 )  but they are currenly in the
lab and not in production.
Raz.
PS
I must say that it saves lots of cpu cycles.


Did you send the 2.6.15 patch in a private message?  I can't find it in
the archives.

Thanks,

Dan


Re: Linux: Why software RAID?

2006-08-26 Thread Dan Williams

On 8/23/06, H. Peter Anvin <[EMAIL PROTECTED]> wrote:

Chris Friesen wrote:
> Jeff Garzik wrote:
>
>> But anyway, to help answer the question of hardware vs. software RAID,
>> I wrote up a page:
>>
>> http://linux.yyz.us/why-software-raid.html
>
> Just curious...with these guys
> (http://www.bigfootnetworks.com/KillerOverview.aspx) putting linux on a
> PCI NIC to allow them to bypass Windows' network stack, has anyone ever
> considered doing "hardware" raid by using an embedded cpu running linux
> software RAID, with battery-backed memory?
>
> It would theoretically allow you to remain feature-compatible by
> downloading new kernels to your RAID card.
>

Yes.  In fact, I have been told by several RAID chip vendors that their
customers are *strongly* demanding that their chips be able to run Linux
  md (and still use whatever hardware offload features.)

So it's happening.

Speaking of md with hardware offload features:

http://prdownloads.sourceforge.net/xscaleiop/ols_paper_2006.pdf?download


-hpa


Dan


[PATCH 00/19] Hardware Accelerated MD RAID5: Introduction

2006-09-11 Thread Dan Williams
Neil,

The following patches implement hardware accelerated raid5 for the Intel
Xscale® series of I/O Processors.  The MD changes allow stripe
operations to run outside the spin lock in a work queue.  Hardware
acceleration is achieved by using a dma-engine-aware work queue routine
instead of the default software only routine.

Since the last release of the raid5 changes many bug fixes and other
improvements have been made as a result of stress testing.  See the per
patch change logs for more information about what was fixed.  This
release is the first release of the full dma implementation.

The patches touch 3 areas, the md-raid5 driver, the generic dmaengine
interface, and a platform device driver for IOPs.  The raid5 changes
follow your comments concerning making the acceleration implementation
similar to how the stripe cache handles I/O requests.  The dmaengine
changes are the second release of this code.  They expand the interface
to handle more than memcpy operations, and add a generic raid5-dma
client.  The iop-adma driver supports dma memcpy, xor, xor zero sum, and
memset across all IOP architectures (32x, 33x, and 13xx).

Concerning the context switching performance concerns raised at the
previous release, I have observed the following.  For the hardware
accelerated case it appears that performance is always better with the
work queue than without since it allows multiple stripes to be operated
on simultaneously.  I expect the same for an SMP platform, but so far my
testing has been limited to IOPs.  For a single-processor
non-accelerated configuration I have not observed performance
degradation with work queue support enabled, but in the Kconfig option
help text I recommend disabling it (CONFIG_MD_RAID456_WORKQUEUE).

Please consider the patches for -mm.

-Dan

[PATCH 01/19] raid5: raid5_do_soft_block_ops
[PATCH 02/19] raid5: move write operations to a workqueue
[PATCH 03/19] raid5: move check parity operations to a workqueue
[PATCH 04/19] raid5: move compute block operations to a workqueue
[PATCH 05/19] raid5: move read completion copies to a workqueue
[PATCH 06/19] raid5: move the reconstruct write expansion operation to a 
workqueue
[PATCH 07/19] raid5: remove compute_block and compute_parity5
[PATCH 08/19] dmaengine: enable multiple clients and operations
[PATCH 09/19] dmaengine: reduce backend address permutations
[PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to 
clients
[PATCH 11/19] dmaengine: add memset as an asynchronous dma operation
[PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not 
support memcpy
[PATCH 13/19] dmaengine: add support for dma xor zero sum operations
[PATCH 14/19] dmaengine: add dma_sync_wait
[PATCH 15/19] dmaengine: raid5 dma client
[PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID 
engines
[PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs
[PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization
[PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver

Note, the iop3xx patches apply against the iop3xx platform code
re-factoring done by Lennert Buytenhek.  His patches are reproduced,
with permission, on the Xscale IOP SourceForge site.

Also available on SourceForge:

Linux Symposium Paper: MD RAID Acceleration Support for Asynchronous
DMA/XOR Engines
http://prdownloads.sourceforge.net/xscaleiop/ols_paper_2006.pdf?download

Tar archive of the patch set
http://prdownloads.sourceforge.net/xscaleiop/md_raid_accel-2.6.18-rc6.tar.gz?download

[PATCH 01/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-add-raid5-do-soft-block-ops.patch?download
[PATCH 02/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-move-write-operations-to-a-workqueue.patch?download
[PATCH 03/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-move-check-parity-operations-to-a-workqueue.patch?download
[PATCH 04/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-move-compute-block-operations-to-a-workqueue.patch?download
[PATCH 05/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-move-read-completion-copies-to-a-workqueue.patch?download
[PATCH 06/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-move-expansion-operations-to-a-workqueue.patch?download
[PATCH 07/19] 
http://prdownloads.sourceforge.net/xscaleiop/md-remove-compute_block-and-compute_parity5.patch?download
[PATCH 08/19] 
http://prdownloads.sourceforge.net/xscaleiop/dmaengine-multiple-clients-and-multiple-operations.patch?download
[PATCH 09/19] 
http://prdownloads.sourceforge.net/xscaleiop/dmaengine-unite-backend-address-types.patch?download
[PATCH 10/19] 
http://prdownloads.sourceforge.net/xscaleiop/dmaengine-dma-async-map-page.patch?download
[PATCH 11/19] 
http://prdownloads.sourceforge.net/xscaleiop/dmaengine-dma-async-memset.patch?download
[PATCH 12/19] 
http://prdownloads.sourceforge.net/xscaleiop/dmaengine-dma-async-memcpy-err.patch?download
[PATCH 13/19] 
http://prdownloads.sourceforge.net/xscale

[PATCH 01/19] raid5: raid5_do_soft_block_ops

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

raid5_do_soft_block_ops consolidates all the stripe cache maintenance
operations into a single routine.  The stripe operations are:
* copying data between the stripe cache and user application buffers
* computing blocks to save a disk access, or to recover a missing block
* updating the parity on a write operation (reconstruct write and
read-modify-write)
* checking parity correctness
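
As a rough map from the operations above to the stripe state bits used in
the code below (the grouping is illustrative; the names are the patch's):

        /* copy between cache and bios: STRIPE_OP_BIOFILL (+ _Done)
         * compute a block:             STRIPE_OP_COMPUTE (_Prep, _Parity, _Done)
         * update parity on a write:    STRIPE_OP_RMW / STRIPE_OP_RCW (+ sub-states)
         * check parity:                STRIPE_OP_CHECK (_Gen, _Done)
         */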

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  289 
 include/linux/raid/raid5.h |  129 +++-
 2 files changed, 415 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4500660..8fde62b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1362,6 +1362,295 @@ static int stripe_to_pdidx(sector_t stri
return pd_idx;
 }
 
+/*
+ * raid5_do_soft_block_ops - perform block memory operations on stripe data
+ * outside the spin lock.
+ */
+static void raid5_do_soft_block_ops(void *stripe_head_ref)
+{
+   struct stripe_head *sh = stripe_head_ref;
+   int i, pd_idx = sh->pd_idx, disks = sh->disks;
+   void *ptr[MAX_XOR_BLOCKS];
+   int overlap=0, work=0, written=0, compute=0, dd_idx=0;
+   int pd_uptodate=0;
+   unsigned long state, ops_state, ops_state_orig;
+   raid5_conf_t *conf = sh->raid_conf;
+
+   /* take a snapshot of what needs to be done at this point in time */
+   spin_lock(&sh->lock);
+   state = sh->state;
+   ops_state_orig = ops_state = sh->ops.state;
+   spin_unlock(&sh->lock);
+
+   if (test_bit(STRIPE_OP_BIOFILL, &state)) {
+   struct bio *return_bi=NULL;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (test_bit(R5_ReadReq, &dev->flags)) {
+   struct bio *rbi, *rbi2;
+   PRINTK("%s: stripe %llu STRIPE_OP_BIOFILL 
op_state: %lx disk: %d\n",
+   __FUNCTION__, (unsigned long 
long)sh->sector,
+   ops_state, i);
+   spin_lock_irq(&conf->device_lock);
+   rbi = dev->toread;
+   dev->toread = NULL;
+   spin_unlock_irq(&conf->device_lock);
+   overlap++;
+   while (rbi && rbi->bi_sector < dev->sector + STRIPE_SECTORS) {
+   copy_data(0, rbi, dev->page, dev->sector);
+   rbi2 = r5_next_bio(rbi, dev->sector);
+   spin_lock_irq(&conf->device_lock);
+   if (--rbi->bi_phys_segments == 0) {
+   rbi->bi_next = return_bi;
+   return_bi = rbi;
+   }
+   spin_unlock_irq(&conf->device_lock);
+   rbi = rbi2;
+   }
+   dev->read = return_bi;
+   }
+   }
+   if (overlap) {
+   set_bit(STRIPE_OP_BIOFILL_Done, &ops_state);
+   work++;
+   }
+   }
+
+   if (test_bit(STRIPE_OP_COMPUTE, &state)) {
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (test_bit(R5_ComputeReq, &dev->flags)) {
+   dd_idx = i;
+   i = -1;
+   break;
+   }
+   }
+   BUG_ON(i >= 0);
+   PRINTK("%s: stripe %llu STRIPE_OP_COMPUTE op_state: %lx block: 
%d\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   ops_state, dd_idx);
+   ptr[0] = page_address(sh->dev[dd_idx].page);
+
+   if (test_and_clear_bit(STRIPE_OP_COMPUTE_Prep, &ops_state)) {
+   memset(ptr[0], 0, STRIPE_SIZE);
+   set_bit(STRIPE_OP_COMPUTE_Parity, &ops_state);
+   }
+
+   if (test_and_clear_bit(STRIPE_OP_COMPUTE_Parity, &ops_state)) {
+   int count = 1;
+   for (i = disks ; i--; ) {
+   struct r5dev *dev = &sh->dev[i];
+   void *p;
+   if (i == dd_idx)
+   continue;
+

[PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU defs

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Also brings the iop3xx registers in line with the format of the iop13xx
register definitions.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 include/asm-arm/arch-iop32x/entry-macro.S |2 
 include/asm-arm/arch-iop32x/iop32x.h  |   14 +
 include/asm-arm/arch-iop33x/entry-macro.S |2 
 include/asm-arm/arch-iop33x/iop33x.h  |   38 ++-
 include/asm-arm/hardware/iop3xx.h |  347 +
 5 files changed, 188 insertions(+), 215 deletions(-)

diff --git a/include/asm-arm/arch-iop32x/entry-macro.S b/include/asm-arm/arch-iop32x/entry-macro.S
index 1500cbb..f357be4 100644
--- a/include/asm-arm/arch-iop32x/entry-macro.S
+++ b/include/asm-arm/arch-iop32x/entry-macro.S
@@ -13,7 +13,7 @@ #include 
.endm
 
.macro  get_irqnr_and_base, irqnr, irqstat, base, tmp
-   ldr \base, =IOP3XX_REG_ADDR(0x07D8)
+   ldr \base, =0xfeffe7d8
ldr \irqstat, [\base]   @ Read IINTSRC
cmp \irqstat, #0
clzne   \irqnr, \irqstat
diff --git a/include/asm-arm/arch-iop32x/iop32x.h b/include/asm-arm/arch-iop32x/iop32x.h
index 15b4d6a..904a14d 100644
--- a/include/asm-arm/arch-iop32x/iop32x.h
+++ b/include/asm-arm/arch-iop32x/iop32x.h
@@ -19,16 +19,18 @@ #define __IOP32X_H
  * Peripherals that are shared between the iop32x and iop33x but
  * located at different addresses.
  */
-#define IOP3XX_GPIO_REG(reg)   (IOP3XX_PERIPHERAL_VIRT_BASE + 0x07c0 + (reg))
-#define IOP3XX_TIMER_REG(reg)  (IOP3XX_PERIPHERAL_VIRT_BASE + 0x07e0 + (reg))
+#define IOP3XX_GPIO_REG32(reg)  (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\
+ 0x07c0 + (reg))
+#define IOP3XX_TIMER_REG32(reg) (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\
+ 0x07e0 + (reg))
 
 #include 
 
 /* Interrupt Controller  */
-#define IOP32X_INTCTL  (volatile u32 *)IOP3XX_REG_ADDR(0x07d0)
-#define IOP32X_INTSTR  (volatile u32 *)IOP3XX_REG_ADDR(0x07d4)
-#define IOP32X_IINTSRC (volatile u32 *)IOP3XX_REG_ADDR(0x07d8)
-#define IOP32X_FINTSRC (volatile u32 *)IOP3XX_REG_ADDR(0x07dc)
+#define IOP32X_INTCTL  IOP3XX_REG_ADDR32(0x07d0)
+#define IOP32X_INTSTR  IOP3XX_REG_ADDR32(0x07d4)
+#define IOP32X_IINTSRC IOP3XX_REG_ADDR32(0x07d8)
+#define IOP32X_FINTSRC IOP3XX_REG_ADDR32(0x07dc)
 
 
 #endif
diff --git a/include/asm-arm/arch-iop33x/entry-macro.S b/include/asm-arm/arch-iop33x/entry-macro.S
index 92b7917..eb207d2 100644
--- a/include/asm-arm/arch-iop33x/entry-macro.S
+++ b/include/asm-arm/arch-iop33x/entry-macro.S
@@ -13,7 +13,7 @@ #include 
.endm
 
.macro  get_irqnr_and_base, irqnr, irqstat, base, tmp
-   ldr \base, =IOP3XX_REG_ADDR(0x07C8)
+   ldr \base, =0xfeffe7c8
ldr \irqstat, [\base]   @ Read IINTVEC
cmp \irqstat, #0
ldreq   \irqstat, [\base]   @ erratum 63 workaround
diff --git a/include/asm-arm/arch-iop33x/iop33x.h b/include/asm-arm/arch-iop33x/iop33x.h
index 9b38fde..c171383 100644
--- a/include/asm-arm/arch-iop33x/iop33x.h
+++ b/include/asm-arm/arch-iop33x/iop33x.h
@@ -18,28 +18,30 @@ #define __IOP33X_H
  * Peripherals that are shared between the iop32x and iop33x but
  * located at different addresses.
  */
-#define IOP3XX_GPIO_REG(reg)   (IOP3XX_PERIPHERAL_VIRT_BASE + 0x1780 + (reg))
-#define IOP3XX_TIMER_REG(reg)  (IOP3XX_PERIPHERAL_VIRT_BASE + 0x07d0 + (reg))
+#define IOP3XX_GPIO_REG32(reg)  (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\
+ 0x1780 + (reg))
+#define IOP3XX_TIMER_REG32(reg) (volatile u32 *)(IOP3XX_PERIPHERAL_VIRT_BASE +\
+ 0x07d0 + (reg))
 
 #include 
 
 /* Interrupt Controller  */
-#define IOP33X_INTCTL0 (volatile u32 *)IOP3XX_REG_ADDR(0x0790)
-#define IOP33X_INTCTL1 (volatile u32 *)IOP3XX_REG_ADDR(0x0794)
-#define IOP33X_INTSTR0 (volatile u32 *)IOP3XX_REG_ADDR(0x0798)
-#define IOP33X_INTSTR1 (volatile u32 *)IOP3XX_REG_ADDR(0x079c)
-#define IOP33X_IINTSRC0(volatile u32 *)IOP3XX_REG_ADDR(0x07a0)
-#define IOP33X_IINTSRC1(volatile u32 *)IOP3XX_REG_ADDR(0x07a4)
-#define IOP33X_FINTSRC0(volatile u32 *)IOP3XX_REG_ADDR(0x07a8)
-#define IOP33X_FINTSRC1(volatile u32 *)IOP3XX_REG_ADDR(0x07ac)
-#define IOP33X_IPR0(volatile u32 *)IOP3XX_REG_ADDR(0x07b0)
-#define IOP33X_IPR1(volatile u32 *)IOP3XX_REG_ADDR(0x07b4)
-#define IOP33X_IPR2(volatile u32 *)IOP3XX_REG_ADDR(0x07b8)
-#define IOP33X_IPR3(volatile u32 *)IOP3XX_REG_ADDR(0x07bc)
-#define IOP33X_INTBASE (volatile u32 *)IOP3XX_REG_ADDR(0x07c0)
-#define IO

[PATCH 13/19] dmaengine: add support for dma xor zero sum operations

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |   15 
 drivers/dma/ioatdma.c |6 +
 include/linux/dmaengine.h |   56 +
 3 files changed, 77 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 33ad690..190c612 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -617,6 +617,18 @@ dma_cookie_t dma_async_do_xor_err(struct
 }
 
 /**
+ * dma_async_do_zero_sum_err - default function for dma devices that
+ * do not support xor zero sum
+ */
+dma_cookie_t dma_async_do_zero_sum_err(struct dma_chan *chan,
+   union dmaengine_addr src, unsigned int src_cnt,
+   unsigned int src_off, size_t len, u32 *result,
+   unsigned long flags)
+{
+   return -ENXIO;
+}
+
+/**
  * dma_async_do_memset_err - default function for dma devices that
  *  do not support memset
  */
@@ -649,6 +661,8 @@ EXPORT_SYMBOL_GPL(dma_async_memset_page)
 EXPORT_SYMBOL_GPL(dma_async_memset_dma);
 EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg);
 EXPORT_SYMBOL_GPL(dma_async_xor_dma_list_to_dma);
+EXPORT_SYMBOL_GPL(dma_async_zero_sum_pgs);
+EXPORT_SYMBOL_GPL(dma_async_zero_sum_dma_list);
 EXPORT_SYMBOL_GPL(dma_async_operation_complete);
 EXPORT_SYMBOL_GPL(dma_async_issue_pending);
 EXPORT_SYMBOL_GPL(dma_async_device_register);
@@ -656,6 +670,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_unreg
 EXPORT_SYMBOL_GPL(dma_chan_cleanup);
 EXPORT_SYMBOL_GPL(dma_async_do_memcpy_err);
 EXPORT_SYMBOL_GPL(dma_async_do_xor_err);
+EXPORT_SYMBOL_GPL(dma_async_do_zero_sum_err);
 EXPORT_SYMBOL_GPL(dma_async_do_memset_err);
 EXPORT_SYMBOL_GPL(dma_async_chan_init);
 EXPORT_SYMBOL_GPL(dma_async_map_page);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 231247c..4e90b02 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -637,6 +637,11 @@ extern dma_cookie_t dma_async_do_xor_err
union dmaengine_addr src, unsigned int src_cnt,
unsigned int src_off, size_t len, unsigned long flags);
 
+extern dma_cookie_t dma_async_do_zero_sum_err(struct dma_chan *chan,
+union dmaengine_addr src, unsigned int src_cnt,
+unsigned int src_off, size_t len, u32 *result, 
+   unsigned long flags);
+
 extern dma_cookie_t dma_async_do_memset_err(struct dma_chan *chan,
union dmaengine_addr dest, unsigned int dest_off,
int val, size_t size, unsigned long flags);
@@ -752,6 +757,7 @@ #endif
device->common.capabilities = DMA_MEMCPY;
device->common.device_do_dma_memcpy = do_ioat_dma_memcpy;
device->common.device_do_dma_xor = dma_async_do_xor_err;
+   device->common.device_do_dma_zero_sum = dma_async_do_zero_sum_err;
device->common.device_do_dma_memset = dma_async_do_memset_err;
device->common.map_page = ioat_map_page;
device->common.map_single = ioat_map_single;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 8d53b08..9fd6cbd 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -260,6 +260,7 @@ struct dma_chan_client_ref {
  * @device_issue_pending: push appended descriptors to hardware
  * @device_do_dma_memcpy: perform memcpy with a dma engine
  * @device_do_dma_xor: perform block xor with a dma engine
+ * @device_do_dma_zero_sum: perform block xor zero sum with a dma engine
  * @device_do_dma_memset: perform block fill with a dma engine
  */
 struct dma_device {
@@ -285,6 +286,10 @@ struct dma_device {
union dmaengine_addr src, unsigned int src_cnt,
unsigned int src_off, size_t len,
unsigned long flags);
+   dma_cookie_t (*device_do_dma_zero_sum)(struct dma_chan *chan,
+   union dmaengine_addr src, unsigned int src_cnt,
+   unsigned int src_off, size_t len, u32 *result,
+   unsigned long flags);
dma_cookie_t (*device_do_dma_memset)(struct dma_chan *chan,
union dmaengine_addr dest, unsigned int dest_off,
int value, size_t len, unsigned long flags);
@@ -601,6 +606,57 @@ static inline dma_cookie_t dma_async_xor
 }
 
 /**
+ * dma_async_zero_sum_pgs - offloaded xor zero sum from a list of pages
+ * @chan: DMA channel to offload zero sum to
+ * @src_pgs: array of source pages
+ * @src_cnt: number of source pages
+ * @src_off: offset in pages to xor from
+ * @len: length
+ * @result: set to 1 if sum is zero else 0
+ *
+ * Both @dest_page/@dest_off and @src_page/@src_off must be mappable to a bus
+ * address according to the DMA mapping API rules for streaming mappings.
+ * Both @dest_page/@dest_off and @src_page/@src_off must stay memory resident
+ * (kernel memory or locked user space pages)
+ */
+static inline dma_cookie_t dma_async_zero_sum_pgs(struct dma_chan *ch
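The declaration above is cut off in the archive.  As a rough usage sketch
(not from the patch): assuming the argument order follows the kernel-doc
parameters above and that the call returns a dma_cookie_t like the other
dma_async_* helpers, a parity check over a stripe's data pages could look
like this; 'chan', 'pages' and 'src_cnt' are caller-provided:

	u32 xor_is_zero = 0;
	dma_cookie_t cookie;

	cookie = dma_async_zero_sum_pgs(chan, pages, src_cnt, 0,
					STRIPE_SIZE, &xor_is_zero);
	if (cookie < 0)
		return cookie;	/* e.g. -ENXIO if the channel lacks zero sum */

	dma_async_issue_pending(chan);
	/* poll dma_async_operation_complete(chan, cookie, NULL, NULL) until it
	 * leaves DMA_IN_PROGRESS; 'xor_is_zero' is then 1 if parity is clean */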

[PATCH 15/19] dmaengine: raid5 dma client

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Adds a dmaengine client that is the hardware accelerated version of
raid5_do_soft_block_ops.  It utilizes the raid5 workqueue implementation to
operate on multiple stripes simultaneously.  See the iop-adma.c driver for
an example of a driver that enables hardware accelerated raid5.
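For orientation (this is not code from the patch), the per-stripe work
function is expected to have roughly the following shape; the function name
is hypothetical, while __arch_raid5_dma_next_channel, release_stripe and the
STRIPE_OP_* / _Dma / _Done convention come from the patch itself:

	static void raid5_dma_do_block_ops(void *stripe_head_ref)
	{
		struct stripe_head *sh = stripe_head_ref;
		struct dma_chan *chan;

		/* pick a hardware channel for this pass */
		chan = __arch_raid5_dma_next_channel(raid5_dma_client);

		spin_lock(&sh->lock);
		/* issue copy/xor/zero-sum descriptors for each requested
		 * STRIPE_OP_* and mark that operation _Dma (in flight);
		 * the completion callback later moves _Dma to _Done once
		 * the last outstanding cookie has completed */
		spin_unlock(&sh->lock);

		/* hand the stripe back to handle_stripe5() */
		set_bit(STRIPE_HANDLE, &sh->state);
		release_stripe(sh);
	}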

Changelog:
* mark operations as _Dma rather than _Done until all outstanding
operations have completed.  Once all operations have completed, update the
state and return the stripe to the handle list
* add a helper routine to retrieve the last used cookie
* use dma_async_zero_sum_dma_list for checking parity which optionally
allows parity check operations to not dirty the parity block in the cache
(if 'disks' is less than 'MAX_ADMA_XOR_SOURCES')
* remove dependencies on iop13xx
* take into account the fact that dma engines have a staging buffer so we
can perform one fewer block operation than software xor
* added __arch_raid5_dma_chan_request, __arch_raid5_dma_next_channel, and
__arch_raid5_dma_check_channel to make the driver architecture independent
* added channel switching capability for architectures that implement
different operations (i.e. copy & xor) on individual channels
* added initial support for "non-blocking" channel switching

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/Kconfig|9 +
 drivers/dma/Makefile   |1 
 drivers/dma/raid5-dma.c|  730 
 drivers/md/Kconfig |   11 +
 drivers/md/raid5.c |   66 
 include/linux/dmaengine.h  |5 
 include/linux/raid/raid5.h |   24 +
 7 files changed, 839 insertions(+), 7 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 30d021d..fced8c3 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -22,6 +22,15 @@ config NET_DMA
  Since this is the main user of the DMA engine, it should be enabled;
  say Y here.
 
+config RAID5_DMA
+tristate "MD raid5: block operations offload"
+   depends on INTEL_IOP_ADMA && MD_RAID456
+   default y
+   ---help---
+ This enables the use of DMA engines in the MD-RAID5 driver to
+ offload stripe cache operations, freeing CPU cycles.
+ say Y here
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index bdcfdbd..4e36d6e 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
+obj-$(CONFIG_RAID5_DMA) += raid5-dma.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
diff --git a/drivers/dma/raid5-dma.c b/drivers/dma/raid5-dma.c
new file mode 100644
index 000..04a1790
--- /dev/null
+++ b/drivers/dma/raid5-dma.c
@@ -0,0 +1,730 @@
+/*
+ * Offload raid5 operations to hardware RAID engines
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+#include 
+#include 
+
+static struct dma_client *raid5_dma_client;
+static atomic_t raid5_count;
+extern void release_stripe(struct stripe_head *sh);
+extern void __arch_raid5_dma_chan_request(struct dma_client *client);
+extern struct dma_chan *__arch_raid5_dma_next_channel(struct dma_client *client);
+
+#define MAX_HW_XOR_SRCS 16
+
+#ifndef STRIPE_SIZE
+#define STRIPE_SIZE PAGE_SIZE
+#endif
+
+#ifndef STRIPE_SECTORS
+#define STRIPE_SECTORS (STRIPE_SIZE>>9)
+#endif
+
+#ifndef r5_next_bio
+#define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
+#endif
+
+#define DMA_RAID5_DEBUG 0
+#define PRINTK(x...) ((void)(DMA_RAID5_DEBUG && printk(x)))
+
+/*
+ * Copy data between a page in the stripe cache, and one or more bion
+ * The page could align with the middle of the bio, or there could be
+ * several bion, each with several bio_vecs, which cover part of the page
+ * Multiple bion are linked together on bi_next.  There may be extras
+ * at the end of this list.  We ignore them.
+ */
+static dma_cookie_t dma_raid_copy_data

[PATCH 04/19] raid5: move compute block operations to a workqueue

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Enable handle_stripe5 to pass off compute block operations to
raid5_do_soft_block_ops, formerly handled by compute_block.

Here are a few notes about the new flags R5_ComputeReq and
STRIPE_OP_COMPUTE_Recover_pd:

Previously, when handle_stripe5 found a block that needed to be computed it
updated it in the same step.  Now that these operations are separated
(across multiple calls to handle_stripe5), an R5_ComputeReq flag is needed
to tell other parts of handle_stripe5 to treat the block under computation
as if it were up to date.  The order of events in the work queue ensures that the
block is indeed up to date before performing further operations.

STRIPE_OP_COMPUTE_Recover_pd was added to track when the parity block is being
computed due to a failed parity check.  This allows the code in
handle_stripe5 that produces requests for check_parity and compute_block
operations to be separate from the code that consumes the result.
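Put differently, every place that used to insist on R5_UPTODATE now also
accepts R5_ComputeReq.  A condensed illustration (the helper name below is
made up; the real hunks follow):

	/* a block under computation counts as usable data */
	static inline int r5dev_is_usable(struct r5dev *dev)
	{
		return test_bit(R5_UPTODATE, &dev->flags) ||
		       test_bit(R5_ComputeReq, &dev->flags);
	}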

Changelog:
* count blocks under computation as uptodate
* removed handle_compute_operations5.  All logic moved into handle_stripe5
so that we do not need to go through the initiation logic to end the
operation.
* since the write operations mark blocks !uptodate we hold off the code to
compute/read blocks until the write completes.
* new compute block operations and reads are held off while a compute is in
flight
* do not compute a block while a check parity operation is pending, and do
not start a new check parity operation while a compute operation is pending
* STRIPE_OP_COMPUTE_Recover_pd holds off the clearing of the STRIPE_OP_COMPUTE state.
This allows the transition to be handled by the check parity logic that
writes recomputed parity to disk.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  153 
 1 files changed, 107 insertions(+), 46 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 24ed4d8..0c39203 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1300,7 +1300,8 @@ static int handle_write_operations5(stru
}
} else {
/* enter stage 1 of read modify write operation */
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+   BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+   test_bit(R5_ComputeReq, &sh->dev[pd_idx].flags)));
 
set_bit(STRIPE_OP_RMW, &sh->state);
set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
@@ -1314,7 +1315,8 @@ static int handle_write_operations5(stru
 * so we distinguish these blocks by the RMWReq bit
 */
if (dev->towrite &&
-   test_bit(R5_UPTODATE, &dev->flags)) {
+   (test_bit(R5_UPTODATE, &dev->flags) ||
+   test_bit(R5_ComputeReq, &dev->flags))) {
set_bit(R5_RMWReq, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
clear_bit(R5_UPTODATE, &dev->flags);
@@ -1748,7 +1750,7 @@ static void handle_stripe5(struct stripe
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int non_overwrite=0, write_complete=0;
+   int compute=0, non_overwrite=0, write_complete=0;
int failed_num=0;
struct r5dev *dev;
 
@@ -1799,7 +1801,7 @@ static void handle_stripe5(struct stripe
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
-
+   if (test_bit(R5_ComputeReq, &dev->flags)) BUG_ON(++compute > 1);

if (dev->toread) to_read++;
if (dev->towrite) {
@@ -1955,40 +1957,83 @@ static void handle_stripe5(struct stripe
 * parity, or to satisfy requests
 * or to load a block that is being partially written.
 */
-   if (to_read || non_overwrite || (syncing && (uptodate < disks)) || expanding) {
-   for (i=disks; i--;) {
-   dev = &sh->dev[i];
-   if (!test_bit(R5_LOCKED, &dev->flags) && !test_bit(R5_UPTODATE, &dev->flags) &&
-   (dev->toread ||
-(dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
-syncing ||
-expanding ||
-(failed && (sh->dev[failed_num].toread ||
-(sh->dev[failed_num].towrite && !test_bit(R5_OVERWRITE

[PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Adds the platform device definitions and the architecture specific support
routines (i.e. register initialization and descriptor formats) for the
iop-adma driver.

Changelog:
* add support for > 1k zero sum buffer sizes
* added dma/aau platform devices to iq80321 and iq80332 setup
* fixed the calculation in iop_desc_is_aligned
* support xor buffer sizes larger than 16MB
* fix places where software descriptors are assumed to be contiguous, only
hardware descriptors are contiguous
* iop32x does not support hardware zero sum, add software emulation support
for up to a PAGE_SIZE buffer size
* added raid5 dma driver support functions

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 arch/arm/mach-iop32x/iq80321.c |  141 +
 arch/arm/mach-iop33x/iq80331.c |9 
 arch/arm/mach-iop33x/iq80332.c |8 
 arch/arm/mach-iop33x/setup.c   |  132 +
 include/asm-arm/arch-iop32x/adma.h |5 
 include/asm-arm/arch-iop33x/adma.h |5 
 include/asm-arm/hardware/iop3xx-adma.h |  901 
 7 files changed, 1201 insertions(+), 0 deletions(-)

diff --git a/arch/arm/mach-iop32x/iq80321.c b/arch/arm/mach-iop32x/iq80321.c
index cdd2265..79d6514 100644
--- a/arch/arm/mach-iop32x/iq80321.c
+++ b/arch/arm/mach-iop32x/iq80321.c
@@ -33,6 +33,9 @@ #include 
 #include 
 #include 
 #include 
+#ifdef CONFIG_DMA_ENGINE
+#include 
+#endif
 
 /*
  * IQ80321 timer tick configuration.
@@ -170,12 +173,150 @@ static struct platform_device iq80321_se
.resource   = &iq80321_uart_resource,
 };
 
+#ifdef CONFIG_DMA_ENGINE
+/* AAU and DMA Channels */
+static struct resource iop3xx_dma_0_resources[] = {
+   [0] = {
+   .start = (unsigned long) IOP3XX_DMA_CCR(0),
+   .end = ((unsigned long) IOP3XX_DMA_DCR(0)) + 4,
+   .flags = IORESOURCE_MEM,
+   },
+   [1] = {
+   .start = IRQ_IOP32X_DMA0_EOT,
+   .end = IRQ_IOP32X_DMA0_EOT,
+   .flags = IORESOURCE_IRQ
+   },
+   [2] = {
+   .start = IRQ_IOP32X_DMA0_EOC,
+   .end = IRQ_IOP32X_DMA0_EOC,
+   .flags = IORESOURCE_IRQ
+   },
+   [3] = {
+   .start = IRQ_IOP32X_DMA0_ERR,
+   .end = IRQ_IOP32X_DMA0_ERR,
+   .flags = IORESOURCE_IRQ
+   }
+};
+
+static struct resource iop3xx_dma_1_resources[] = {
+   [0] = {
+   .start = (unsigned long) IOP3XX_DMA_CCR(1),
+   .end = ((unsigned long) IOP3XX_DMA_DCR(1)) + 4,
+   .flags = IORESOURCE_MEM,
+   },
+   [1] = {
+   .start = IRQ_IOP32X_DMA1_EOT,
+   .end = IRQ_IOP32X_DMA1_EOT,
+   .flags = IORESOURCE_IRQ
+   },
+   [2] = {
+   .start = IRQ_IOP32X_DMA1_EOC,
+   .end = IRQ_IOP32X_DMA1_EOC,
+   .flags = IORESOURCE_IRQ
+   },
+   [3] = {
+   .start = IRQ_IOP32X_DMA1_ERR,
+   .end = IRQ_IOP32X_DMA1_ERR,
+   .flags = IORESOURCE_IRQ
+   }
+};
+
+
+static struct resource iop3xx_aau_resources[] = {
+   [0] = {
+   .start = (unsigned long) IOP3XX_AAU_ACR,
+   .end = (unsigned long) IOP3XX_AAU_SAR_EDCR(32),
+   .flags = IORESOURCE_MEM,
+   },
+   [1] = {
+   .start = IRQ_IOP32X_AA_EOT,
+   .end = IRQ_IOP32X_AA_EOT,
+   .flags = IORESOURCE_IRQ
+   },
+   [2] = {
+   .start = IRQ_IOP32X_AA_EOC,
+   .end = IRQ_IOP32X_AA_EOC,
+   .flags = IORESOURCE_IRQ
+   },
+   [3] = {
+   .start = IRQ_IOP32X_AA_ERR,
+   .end = IRQ_IOP32X_AA_ERR,
+   .flags = IORESOURCE_IRQ
+   }
+};
+
+static u64 iop3xx_adma_dmamask = DMA_32BIT_MASK;
+
+static struct iop_adma_platform_data iop3xx_dma_0_data = {
+   .hw_id = IOP3XX_DMA0_ID,
+   .capabilities = DMA_MEMCPY | DMA_MEMCPY_CRC32C,
+   .pool_size = PAGE_SIZE,
+};
+
+static struct iop_adma_platform_data iop3xx_dma_1_data = {
+   .hw_id = IOP3XX_DMA1_ID,
+   .capabilities = DMA_MEMCPY | DMA_MEMCPY_CRC32C,
+   .pool_size = PAGE_SIZE,
+};
+
+static struct iop_adma_platform_data iop3xx_aau_data = {
+   .hw_id = IOP3XX_AAU_ID,
+   .capabilities = DMA_XOR | DMA_ZERO_SUM | DMA_MEMSET,
+   .pool_size = 3 * PAGE_SIZE,
+};
+
+struct platform_device iop3xx_dma_0_channel = {
+   .name = "IOP-ADMA",
+   .id = 0,
+   .num_resources = 4,
+   .resource = iop3xx_dma_0_resources,
+   .dev = {
+   .dma_mask = &iop3xx_adma_dmamask,
+   .coherent_dma_mask = DMA_64BIT_MASK,
+   .platform_data = (void *) &iop3xx_dma_0_data,
+   },
+};
+
+struct platform_device iop3xx_dma_1_channel = {
+   .name = "IOP-ADMA",
+   .id = 1,
+   .num_resources = 4,
+   .

[PATCH 06/19] raid5: move the reconstruct write expansion operation to a workqueue

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Enable handle_stripe5 to use the reconstruct write operations capability
for expansion operations.  

However this does not move the copy operation associated with an expand to
the workqueue.  First, it was difficult to find a clean way to pass the
parameters of this operation to the queue.  Second, this section of code is
a good candidate for performing the copies with inline calls to the dma
routines.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   36 +++-
 1 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1a8dfd2..a07b52b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2053,6 +2053,7 @@ #endif
 * completed
 */
if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+   !test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state) &&
test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
clear_bit(STRIPE_OP_RCW, &sh->state);
clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
@@ -2226,6 +2227,7 @@ #endif
}
}
}
+
if (test_bit(STRIPE_OP_COMPUTE_Done, &sh->ops.state) &&
test_bit(STRIPE_OP_COMPUTE_Recover_pd, &sh->ops.state)) {
clear_bit(STRIPE_OP_COMPUTE, &sh->state);
@@ -2282,18 +2284,28 @@ #endif
}
}
 
-   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
+   /* Finish 'rcw' operations initiated by the expansion
+* process
+*/
+   if (test_bit(STRIPE_OP_RCW, &sh->state) &&
+   test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state) &&
+   test_bit(STRIPE_OP_RCW_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_RCW, &sh->state);
+   clear_bit(STRIPE_OP_RCW_Done, &sh->ops.state);
+   clear_bit(STRIPE_OP_RCW_Expand, &sh->ops.state);
+   clear_bit(STRIPE_EXPANDING, &sh->state);
+   for (i= conf->raid_disks; i--;)
+   set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   }
+
+   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+   !test_bit(STRIPE_OP_RCW, &sh->state)) {
/* Need to write out all blocks after computing parity */
sh->disks = conf->raid_disks;
sh->pd_idx = stripe_to_pdidx(sh->sector, conf, conf->raid_disks);
-   compute_parity5(sh, RECONSTRUCT_WRITE);
-   for (i= conf->raid_disks; i--;) {
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   locked++;
-   set_bit(R5_Wantwrite, &sh->dev[i].flags);
-   }
-   clear_bit(STRIPE_EXPANDING, &sh->state);
-   } else if (expanded) {
+   set_bit(STRIPE_OP_RCW_Expand, &sh->ops.state);
+   locked += handle_write_operations5(sh, 0);
+   } else if (expanded && !test_bit(STRIPE_OP_RCW, &sh->state)) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
@@ -2327,9 +2339,15 @@ #endif
release_stripe(sh2);
continue;
}
+   /* to do: perform these operations with a dma engine
+* inline (rather than pushing to the workqueue)
+*/
+   /*#ifdef CONFIG_RAID5_DMA*/
+   /*#else*/
memcpy(page_address(sh2->dev[dd_idx].page),
   page_address(sh->dev[i].page),
   STRIPE_SIZE);
+   /*#endif*/
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
for (j=0; j<conf->raid_disks; j++)
-


[PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not support memcpy

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Default virtual function that returns an error if the user attempts a
memcpy operation.  An XOR engine is an example of a DMA engine that does
not support memcpy.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c |   13 +
 1 files changed, 13 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index fe62237..33ad690 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -593,6 +593,18 @@ void dma_async_device_unregister(struct 
 }
 
 /**
+ * dma_async_do_memcpy_err - default function for dma devices that
+ * do not support memcpy
+ */
+dma_cookie_t dma_async_do_memcpy_err(struct dma_chan *chan,
+   union dmaengine_addr dest, unsigned int dest_off,
+   union dmaengine_addr src, unsigned int src_off,
+size_t len, unsigned long flags)
+{
+   return -ENXIO;
+}
+
+/**
  * dma_async_do_xor_err - default function for dma devices that
  * do not support xor
  */
@@ -642,6 +654,7 @@ EXPORT_SYMBOL_GPL(dma_async_issue_pendin
 EXPORT_SYMBOL_GPL(dma_async_device_register);
 EXPORT_SYMBOL_GPL(dma_async_device_unregister);
 EXPORT_SYMBOL_GPL(dma_chan_cleanup);
+EXPORT_SYMBOL_GPL(dma_async_do_memcpy_err);
 EXPORT_SYMBOL_GPL(dma_async_do_xor_err);
 EXPORT_SYMBOL_GPL(dma_async_do_memset_err);
 EXPORT_SYMBOL_GPL(dma_async_chan_init);
-


[PATCH 09/19] dmaengine: reduce backend address permutations

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Change the backend dma driver API to accept a 'union dmaengine_addr'.  The
intent is to be able to support a wide range of frontend address type
permutations without needing an equal number of function type permutations
on the backend.
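For reference, a reconstruction of the union from the members this excerpt
actually uses (.buf, .pg, .dma) together with the DMA_SRC_*/DMA_DEST_* flags
that tell the backend which member is valid; the authoritative definition
lives in the dmaengine.h hunk of this patch and very likely also carries
list variants (page array, dma address list) for multi-source xor:

	union dmaengine_addr {
		void *buf;		/* kernel virtual address (DMA_*_BUF)  */
		struct page *pg;	/* page + offset          (DMA_*_PAGE) */
		dma_addr_t dma;		/* pre-mapped bus address (DMA_*_DMA)  */
	};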

Changelog:
* make the dmaengine api EXPORT_SYMBOL_GPL
* zero sum support should be standalone, not integrated into xor

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |   15 ++-
 drivers/dma/ioatdma.c |  186 +--
 include/linux/dmaengine.h |  193 +++--
 3 files changed, 249 insertions(+), 145 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index e10f19d..9b02afa 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -593,12 +593,13 @@ void dma_async_device_unregister(struct 
 }
 
 /**
- * dma_async_xor_pgs_to_pg_err - default function for dma devices that
+ * dma_async_do_xor_err - default function for dma devices that
  * do not support xor
  */
-dma_cookie_t dma_async_xor_pgs_to_pg_err(struct dma_chan *chan,
-   struct page *dest_pg, unsigned int dest_off, struct page *src_pgs,
-   unsigned int src_cnt, unsigned int src_off, size_t len)
+dma_cookie_t dma_async_do_xor_err(struct dma_chan *chan,
+   union dmaengine_addr dest, unsigned int dest_off,
+   union dmaengine_addr src, unsigned int src_cnt,
+   unsigned int src_off, size_t len, unsigned long flags)
 {
return -ENXIO;
 }
@@ -617,11 +618,15 @@ EXPORT_SYMBOL_GPL(dma_async_client_chan_
 EXPORT_SYMBOL_GPL(dma_async_memcpy_buf_to_buf);
 EXPORT_SYMBOL_GPL(dma_async_memcpy_buf_to_pg);
 EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to_pg);
+EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_dma);
+EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to_dma);
+EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_pg);
 EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg);
+EXPORT_SYMBOL_GPL(dma_async_xor_dma_list_to_dma);
 EXPORT_SYMBOL_GPL(dma_async_operation_complete);
 EXPORT_SYMBOL_GPL(dma_async_issue_pending);
 EXPORT_SYMBOL_GPL(dma_async_device_register);
 EXPORT_SYMBOL_GPL(dma_async_device_unregister);
 EXPORT_SYMBOL_GPL(dma_chan_cleanup);
-EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg_err);
+EXPORT_SYMBOL_GPL(dma_async_do_xor_err);
 EXPORT_SYMBOL_GPL(dma_async_chan_init);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 415de03..dd5b9f0 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -213,20 +213,25 @@ static void ioat_dma_free_chan_resources
 
 /**
  * do_ioat_dma_memcpy - actual function that initiates a IOAT DMA transaction
- * @ioat_chan: IOAT DMA channel handle
- * @dest: DMA destination address
- * @src: DMA source address
+ * @chan: IOAT DMA channel handle
+ * @dest: DMAENGINE destination address
+ * @dest_off: Page offset
+ * @src: DMAENGINE source address
+ * @src_off: Page offset
  * @len: transaction length in bytes
  */
 
-static dma_cookie_t do_ioat_dma_memcpy(struct ioat_dma_chan *ioat_chan,
-   dma_addr_t dest,
-   dma_addr_t src,
-   size_t len)
+static dma_cookie_t do_ioat_dma_memcpy(struct dma_chan *dma_chan,
+   union dmaengine_addr dest,
+   unsigned int dest_off,
+   union dmaengine_addr src,
+   unsigned int src_off,
+   size_t len,
+   unsigned long flags)
 {
struct ioat_desc_sw *first;
struct ioat_desc_sw *prev;
-   struct ioat_desc_sw *new;
+   struct ioat_desc_sw *new = 0;
dma_cookie_t cookie;
LIST_HEAD(new_chain);
u32 copy;
@@ -234,16 +239,47 @@ static dma_cookie_t do_ioat_dma_memcpy(s
dma_addr_t orig_src, orig_dst;
unsigned int desc_count = 0;
unsigned int append = 0;
+   struct ioat_dma_chan *ioat_chan = to_ioat_chan(dma_chan);
 
-   if (!ioat_chan || !dest || !src)
+   if (!dma_chan || !dest.dma || !src.dma)
return -EFAULT;
 
if (!len)
return ioat_chan->common.cookie;
 
+   switch (flags & (DMA_SRC_BUF | DMA_SRC_PAGE | DMA_SRC_DMA)) {
+   case DMA_SRC_BUF:
+   src.dma = pci_map_single(ioat_chan->device->pdev,
+   src.buf, len, PCI_DMA_TODEVICE);
+   break;
+   case DMA_SRC_PAGE:
+   src.dma = pci_map_page(ioat_chan->device->pdev,
+   src.pg, src_off, len, PCI_DMA_TODEVICE);
+   break;
+   case DMA_SRC_DMA:
+   break;
+   default:
+   return -EFAULT;
+   }
+
+   switch (flags & (DMA_DEST_BUF | DMA_DEST_PAGE | DMA_DEST_DMA)) {
+ 

[PATCH 14/19] dmaengine: add dma_sync_wait

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

dma_sync_wait is a common routine to live wait for a dma operation to
complete.
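A typical call site, sketched; dma_async_memcpy_pg_to_pg and DMA_SUCCESS
are assumed from the existing dmaengine API rather than added by this patch:

	dma_cookie_t cookie;

	cookie = dma_async_memcpy_pg_to_pg(chan, dest_pg, dest_off,
					   src_pg, src_off, len);
	if (cookie < 0)
		return cookie;			/* submission failed */

	if (dma_sync_wait(chan, cookie) != DMA_SUCCESS)
		return -EIO;			/* engine reported an error */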

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 include/linux/dmaengine.h |   12 
 1 files changed, 12 insertions(+), 0 deletions(-)

diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index 9fd6cbd..0a70c9e 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -750,6 +750,18 @@ static inline void dma_async_unmap_singl
chan->device->unmap_single(chan, handle, size, direction);
 }
 
+static inline enum dma_status dma_sync_wait(struct dma_chan *chan,
+   dma_cookie_t cookie)
+{
+   enum dma_status status;
+   dma_async_issue_pending(chan);
+   do {
+   status = dma_async_operation_complete(chan, cookie, NULL, NULL);
+   } while (status == DMA_IN_PROGRESS);
+
+   return status;
+}
+
 /* --- DMA device --- */
 
 int dma_async_device_register(struct dma_device *device);
-


[PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to clients

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Allow a client to ensure that the dma channel it has selected can
dma to the specified buffer or page address.  Also allow the client to
pre-map address ranges to be passed to the operations API.
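A hedged usage sketch: pre-map a stripe page once through the selected
channel, hand the resulting bus address to the operations API (DMA_*_DMA
addressing), then unmap when the operation has completed.  The direction
constant and the stripe fields are illustrative:

	dma_addr_t dma_dest;

	dma_dest = dma_async_map_page(chan, sh->dev[i].page, 0,
				      STRIPE_SIZE, DMA_FROM_DEVICE);

	/* ... issue one or more operations that target dma_dest ... */

	dma_async_unmap_page(chan, dma_dest, STRIPE_SIZE, DMA_FROM_DEVICE);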

Changelog:
* make the dmaengine api EXPORT_SYMBOL_GPL
* zero sum support should be standalone, not integrated into xor

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |4 
 drivers/dma/ioatdma.c |   35 +++
 include/linux/dmaengine.h |   34 ++
 3 files changed, 73 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 9b02afa..e78ce89 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -630,3 +630,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_unreg
 EXPORT_SYMBOL_GPL(dma_chan_cleanup);
 EXPORT_SYMBOL_GPL(dma_async_do_xor_err);
 EXPORT_SYMBOL_GPL(dma_async_chan_init);
+EXPORT_SYMBOL_GPL(dma_async_map_page);
+EXPORT_SYMBOL_GPL(dma_async_map_single);
+EXPORT_SYMBOL_GPL(dma_async_unmap_page);
+EXPORT_SYMBOL_GPL(dma_async_unmap_single);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index dd5b9f0..0159d14 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -637,6 +637,37 @@ extern dma_cookie_t dma_async_do_xor_err
union dmaengine_addr src, unsigned int src_cnt,
unsigned int src_off, size_t len, unsigned long flags);
 
+static dma_addr_t ioat_map_page(struct dma_chan *chan, struct page *page,
+   unsigned long offset, size_t size,
+   int direction)
+{
+   struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+   return pci_map_page(ioat_chan->device->pdev, page, offset, size,
+   direction);
+}
+
+static dma_addr_t ioat_map_single(struct dma_chan *chan, void *cpu_addr,
+   size_t size, int direction)
+{
+   struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+   return pci_map_single(ioat_chan->device->pdev, cpu_addr, size,
+   direction);
+}
+
+static void ioat_unmap_page(struct dma_chan *chan, dma_addr_t handle,
+   size_t size, int direction)
+{
+   struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+   pci_unmap_page(ioat_chan->device->pdev, handle, size, direction);
+}
+
+static void ioat_unmap_single(struct dma_chan *chan, dma_addr_t handle,
+   size_t size, int direction)
+{
+   struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan);
+   pci_unmap_single(ioat_chan->device->pdev, handle, size, direction);
+}
+
 static int __devinit ioat_probe(struct pci_dev *pdev,
 const struct pci_device_id *ent)
 {
@@ -717,6 +748,10 @@ #endif
device->common.capabilities = DMA_MEMCPY;
device->common.device_do_dma_memcpy = do_ioat_dma_memcpy;
device->common.device_do_dma_xor = dma_async_do_xor_err;
+   device->common.map_page = ioat_map_page;
+   device->common.map_single = ioat_map_single;
+   device->common.unmap_page = ioat_unmap_page;
+   device->common.unmap_single = ioat_unmap_single;
printk(KERN_INFO "Intel(R) I/OAT DMA Engine found, %d channels\n",
device->common.chancnt);
 
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index df055cc..cb4cfcf 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -287,6 +287,15 @@ struct dma_device {
enum dma_status (*device_operation_complete)(struct dma_chan *chan,
dma_cookie_t cookie, dma_cookie_t *last,
dma_cookie_t *used);
+   dma_addr_t (*map_page)(struct dma_chan *chan, struct page *page,
+   unsigned long offset, size_t size,
+   int direction);
+   dma_addr_t (*map_single)(struct dma_chan *chan, void *cpu_addr,
+   size_t size, int direction);
+   void (*unmap_page)(struct dma_chan *chan, dma_addr_t handle,
+   size_t size, int direction);
+   void (*unmap_single)(struct dma_chan *chan, dma_addr_t handle,
+   size_t size, int direction);
void (*device_issue_pending)(struct dma_chan *chan);
 };
 
@@ -592,6 +601,31 @@ static inline enum dma_status dma_async_
return DMA_IN_PROGRESS;
 }
 
+static inline dma_addr_t dma_async_map_page(struct dma_chan *chan,
+   struct page *page, unsigned long offset, size_t size,
+   int direction)
+{
+   return chan->device->map_page(chan, page, offset, size, direction);
+}
+
+static inline dma_addr_t dma_async_map_single(struct dma_chan *chan,
+   void *cpu_addr, size_t si

[PATCH 11/19] dmaengine: add memset as an asynchronous dma operation

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Changelog:
* make the dmaengine api EXPORT_SYMBOL_GPL
* zero sum support should be standalone, not integrated into xor

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |   15 ++
 drivers/dma/ioatdma.c |5 +++
 include/linux/dmaengine.h |   68 +
 3 files changed, 88 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index e78ce89..fe62237 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -604,6 +604,17 @@ dma_cookie_t dma_async_do_xor_err(struct
return -ENXIO;
 }
 
+/**
+ * dma_async_do_memset_err - default function for dma devices that
+ *  do not support memset
+ */
+dma_cookie_t dma_async_do_memset_err(struct dma_chan *chan,
+union dmaengine_addr dest, unsigned int dest_off,
+int val, size_t len, unsigned long flags)
+{
+return -ENXIO;
+}
+
 static int __init dma_bus_init(void)
 {
mutex_init(&dma_list_mutex);
@@ -621,6 +632,9 @@ EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to
 EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_dma);
 EXPORT_SYMBOL_GPL(dma_async_memcpy_pg_to_dma);
 EXPORT_SYMBOL_GPL(dma_async_memcpy_dma_to_pg);
+EXPORT_SYMBOL_GPL(dma_async_memset_buf);
+EXPORT_SYMBOL_GPL(dma_async_memset_page);
+EXPORT_SYMBOL_GPL(dma_async_memset_dma);
 EXPORT_SYMBOL_GPL(dma_async_xor_pgs_to_pg);
 EXPORT_SYMBOL_GPL(dma_async_xor_dma_list_to_dma);
 EXPORT_SYMBOL_GPL(dma_async_operation_complete);
@@ -629,6 +643,7 @@ EXPORT_SYMBOL_GPL(dma_async_device_regis
 EXPORT_SYMBOL_GPL(dma_async_device_unregister);
 EXPORT_SYMBOL_GPL(dma_chan_cleanup);
 EXPORT_SYMBOL_GPL(dma_async_do_xor_err);
+EXPORT_SYMBOL_GPL(dma_async_do_memset_err);
 EXPORT_SYMBOL_GPL(dma_async_chan_init);
 EXPORT_SYMBOL_GPL(dma_async_map_page);
 EXPORT_SYMBOL_GPL(dma_async_map_single);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 0159d14..231247c 100644
--- a/drivers/dma/ioatdma.c
+++ b/drivers/dma/ioatdma.c
@@ -637,6 +637,10 @@ extern dma_cookie_t dma_async_do_xor_err
union dmaengine_addr src, unsigned int src_cnt,
unsigned int src_off, size_t len, unsigned long flags);
 
+extern dma_cookie_t dma_async_do_memset_err(struct dma_chan *chan,
+   union dmaengine_addr dest, unsigned int dest_off,
+   int val, size_t size, unsigned long flags);
+
 static dma_addr_t ioat_map_page(struct dma_chan *chan, struct page *page,
unsigned long offset, size_t size,
int direction)
@@ -748,6 +752,7 @@ #endif
device->common.capabilities = DMA_MEMCPY;
device->common.device_do_dma_memcpy = do_ioat_dma_memcpy;
device->common.device_do_dma_xor = dma_async_do_xor_err;
+   device->common.device_do_dma_memset = dma_async_do_memset_err;
device->common.map_page = ioat_map_page;
device->common.map_single = ioat_map_single;
device->common.unmap_page = ioat_unmap_page;
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index cb4cfcf..8d53b08 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -260,6 +260,7 @@ struct dma_chan_client_ref {
  * @device_issue_pending: push appended descriptors to hardware
  * @device_do_dma_memcpy: perform memcpy with a dma engine
  * @device_do_dma_xor: perform block xor with a dma engine
+ * @device_do_dma_memset: perform block fill with a dma engine
  */
 struct dma_device {
 
@@ -284,6 +285,9 @@ struct dma_device {
union dmaengine_addr src, unsigned int src_cnt,
unsigned int src_off, size_t len,
unsigned long flags);
+   dma_cookie_t (*device_do_dma_memset)(struct dma_chan *chan,
+   union dmaengine_addr dest, unsigned int dest_off,
+   int value, size_t len, unsigned long flags);
enum dma_status (*device_operation_complete)(struct dma_chan *chan,
dma_cookie_t cookie, dma_cookie_t *last,
dma_cookie_t *used);
@@ -478,6 +482,70 @@ static inline dma_cookie_t dma_async_mem
 }
 
 /**
+ * dma_async_memset_buf - offloaded memset
+ * @chan: DMA channel to offload memset to
+ * @buf: destination buffer
+ * @val: value to initialize the buffer
+ * @len: length
+ */
+static inline dma_cookie_t dma_async_memset_buf(struct dma_chan *chan,
+   void *buf, int val, size_t len)
+{
+   unsigned long flags = DMA_DEST_BUF;
+   union dmaengine_addr dest_addr = { .buf = buf };
+   int cpu = get_cpu();
+   per_cpu_ptr(chan->local, cpu)->bytes_transferred += len;
+   per_cpu_ptr(chan->local, cpu)->memcpy_count++;
+   put_cpu();
+
+   return chan->device->device_do_dma_memset(chan, dest_addr, 0, val,
+   len, flags);

[PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID engines

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor,
pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy
operations.

Changelog:
* fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few
slots to be requested eventually leading to data corruption
* enabled the slot allocation routine to attempt to free slots before
returning -ENOMEM
* switched the cleanup routine to solely use the software chain and the
status register to determine if a descriptor is complete.  This is
necessary to support other IOP engines that do not have status writeback
capability
* make the driver iop generic
* modified the allocation routines to understand allocating a group of
slots for a single operation
* added a null xor initialization operation for the xor only channel on
iop3xx
* add software emulation of zero sum on iop32x
* support xor operations on buffers larger than the hardware maximum
* add architecture specific raid5-dma support functions

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/Kconfig |   27 +
 drivers/dma/Makefile|1 
 drivers/dma/iop-adma.c  | 1501 +++
 include/asm-arm/hardware/iop_adma.h |   98 ++
 4 files changed, 1624 insertions(+), 3 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index fced8c3..3556143 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -7,8 +7,8 @@ menu "DMA Engine support"
 config DMA_ENGINE
bool "Support for DMA engines"
---help---
- DMA engines offload copy operations from the CPU to dedicated
- hardware, allowing the copies to happen asynchronously.
+  DMA engines offload block memory operations from the CPU to dedicated
+  hardware, allowing the operations to happen asynchronously.
 
 comment "DMA Clients"
 
@@ -28,9 +28,19 @@ config RAID5_DMA
default y
---help---
  This enables the use of DMA engines in the MD-RAID5 driver to
- offload stripe cache operations, freeing CPU cycles.
+ offload stripe cache operations (i.e. xor, memcpy), freeing CPU 
cycles.
  say Y here
 
+config RAID5_DMA_WAIT_VIA_REQUEUE
+   bool "raid5-dma: Non-blocking channel switching"
+   depends on RAID5_DMA_ARCH_NEEDS_CHAN_SWITCH && RAID5_DMA && BROKEN
+   default n
+   ---help---
+ This enables the raid5-dma driver to continue to operate on incoming
+ stripes when it determines that the current stripe must wait for a
+ hardware channel to finish operations.  This code is a work in
+ progress, only say Y to debug the implementation, otherwise say N.
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
@@ -40,4 +50,15 @@ config INTEL_IOATDMA
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
 
+config INTEL_IOP_ADMA
+tristate "Intel IOP ADMA support"
+depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX)
+   select RAID5_DMA_ARCH_NEEDS_CHAN_SWITCH if (ARCH_IOP32X || ARCH_IOP33X)
+default m
+---help---
+  Enable support for the Intel(R) IOP Series RAID engines.
+
+config RAID5_DMA_ARCH_NEEDS_CHAN_SWITCH
+   bool
+
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 4e36d6e..233eae7 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -2,3 +2,4 @@ obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_RAID5_DMA) += raid5-dma.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
new file mode 100644
index 000..51f1c54
--- /dev/null
+++ b/drivers/dma/iop-adma.c
@@ -0,0 +1,1501 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports the asynchrounous DMA copy and RAID engines available
+ * on the Intel Xscale(R) family of 

[PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Currently the iop3xx platform support code assumes that RedBoot is the
bootloader and has already initialized the ATU.  Linux should handle this
initialization for three reasons:

1/ The memory map that RedBoot sets up is not optimal (page_to_dma and
virt_to_phys return different addresses).  The effect of this is that using
the dma mapping API for the internal bus dma units generates pci bus
addresses that are incorrect for the internal bus.

2/ Not all iop platforms use RedBoot

3/ If the ATU is already initialized it indicates that the iop is an add-in
card in another host, it does not own the PCI bus, and should not be
re-initialized.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 arch/arm/mach-iop32x/Kconfig |8 ++
 arch/arm/mach-iop32x/ep80219.c   |4 +
 arch/arm/mach-iop32x/iq31244.c   |5 +
 arch/arm/mach-iop32x/iq80321.c   |5 +
 arch/arm/mach-iop33x/Kconfig |8 ++
 arch/arm/mach-iop33x/iq80331.c   |5 +
 arch/arm/mach-iop33x/iq80332.c   |4 +
 arch/arm/plat-iop/pci.c  |  140 ++
 include/asm-arm/arch-iop32x/iop32x.h |9 ++
 include/asm-arm/arch-iop32x/memory.h |4 -
 include/asm-arm/arch-iop33x/iop33x.h |   10 ++
 include/asm-arm/arch-iop33x/memory.h |4 -
 include/asm-arm/hardware/iop3xx.h|   20 -
 13 files changed, 214 insertions(+), 12 deletions(-)

diff --git a/arch/arm/mach-iop32x/Kconfig b/arch/arm/mach-iop32x/Kconfig
index 05549a5..b2788e3 100644
--- a/arch/arm/mach-iop32x/Kconfig
+++ b/arch/arm/mach-iop32x/Kconfig
@@ -22,6 +22,14 @@ config ARCH_IQ80321
  Say Y here if you want to run your kernel on the Intel IQ80321
  evaluation kit for the IOP321 processor.
 
+config IOP3XX_ATU
+bool "Enable the PCI Controller"
+default y
+help
+  Say Y here if you want the IOP to initialize its PCI Controller.
+  Say N if the IOP is an add in card, the host system owns the PCI
+  bus in this case.
+
 endmenu
 
 endif
diff --git a/arch/arm/mach-iop32x/ep80219.c b/arch/arm/mach-iop32x/ep80219.c
index f616d3e..1a5c586 100644
--- a/arch/arm/mach-iop32x/ep80219.c
+++ b/arch/arm/mach-iop32x/ep80219.c
@@ -100,7 +100,7 @@ ep80219_pci_map_irq(struct pci_dev *dev,
 
 static struct hw_pci ep80219_pci __initdata = {
.swizzle= pci_std_swizzle,
-   .nr_controllers = 1,
+   .nr_controllers = 0,
.setup  = iop3xx_pci_setup,
.preinit= iop3xx_pci_preinit,
.scan   = iop3xx_pci_scan_bus,
@@ -109,6 +109,8 @@ static struct hw_pci ep80219_pci __initd
 
 static int __init ep80219_pci_init(void)
 {
+   if (iop3xx_get_init_atu() == IOP3XX_INIT_ATU_ENABLE)
+   ep80219_pci.nr_controllers = 1;
 #if 0
if (machine_is_ep80219())
pci_common_init(&ep80219_pci);
diff --git a/arch/arm/mach-iop32x/iq31244.c b/arch/arm/mach-iop32x/iq31244.c
index 967a696..25d5d62 100644
--- a/arch/arm/mach-iop32x/iq31244.c
+++ b/arch/arm/mach-iop32x/iq31244.c
@@ -97,7 +97,7 @@ iq31244_pci_map_irq(struct pci_dev *dev,
 
 static struct hw_pci iq31244_pci __initdata = {
.swizzle= pci_std_swizzle,
-   .nr_controllers = 1,
+   .nr_controllers = 0,
.setup  = iop3xx_pci_setup,
.preinit= iop3xx_pci_preinit,
.scan   = iop3xx_pci_scan_bus,
@@ -106,6 +106,9 @@ static struct hw_pci iq31244_pci __initd
 
 static int __init iq31244_pci_init(void)
 {
+   if (iop3xx_get_init_atu() == IOP3XX_INIT_ATU_ENABLE)
+   iq31244_pci.nr_controllers = 1;
+
if (machine_is_iq31244())
pci_common_init(&iq31244_pci);
 
diff --git a/arch/arm/mach-iop32x/iq80321.c b/arch/arm/mach-iop32x/iq80321.c
index ef4388c..cdd2265 100644
--- a/arch/arm/mach-iop32x/iq80321.c
+++ b/arch/arm/mach-iop32x/iq80321.c
@@ -97,7 +97,7 @@ iq80321_pci_map_irq(struct pci_dev *dev,
 
 static struct hw_pci iq80321_pci __initdata = {
.swizzle= pci_std_swizzle,
-   .nr_controllers = 1,
+   .nr_controllers = 0,
.setup  = iop3xx_pci_setup,
.preinit= iop3xx_pci_preinit,
.scan   = iop3xx_pci_scan_bus,
@@ -106,6 +106,9 @@ static struct hw_pci iq80321_pci __initd
 
 static int __init iq80321_pci_init(void)
 {
+   if (iop3xx_get_init_atu() == IOP3XX_INIT_ATU_ENABLE)
+   iq80321_pci.nr_controllers = 1;
+
if (machine_is_iq80321())
pci_common_init(&iq80321_pci);
 
diff --git a/arch/arm/mach-iop33x/Kconfig b/arch/arm/mach-iop33x/Kconfig
index 9aa016b..45598e0 100644
--- a/arch/arm/mach-iop33x/Kconfig
+++ b/arch/arm/mach-iop33x/Kconfig
@@ -16,6 +16,14 @@ config MACH_IQ80332
  Say Y here if you want to run your kernel on the Intel IQ80332
  evaluation kit for the IOP332 chipset.
 
+config IOP3XX_ATU
+   bool "Enabl

[PATCH 08/19] dmaengine: enable multiple clients and operations

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Enable the dmaengine interface to allow multiple clients to share a
channel, and enable clients to request channels based on an operations
capability mask.  This prepares the interface for use with the RAID5 client
and the future RAID6 client.

Multi-client support is achieved by modifying channels to maintain a list
of peer clients.

Multi-operation support is achieved by modifying clients to maintain lists
of channel references.  Channel references in a given request list satisfy
a client specified capability mask.
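The resulting bookkeeping, reconstructed from the hunks below for
orientation only (field placement is approximate):

	/*
	 *   dma_chan ---- chan->peers ----> list of dma_client_chan_peer
	 *                                   (one entry per sharing client)
	 *
	 *   dma_client -- req[0..DMA_MAX_CHAN_TYPE_REQ-1], where each
	 *                 dma_req carries a capability mask plus a list of
	 *                 dma_chan_client_ref entries satisfying that mask
	 *
	 * A client that needs both copy and xor therefore holds two request
	 * lists, and one channel may appear on the lists of several clients.
	 */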

Changelog:
* make the dmaengine api EXPORT_SYMBOL_GPL
* zero sum support should be standalone, not integrated into xor

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |  357 -
 drivers/dma/ioatdma.c |   12 +-
 include/linux/dmaengine.h |  164 ++---
 net/core/dev.c|   21 +--
 net/ipv4/tcp.c|4 -
 5 files changed, 443 insertions(+), 115 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 1527804..e10f19d 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -37,8 +37,13 @@
  * Each device has a channels list, which runs unlocked but is never modified
  * once the device is registered, it's just setup by the driver.
  *
- * Each client has a channels list, it's only modified under the client->lock
- * and in an RCU callback, so it's safe to read under rcu_read_lock().
+ * Each client has 'n' lists of channel references where
+ * n == DMA_MAX_CHAN_TYPE_REQ.  These lists are only modified under the
+ * client->lock and in an RCU callback, so they are safe to read under
+ * rcu_read_lock().
+ *
+ * Each channel has a list of peer clients, it's only modified under the
+ * chan->lock.  This allows a channel to be shared amongst several clients
  *
  * Each device has a kref, which is initialized to 1 when the device is
  * registered. A kref_put is done for each class_device registered.  When the
@@ -85,6 +90,18 @@ static ssize_t show_memcpy_count(struct 
return sprintf(buf, "%lu\n", count);
 }
 
+static ssize_t show_xor_count(struct class_device *cd, char *buf)
+{
+   struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+   unsigned long count = 0;
+   int i;
+
+   for_each_possible_cpu(i)
+   count += per_cpu_ptr(chan->local, i)->xor_count;
+
+   return sprintf(buf, "%lu\n", count);
+}
+
 static ssize_t show_bytes_transferred(struct class_device *cd, char *buf)
 {
struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
@@ -97,16 +114,37 @@ static ssize_t show_bytes_transferred(st
return sprintf(buf, "%lu\n", count);
 }
 
+static ssize_t show_bytes_xor(struct class_device *cd, char *buf)
+{
+   struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
+   unsigned long count = 0;
+   int i;
+
+   for_each_possible_cpu(i)
+   count += per_cpu_ptr(chan->local, i)->bytes_xor;
+
+   return sprintf(buf, "%lu\n", count);
+}
+
 static ssize_t show_in_use(struct class_device *cd, char *buf)
 {
+   unsigned int clients = 0;
+   struct list_head *peer;
struct dma_chan *chan = container_of(cd, struct dma_chan, class_dev);
 
-   return sprintf(buf, "%d\n", (chan->client ? 1 : 0));
+   rcu_read_lock();
+   list_for_each_rcu(peer, &chan->peers)
+   clients++;
+   rcu_read_unlock();
+
+   return sprintf(buf, "%d\n", clients);
 }
 
 static struct class_device_attribute dma_class_attrs[] = {
__ATTR(memcpy_count, S_IRUGO, show_memcpy_count, NULL),
+   __ATTR(xor_count, S_IRUGO, show_xor_count, NULL),
__ATTR(bytes_transferred, S_IRUGO, show_bytes_transferred, NULL),
+   __ATTR(bytes_xor, S_IRUGO, show_bytes_xor, NULL),
__ATTR(in_use, S_IRUGO, show_in_use, NULL),
__ATTR_NULL
 };
@@ -130,34 +168,79 @@ static struct class dma_devclass = {
 /**
  * dma_client_chan_alloc - try to allocate a channel to a client
  * @client: &dma_client
+ * @req: request descriptor
  *
  * Called with dma_list_mutex held.
  */
-static struct dma_chan *dma_client_chan_alloc(struct dma_client *client)
+static struct dma_chan *dma_client_chan_alloc(struct dma_client *client,
+   struct dma_req *req)
 {
struct dma_device *device;
struct dma_chan *chan;
+   struct dma_client_chan_peer *peer;
+   struct dma_chan_client_ref *chan_ref;
unsigned long flags;
int desc;   /* allocated descriptor count */
+   int allocated;  /* flag re-allocations */
 
-   /* Find a channel, any DMA engine will do */
+   /* Find a channel */
list_for_each_entry(device, &dma_device_list, global_node) {
+   if ((req->cap_m

[PATCH 03/19] raid5: move check parity operations to a workqueue

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Enable handle_stripe5 to pass off check parity operations to
raid5_do_soft_block_ops formerly handled by compute_parity5.
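For orientation, the work queue half of the check operation looks roughly
like the sketch below (the real body is in raid5_do_soft_block_ops, which is
not quoted in this message; the bit names come from the hunk that follows):

	if (test_and_clear_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state)) {
		/* xor the data blocks into the parity buffer (or hand the
		 * same request to a dma engine) and test the result */
		if (page_is_zero(sh->dev[sh->pd_idx].page))
			set_bit(STRIPE_OP_CHECK_IsZero, &sh->ops.state);
		set_bit(STRIPE_OP_CHECK_Done, &sh->ops.state);
		set_bit(STRIPE_HANDLE, &sh->state);
		release_stripe(sh);
	}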

Changelog:
* removed handle_check_operations5.  All logic moved into handle_stripe5 so
that we do not need to go through the initiation logic to end the
operation.
* clear the uptodate bit on the parity block
* hold off check operations if a parity dependent operation is in flight
like a write

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   60 
 1 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e39d248..24ed4d8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2121,35 +2121,59 @@ #endif
locked += handle_write_operations5(sh, rcw);
}
 
-   /* maybe we need to check and possibly fix the parity for this stripe
-* Any reads will already have been scheduled, so we just see if enough data
-* is available
+   /* 1/ Maybe we need to check and possibly fix the parity for this stripe.
+*Any reads will already have been scheduled, so we just see if enough data
+*is available.
+* 2/ Hold off parity checks while parity dependent operations are in flight
+*(RCW and RMW are protected by 'locked')
 */
-   if (syncing && locked == 0 &&
-   !test_bit(STRIPE_INSYNC, &sh->state)) {
+   if ((syncing && locked == 0 &&
+   !test_bit(STRIPE_INSYNC, &sh->state)) ||
+   test_bit(STRIPE_OP_CHECK, &sh->state)) {
+
set_bit(STRIPE_HANDLE, &sh->state);
+   /* Take one of the following actions:
+* 1/ start a check parity operation if (uptodate == disks)
+* 2/ finish a check parity operation and act on the result
+*/
if (failed == 0) {
-   BUG_ON(uptodate != disks);
-   compute_parity5(sh, CHECK_PARITY);
-   uptodate--;
-   if (page_is_zero(sh->dev[sh->pd_idx].page)) {
-   /* parity is correct (on disc, not in buffer any more) */
-   set_bit(STRIPE_INSYNC, &sh->state);
-   } else {
-   conf->mddev->resync_mismatches += STRIPE_SECTORS;
-   if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
-   /* don't try to repair!! */
+   if (!test_bit(STRIPE_OP_CHECK, &sh->state)) {
+   BUG_ON(uptodate != disks);
+   set_bit(STRIPE_OP_CHECK, &sh->state);
+   set_bit(STRIPE_OP_CHECK_Gen, &sh->ops.state);
+   clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
+   sh->ops.pending++;
+   uptodate--;
+   } else if (test_and_clear_bit(STRIPE_OP_CHECK_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_CHECK, &sh->state);
+
+   if (test_and_clear_bit(STRIPE_OP_CHECK_IsZero,
+   &sh->ops.state))
+   /* parity is correct (on disc, not in 
buffer any more) */
set_bit(STRIPE_INSYNC, &sh->state);
else {
-   compute_block(sh, sh->pd_idx);
-   uptodate++;
+   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
+   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
+   /* don't try to repair!! */
+   set_bit(STRIPE_INSYNC, 
&sh->state);
+   else {
+   compute_block(sh, sh->pd_idx);
+   uptodate++;
+   }
}
}
}
-   if (!test_bit(STRIPE_INSYNC, &sh->state)) {
+
+   /* Wait for check parity operations to complete
+* before write-back
+*/
+   if (!test_bit(STRIPE_INSYNC, &sh->state) &&
+   !test_bit(STRIPE_OP_CHECK, &sh->state)) {
+
/* either

[PATCH 05/19] raid5: move read completion copies to a workqueue

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Enable handle_stripe5 to hand off the memory copy operations that satisfy
read requests to raid5_do_soft_block_ops; formerly this was handled in
line within handle_stripe5.

It adds a 'read' (past tense) pointer to the r5dev structure
to track reads that have been offloaded to the workqueue.  When the copy
operation is complete the 'read' pointer is reused as the return_bi for the
bi_end_io() call.
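Sketched, the r5dev additions look like this (surrounding members trimmed;
the struct itself lives in include/linux/raid/raid5.h and is not part of
this diff):

	struct r5dev {
		struct bio	*toread;	/* reads still to be serviced */
		struct bio	*read;		/* reads completed by the work queue,
						 * waiting for bi_end_io (reused as
						 * the return_bi list) */
		struct bio	*towrite;
		struct bio	*written;
		struct page	*page;
		unsigned long	flags;
	};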

Changelog:
* dev->read only holds reads that have been satisfied, previously it
doubled as a request queue to the operations routine
* added R5_ReadReq to mark the blocks that belong to a given bio fill
operation
* requested reads no longer count towards the 'to_read' count, 'to_fill'
tracks the number of requested reads

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   67 +---
 1 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0c39203..1a8dfd2 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -240,11 +240,11 @@ static void init_stripe(struct stripe_he
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
 
-   if (dev->toread || dev->towrite || dev->written ||
+   if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
-   printk("sector=%llx i=%d %p %p %p %d\n",
+   printk("sector=%llx i=%d %p %p %p %p %d\n",
   (unsigned long long)sh->sector, i, dev->toread,
-  dev->towrite, dev->written,
+  dev->read, dev->towrite, dev->written,
   test_bit(R5_LOCKED, &dev->flags));
BUG();
}
@@ -1749,7 +1749,7 @@ static void handle_stripe5(struct stripe
struct bio *bi;
int i;
int syncing, expanding, expanded;
-   int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
+   int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0, to_fill=0;
int compute=0, non_overwrite=0, write_complete=0;
int failed_num=0;
struct r5dev *dev;
@@ -1765,44 +1765,47 @@ static void handle_stripe5(struct stripe
syncing = test_bit(STRIPE_SYNCING, &sh->state);
expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-   /* Now to look around and see what can be done */
 
+   if (test_bit(STRIPE_OP_BIOFILL, &sh->state) &&
+   test_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state)) {
+   clear_bit(STRIPE_OP_BIOFILL, &sh->state);
+   clear_bit(STRIPE_OP_BIOFILL_Done, &sh->ops.state);
+   }
+
+   /* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
 
-   PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
-   i, dev->flags, dev->toread, dev->towrite, dev->written);
+   PRINTK("check %d: state 0x%lx toread %p read %p write %p 
written %p\n",
+   i, dev->flags, dev->toread, dev->read, dev->towrite, 
dev->written);
+
+   /* maybe we can acknowledge completion of a biofill operation */
+   if (test_bit(R5_ReadReq, &dev->flags) && !dev->toread)
+   clear_bit(R5_ReadReq, &dev->flags);
+
/* maybe we can reply to a read */
+   if (dev->read && !test_bit(R5_ReadReq, &dev->flags) &&
+   !test_bit(STRIPE_OP_BIOFILL, &sh->state)) {
+   return_bi = dev->read;
+   dev->read = NULL;
+   }
+
+   /* maybe we can start a biofill operation */
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-   struct bio *rbi, *rbi2;
-   PRINTK("Return read for disc %d\n", i);
-   spin_lock_irq(&conf->device_lock);
-   rbi = dev->toread;
-   dev->toread = NULL;
-   if (test_and_clear_bit(R5_Overlap, &dev->flags))
-   wake_up(&conf->wait_for_overlap);
-   spin_unlock_irq(&conf->device_lock);
-   while (rbi && r

[PATCH 02/19] raid5: move write operations to a workqueue

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Enable handle_stripe5 to pass off write operations to
raid5_do_soft_block_ops (which can be run as a workqueue).  The operations
moved are reconstruct-writes and read-modify-writes formerly handled by
compute_parity5.
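The hand-off itself stays small; roughly (conditions simplified, state
names as introduced by this patch):

	/* handle_stripe5() decides rcw vs rmw as before, then queues the
	 * work instead of calling compute_parity5() in line */
	locked += handle_write_operations5(sh, rcw);

	/* the work queue walks STRIPE_OP_RCW_Drain -> STRIPE_OP_RCW_Parity
	 * (the rmw path starts at STRIPE_OP_RMW_ParityPre) and finally sets
	 * the corresponding *_Done bit for a later handle_stripe5 pass */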

Changelog:
* moved raid5_do_soft_block_ops changes into a separate patch
* changed handle_write_operations5 to only initiate write operations, which
prevents new writes from being requested while the current one is in flight
* all blocks undergoing a write are now marked locked and !uptodate at the
beginning of the write operation
* blocks undergoing a read-modify-write need a request flag to distinguish
them from blocks that are locked for reading. Reconstruct-writes still use
the R5_LOCKED bit to select blocks for the operation
* integrated the work queue Kconfig option

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/Kconfig |   21 +
 drivers/md/raid5.c |  192 ++--
 include/linux/raid/raid5.h |3 +
 3 files changed, 190 insertions(+), 26 deletions(-)

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index bf869ed..2a16b3b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -162,6 +162,27 @@ config MD_RAID5_RESHAPE
  There should be enough spares already present to make the new
  array workable.
 
+config MD_RAID456_WORKQUEUE
+   depends on MD_RAID456
+   bool "Offload raid work to a workqueue from raid5d"
+   ---help---
+ This option enables raid work (block copy and xor operations)
+ to run in a workqueue.  If your platform has a high context
+ switch penalty say N.  If you are using hardware offload or
+ are running on an SMP platform say Y.
+
+ If unsure, say Y.
+
+config MD_RAID456_WORKQUEUE_MULTITHREAD
+   depends on MD_RAID456_WORKQUEUE && SMP
+   bool "Enable multi-threaded raid processing"
+   default y
+   ---help---
+ This option controls whether the raid workqueue will be multi-
+ threaded or single threaded.
+
+ If unsure, say Y.
+
 config MD_MULTIPATH
tristate "Multipath I/O support"
depends on BLK_DEV_MD
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8fde62b..e39d248 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -222,6 +222,8 @@ static void init_stripe(struct stripe_he
 
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
+   BUG_ON(sh->ops.state);
+   BUG_ON(sh->ops.pending);

CHECK_DEVLOCK();
PRINTK("init_stripe called, stripe %llu\n", 
@@ -331,6 +333,9 @@ static int grow_one_stripe(raid5_conf_t 
memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
spin_lock_init(&sh->lock);
+   #ifdef CONFIG_MD_RAID456_WORKQUEUE
+   INIT_WORK(&sh->ops.work, conf->do_block_ops, sh);
+   #endif
 
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
@@ -1266,7 +1271,72 @@ static void compute_block_2(struct strip
}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw)
+{
+   int i, pd_idx = sh->pd_idx, disks = sh->disks;
+   int locked=0;
+
+   if (rcw == 0) {
+   /* skip the drain operation on an expand */
+   if (test_bit(STRIPE_OP_RCW_Expand, &sh->ops.state)) {
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Parity, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   set_bit(R5_LOCKED, &sh->dev[i].flags);
+   locked++;
+   }
+   } else { /* enter stage 1 of reconstruct write operation */
+   set_bit(STRIPE_OP_RCW, &sh->state);
+   set_bit(STRIPE_OP_RCW_Drain, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   }
+   } else {
+   /* enter stage 1 of read modify write operation */
+   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
+
+   set_bit(STRIPE_OP_RMW, &sh->state);
+   set_bit(STRIPE_OP_RMW_ParityPre, &sh->ops.state);
+   for (i=disks ; i-- ;) {
+

[PATCH 07/19] raid5: remove compute_block and compute_parity5

2006-09-11 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

replaced by the workqueue implementation

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  123 
 1 files changed, 0 insertions(+), 123 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a07b52b..ad6883b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -964,129 +964,6 @@ #define check_xor()   do {
\
} while(0)
 
 
-static void compute_block(struct stripe_head *sh, int dd_idx)
-{
-   int i, count, disks = sh->disks;
-   void *ptr[MAX_XOR_BLOCKS], *p;
-
-   PRINTK("compute_block, stripe %llu, idx %d\n", 
-   (unsigned long long)sh->sector, dd_idx);
-
-   ptr[0] = page_address(sh->dev[dd_idx].page);
-   memset(ptr[0], 0, STRIPE_SIZE);
-   count = 1;
-   for (i = disks ; i--; ) {
-   if (i == dd_idx)
-   continue;
-   p = page_address(sh->dev[i].page);
-   if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-   ptr[count++] = p;
-   else
-   printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
-   " not present\n", dd_idx,
-   (unsigned long long)sh->sector, i);
-
-   check_xor();
-   }
-   if (count != 1)
-   xor_block(count, STRIPE_SIZE, ptr);
-   set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-}
-
-static void compute_parity5(struct stripe_head *sh, int method)
-{
-   raid5_conf_t *conf = sh->raid_conf;
-   int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-   void *ptr[MAX_XOR_BLOCKS];
-   struct bio *chosen;
-
-   PRINTK("compute_parity5, stripe %llu, method %d\n",
-   (unsigned long long)sh->sector, method);
-
-   count = 1;
-   ptr[0] = page_address(sh->dev[pd_idx].page);
-   switch(method) {
-   case READ_MODIFY_WRITE:
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
-   for (i=disks ; i-- ;) {
-   if (i==pd_idx)
-   continue;
-   if (sh->dev[i].towrite &&
-   test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   check_xor();
-   }
-   }
-   break;
-   case RECONSTRUCT_WRITE:
-   memset(ptr[0], 0, STRIPE_SIZE);
-   for (i= disks; i-- ;)
-   if (i!=pd_idx && sh->dev[i].towrite) {
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   }
-   break;
-   case CHECK_PARITY:
-   break;
-   }
-   if (count>1) {
-   xor_block(count, STRIPE_SIZE, ptr);
-   count = 1;
-   }
-   
-   for (i = disks; i--;)
-   if (sh->dev[i].written) {
-   sector_t sector = sh->dev[i].sector;
-   struct bio *wbi = sh->dev[i].written;
-   while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) 
{
-   copy_data(1, wbi, sh->dev[i].page, sector);
-   wbi = r5_next_bio(wbi, sector);
-   }
-
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   set_bit(R5_UPTODATE, &sh->dev[i].flags);
-   }
-
-   switch(method) {
-   case RECONSTRUCT_WRITE:
-   case CHECK_PARITY:
-   for (i=disks; i--;)
-   if (i != pd_idx) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   check_xor();
-   }
-   break;
- 

Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction

2006-09-11 Thread Dan Williams

On 9/11/06, Jeff Garzik <[EMAIL PROTECTED]> wrote:

Dan Williams wrote:
> Neil,
>
> The following patches implement hardware accelerated raid5 for the Intel
> Xscale(r) series of I/O Processors.  The MD changes allow stripe
> operations to run outside the spin lock in a work queue.  Hardware
> acceleration is achieved by using a dma-engine-aware work queue routine
> instead of the default software only routine.
>
> Since the last release of the raid5 changes many bug fixes and other
> improvements have been made as a result of stress testing.  See the per
> patch change logs for more information about what was fixed.  This
> release is the first release of the full dma implementation.
>
> The patches touch 3 areas, the md-raid5 driver, the generic dmaengine
> interface, and a platform device driver for IOPs.  The raid5 changes
> follow your comments concerning making the acceleration implementation
> similar to how the stripe cache handles I/O requests.  The dmaengine
> changes are the second release of this code.  They expand the interface
> to handle more than memcpy operations, and add a generic raid5-dma
> client.  The iop-adma driver supports dma memcpy, xor, xor zero sum, and
> memset across all IOP architectures (32x, 33x, and 13xx).
>
> Concerning the context switching performance concerns raised at the
> previous release, I have observed the following.  For the hardware
> accelerated case it appears that performance is always better with the
> work queue than without since it allows multiple stripes to be operated
> on simultaneously.  I expect the same for an SMP platform, but so far my
> testing has been limited to IOPs.  For a single-processor
> non-accelerated configuration I have not observed performance
> degradation with work queue support enabled, but in the Kconfig option
> help text I recommend disabling it (CONFIG_MD_RAID456_WORKQUEUE).
>
> Please consider the patches for -mm.
>
> -Dan
>
> [PATCH 01/19] raid5: raid5_do_soft_block_ops
> [PATCH 02/19] raid5: move write operations to a workqueue
> [PATCH 03/19] raid5: move check parity operations to a workqueue
> [PATCH 04/19] raid5: move compute block operations to a workqueue
> [PATCH 05/19] raid5: move read completion copies to a workqueue
> [PATCH 06/19] raid5: move the reconstruct write expansion operation to a 
workqueue
> [PATCH 07/19] raid5: remove compute_block and compute_parity5
> [PATCH 08/19] dmaengine: enable multiple clients and operations
> [PATCH 09/19] dmaengine: reduce backend address permutations
> [PATCH 10/19] dmaengine: expose per channel dma mapping characteristics to 
clients
> [PATCH 11/19] dmaengine: add memset as an asynchronous dma operation
> [PATCH 12/19] dmaengine: dma_async_memcpy_err for DMA engines that do not 
support memcpy
> [PATCH 13/19] dmaengine: add support for dma xor zero sum operations
> [PATCH 14/19] dmaengine: add dma_sync_wait
> [PATCH 15/19] dmaengine: raid5 dma client
> [PATCH 16/19] dmaengine: Driver for the Intel IOP 32x, 33x, and 13xx RAID 
engines
> [PATCH 17/19] iop3xx: define IOP3XX_REG_ADDR[32|16|8] and clean up DMA/AAU 
defs
> [PATCH 18/19] iop3xx: Give Linux control over PCI (ATU) initialization
> [PATCH 19/19] iop3xx: IOP 32x and 33x support for the iop-adma driver

Can devices like drivers/scsi/sata_sx4.c or drivers/scsi/sata_promise.c
take advantage of this?  Promise silicon supports RAID5 XOR offload.

If so, how?  If not, why not?  :)

This is a frequently asked question; Alan Cox had the same one at OLS.
The answer is "probably."  The only complication I currently see is
where/how the stripe cache is maintained.  With the IOPs it's easy
because the DMA engines operate directly on kernel memory.  With the
Promise card I believe they have memory on the card and it's not clear
to me if the XOR engines on the card can deal with host memory.  Also,
MD would need to be modified to handle a stripe cache located on a
device, or somehow synchronize its local cache with the card in a manner
that is still able to beat software-only MD.


Jeff


Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/19] dmaengine: enable multiple clients and operations

2006-09-11 Thread Dan Williams

On 9/11/06, Jeff Garzik <[EMAIL PROTECTED]> wrote:

Dan Williams wrote:
> @@ -759,8 +755,10 @@ #endif
>   device->common.device_memcpy_buf_to_buf = ioat_dma_memcpy_buf_to_buf;
>   device->common.device_memcpy_buf_to_pg = ioat_dma_memcpy_buf_to_pg;
>   device->common.device_memcpy_pg_to_pg = ioat_dma_memcpy_pg_to_pg;
> - device->common.device_memcpy_complete = ioat_dma_is_complete;
> - device->common.device_memcpy_issue_pending = 
ioat_dma_memcpy_issue_pending;
> + device->common.device_operation_complete = ioat_dma_is_complete;
> + device->common.device_xor_pgs_to_pg = dma_async_xor_pgs_to_pg_err;
> + device->common.device_issue_pending = ioat_dma_memcpy_issue_pending;
> + device->common.capabilities = DMA_MEMCPY;


Are we really going to add a set of hooks for each DMA engine whizbang
feature?


What's the alternative?  But also see patch 9 "dmaengine: reduce
backend address permutations"; it relieves some of this pain.



That will get ugly when DMA engines support memcpy, xor, crc32, sha1,
aes, and a dozen other transforms.


> diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
> index c94d8f1..3599472 100644
> --- a/include/linux/dmaengine.h
> +++ b/include/linux/dmaengine.h
> @@ -20,7 +20,7 @@
>   */
>  #ifndef DMAENGINE_H
>  #define DMAENGINE_H
> -
> +#include 
>  #ifdef CONFIG_DMA_ENGINE
>
>  #include 
> @@ -65,6 +65,27 @@ enum dma_status {
>  };
>
>  /**
> + * enum dma_capabilities - DMA operational capabilities
> + * @DMA_MEMCPY: src to dest copy
> + * @DMA_XOR: src*n to dest xor
> + * @DMA_DUAL_XOR: src*n to dest_diag and dest_horiz xor
> + * @DMA_PQ_XOR: src*n to dest_q and dest_p gf/xor
> + * @DMA_MEMCPY_CRC32C: src to dest copy and crc-32c sum
> + * @DMA_SHARE: multiple clients can use this channel
> + */
> +enum dma_capabilities {
> + DMA_MEMCPY  = 0x1,
> + DMA_XOR = 0x2,
> + DMA_PQ_XOR  = 0x4,
> + DMA_DUAL_XOR= 0x8,
> + DMA_PQ_UPDATE   = 0x10,
> + DMA_ZERO_SUM= 0x20,
> + DMA_PQ_ZERO_SUM = 0x40,
> + DMA_MEMSET  = 0x80,
> + DMA_MEMCPY_CRC32C   = 0x100,

Please use the more readable style that explicitly lists bits:

DMA_MEMCPY  = (1 << 0),
DMA_XOR = (1 << 1),
...

I prefer this as well, although at one point I was told (not by you)
that the absolute number was preferred when I was making changes to
drivers/scsi/sata_vsc.c.  In any event, I'll change it...



> +/**
>   * struct dma_chan_percpu - the per-CPU part of struct dma_chan
>   * @refcount: local_t used for open-coded "bigref" counting
>   * @memcpy_count: transaction counter
> @@ -75,27 +96,32 @@ struct dma_chan_percpu {
>   local_t refcount;
>   /* stats */
>   unsigned long memcpy_count;
> + unsigned long xor_count;
>   unsigned long bytes_transferred;
> + unsigned long bytes_xor;

Clearly, each operation needs to be more compartmentalized.

This just isn't scalable, when you consider all the possible transforms.

Ok, one set of counters per op is probably overkill; what about lumping
operations into groups and just tracking at the group level?  i.e. (a
rough struct sketch follows these groupings):

memcpy, memset -> string_count, string_bytes_transferred
crc, sha1, aes -> hash_count, hash_transferred
xor, pq_xor -> sum_count, sum_transferred
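
A minimal sketch of what that grouping could look like in the per-channel
statistics (field names are illustrative and only refcount matches the
existing struct dma_chan_percpu quoted above):

        struct dma_chan_percpu {
                local_t refcount;
                /* string ops: memcpy, memset */
                unsigned long string_count;
                unsigned long string_bytes_transferred;
                /* hash ops: crc32c, sha1, aes */
                unsigned long hash_count;
                unsigned long hash_transferred;
                /* sum ops: xor, pq_xor */
                unsigned long sum_count;
                unsigned long sum_transferred;
        };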



Jeff


Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction

2006-09-11 Thread Dan Williams

On 9/11/06, Jeff Garzik <[EMAIL PROTECTED]> wrote:

Dan Williams wrote:
> This is a frequently asked question; Alan Cox had the same one at OLS.
> The answer is "probably."  The only complication I currently see is
> where/how the stripe cache is maintained.  With the IOPs it's easy
> because the DMA engines operate directly on kernel memory.  With the
> Promise card I believe they have memory on the card and it's not clear
> to me if the XOR engines on the card can deal with host memory.  Also,
> MD would need to be modified to handle a stripe cache located on a
> device, or somehow synchronize its local cache with the card in a manner
> that is still able to beat software-only MD.

sata_sx4 operates through [standard PC] memory on the card, and you use
a DMA engine to copy memory to/from the card.

[select chipsets supported by] sata_promise operates directly on host
memory.

So, while sata_sx4 is farther away from your direct-host-memory model,
it also has much more potential for RAID acceleration:  ideally, RAID1
just copies data to the card once, then copies the data to multiple
drives from there.  Similarly with RAID5, you can eliminate copies and
offload XOR, presuming the drives are all connected to the same card.

In the sata_promise case it's straightforward: all that is needed is
dmaengine drivers for the xor and memcpy engines.  This would be
similar to the current I/OAT model where dma resources are provided by
a PCI function.  The sata_sx4 case would need a different flavor of
the dma_do_raid5_block_ops routine, one that understands where the
cache is located.  MD would also need the capability to bypass the
block layer since the data will have already been transferred to the
card by a stripe cache operation.

The RAID1 case gives me pause because it seems any work along these
lines requires that the implementation work for both MD and DM, which
then eventually leads to being tasked with merging the two.


Jeff


Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 08/19] dmaengine: enable multiple clients and operations

2006-09-11 Thread Dan Williams

On 9/11/06, Roland Dreier <[EMAIL PROTECTED]> wrote:

Jeff> Are we really going to add a set of hooks for each DMA
Jeff> engine whizbang feature?

...ok, but at some level we are going to need a file that has:
EXPORT_SYMBOL_GPL(dma_whizbang_op1)
. . .
EXPORT_SYMBOL_GPL(dma_whizbang_opX)
correct?



Dan> What's the alternative?  But, also see patch 9 "dmaengine:
Dan> reduce backend address permutations" it relieves some of this
Dan> pain.

I guess you can pass an opcode into a common "start operation" function.

But then we still have the problem of being able to request a memory
copy operation of a channel that only understands xor, a la Jeff's
comment to patch 12:

"Further illustration of how this API growth is going wrong.  You should
create an API such that it is impossible for an XOR transform to ever
call non-XOR-transform hooks."


With all the memcpy / xor / crypto / etc. hardware out there already,
we definitely have to get this interface right.

 - R.


I understand what you are saying, Jeff: the implementation can be made
better, but something I think is valuable is the ability to write
clients once, like NET_DMA and RAID5_DMA, and have them run without
modification on any platform that can provide the engine interface,
rather than needing a client per architecture
(IOP_RAID5_DMA...FOO_X_RAID5_DMA).

Or is this an example of where "Do What You Must, And No More"
comes in, i.e. don't worry about making a generic RAID5_DMA while
there is only one implementation in existence?

I also want to pose the question of whether the dmaengine interface
should handle cryptographic transforms.  We already have Acrypto:
http://tservice.net.ru/~s0mbre/blog/devel/acrypto/index.html.  At the
same time, since IOPs can do Galois Field multiplication and XOR, it
would be nice to take advantage of that for crypto acceleration, but
this does not fit the model of a device that Acrypto supports.

Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction

2006-09-13 Thread Dan Williams

On 9/13/06, Jakob Oestergaard <[EMAIL PROTECTED]> wrote:

On Mon, Sep 11, 2006 at 04:00:32PM -0700, Dan Williams wrote:
> Neil,
>
...
>
> Concerning the context switching performance concerns raised at the
> previous release, I have observed the following.  For the hardware
> accelerated case it appears that performance is always better with the
> work queue than without since it allows multiple stripes to be operated
> on simultaneously.  I expect the same for an SMP platform, but so far my
> testing has been limited to IOPs.  For a single-processor
> non-accelerated configuration I have not observed performance
> degradation with work queue support enabled, but in the Kconfig option
> help text I recommend disabling it (CONFIG_MD_RAID456_WORKQUEUE).

Out of curiosity; how does accelerated compare to non-accelerated?


One quick example:
4-disk SATA array rebuild on iop321 without acceleration - 'top'
reports md0_resync and md0_raid5 dueling for the CPU each at ~50%
utilization.

With acceleration - 'top' reports md0_resync cpu utilization at ~90%
with the rest split between md0_raid5 and md0_raid5_ops.

The sync speed reported by /proc/mdstat is ~40% higher in the accelerated case.

That being said, array resync is a special case, so your mileage may
vary with other applications.

I will put together some data from bonnie++, iozone, maybe contest,
and post it on SourceForge.


 / jakob


Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] dmaengine: clean up and abstract function types (was Re: [PATCH 08/19] dmaengine: enable multiple clients and operations)

2006-09-18 Thread Dan Williams

On 9/15/06, Olof Johansson <[EMAIL PROTECTED]> wrote:

On Fri, 15 Sep 2006 11:38:17 -0500 Olof Johansson <[EMAIL PROTECTED]> wrote:

> On Mon, 11 Sep 2006 19:44:16 -0400 Jeff Garzik <[EMAIL PROTECTED]> wrote:

> > Are we really going to add a set of hooks for each DMA engine whizbang
> > feature?
> >
> > That will get ugly when DMA engines support memcpy, xor, crc32, sha1,
> > aes, and a dozen other transforms.
>
>
> Yes, it will be unmaintainable. We need some sort of multiplexing with
> per-function registrations.
>
> Here's a first cut at it, just very quick. It could be improved further
> but it shows that we could exorcise most of the hardcoded things pretty
> easily.

Ok, that was obviously a naive and not so nice first attempt, but I
figured it was worth it to show how it can be done.

This is a little more proper: specify at client registration time which
function the client will use, and make the channel use it.  This
way most of the error checking per call can be removed too.

Chris/Dan: Please consider picking this up as a base for the added
functionality and cleanups.


Thanks for this, Olof; it has sparked some ideas about how to redo
support for multiple operations.






Clean up dmaengine a bit. Make the client registration specify which
channel functions ("type") the client will use. Also, make devices
register which functions they will provide.

Also exorcise most of the memcpy-specific references from the generic
dma engine code. There's still some left in the iov stuff.

I think we should keep the operation type in the function name but
drop all the [buf|pg|dma]_to_[buf|pg|dma] permutations.  The buffer
type can be handled generically across all operation types.  Something
like the following for a pg_to_buf memcpy.

struct dma_async_op_memcpy *op;
struct page *pg;
void *buf;
size_t len;

dma_async_op_init_src_pg(op, pg);
dma_async_op_init_dest_buf(op, buf);
dma_async_memcpy(chan, op, len);
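
The init helpers implied above might be declared along these lines (purely
illustrative; only the _src_pg and _dest_buf variants appear in the snippet,
the others are extrapolated):

        /* hypothetical: record the source/destination of an operation
         * descriptor without encoding the buffer type in the operation
         * name itself */
        void dma_async_op_init_src_pg(struct dma_async_op_memcpy *op,
                                      struct page *pg);
        void dma_async_op_init_src_buf(struct dma_async_op_memcpy *op,
                                       void *buf);
        void dma_async_op_init_dest_pg(struct dma_async_op_memcpy *op,
                                       struct page *pg);
        void dma_async_op_init_dest_buf(struct dma_async_op_memcpy *op,
                                        void *buf);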

-Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction

2006-10-10 Thread Dan Williams

On 10/8/06, Neil Brown <[EMAIL PROTECTED]> wrote:



On Monday September 11, [EMAIL PROTECTED] wrote:
> Neil,
>
> The following patches implement hardware accelerated raid5 for the Intel
> Xscale(r) series of I/O Processors.  The MD changes allow stripe
> operations to run outside the spin lock in a work queue.  Hardware
> acceleration is achieved by using a dma-engine-aware work queue routine
> instead of the default software only routine.

Hi Dan,
 Sorry for the delay in replying.
 I've looked through these patches at last (mostly the raid-specific
 bits) and while there is clearly a lot of good stuff here, it doesn't
 'feel' right - it just seems too complex.

 The particular issues that stand out to me are:
   - 33 new STRIPE_OP_* flags.  I'm sure there doesn't need to be that
  many new flags.
   - the "raid5 dma client" patch moves far too much internal
 knowledge about raid5 into drivers/dma.

 Clearly there are some complex issues being dealt with and some
 complexity is to be expected, but I feel there must be room for some
 serious simplification.

A valid criticism.  There was definitely a push to just get it
functional, so I can now see how the complexity crept into the
implementation.  The primary cause was the choice to explicitly handle
channel switching in raid5-dma.  However, relieving "client" code from
this responsibility is something I am taking care of in the async api
changes.



 Let me try to describe how I envisage it might work.

 As you know, the theory-of-operation of handle_stripe is that it
 assesses the state of a stripe deciding what actions to perform and
 then performs them.  Synchronous actions (e.g. current parity calcs)
 are performed 'in-line'.  Async actions (reads, writes) and actions
 that cannot be performed under a spinlock (->b_end_io) are recorded
 as being needed and then are initiated at the end of handle_stripe
 outside of the sh->lock.

 The proposal is to bring the parity and other bulk-memory operations
 out of the spinlock and make them optionally asynchronous.

 The set of tasks that might be needed to be performed on a stripe
 are:
Clear a target cache block
pre-xor various cache blocks into a target
copy data out of bios into cache blocks. (drain)
post-xor various cache blocks into a target
copy data into bios out of cache blocks (fill)
test if a cache block is all zeros
start a read on a cache block
start a write on a cache block

 (There is also a memcpy when expanding raid5.  I think I would try to
  simply avoid that copy and move pointers around instead).
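
 For illustration, avoiding that copy could amount to swapping the cache
 pages between the two stripes rather than memcpy()ing STRIPE_SIZE bytes
 (a rough sketch only, borrowing the sh/sh2 naming used by the expand-copy
 code elsewhere in this thread; locking and the remaining flag updates are
 omitted):

        struct page *tmp = sh2->dev[dd_idx].page;

        /* hand the source stripe's up-to-date page to the destination
         * stripe and take its page in exchange, instead of copying */
        sh2->dev[dd_idx].page = sh->dev[i].page;
        sh->dev[i].page = tmp;
        set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);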

 Some of these steps require sequencing. e.g.
   clear, pre-xor, copy, post-xor, write
 for a rmw cycle.
 We could require handle_stripe to be called again for each step.
 i.e. first call just clears the target and flags it as clear.  Next
 call initiates the pre-xor and flags that as done.  Etc.  However I
 think that would make the non-offloaded case too slow, or at least
 too clumsy.

 So instead we set flags to say what needs to be done and have a
 workqueue system that does it.

 (so far this is all quite similar to what you have done.)

 So handle_stripe would set various flag and other things (like
 identify which block was the 'target' block) and run the following
 in a workqueue:

raid5_do_stuff(struct stripe_head *sh)
{
        raid5_conf_t *conf = sh->raid_conf;

if (test_bit(CLEAR_TARGET, &sh->ops.pending)) {
                struct page *p = sh->dev[sh->ops.target].page;
rv = async_memset(p, 0, 0, PAGE_SIZE, ops_done, sh);
if (rv != BUSY)
clear_bit(CLEAR_TARGET, &sh->ops.pending);
if (rv != COMPLETE)
goto out;
}

while (test_bit(PRE_XOR, &sh->ops.pending)) {
struct page *plist[XOR_MAX];
int offset[XOR_MAX];
int pos = 0;
int d;

for (d = sh->ops.nextdev;
 d < conf->raid_disks && pos < XOR_MAX ;
 d++) {
                        if (d == sh->ops.target)
continue;
if (!test_bit(R5_WantPreXor, &sh->dev[d].flags))
continue;
plist[pos] = sh->dev[d].page;
offset[pos++] = 0;
}
if (pos) {
struct page *p = sh->dev[sh->ops.target].page;
rv = async_xor(p, 0, plist, offset, pos, PAGE_SIZE,
   ops_done, sh);
if (rv != BUSY)
sh->ops.nextdev = d;
if (rv != COMPLETE)
goto out;
} else {
clear_bit(PRE_XOR, &sh->ops.pending);
sh->ops.nextdev = 0;
}
  

Re: [PATCH 00/19] Hardware Accelerated MD RAID5: Introduction

2006-10-10 Thread Dan Williams

On 9/14/06, Jakob Oestergaard <[EMAIL PROTECTED]> wrote:

On Wed, Sep 13, 2006 at 12:17:55PM -0700, Dan Williams wrote:
...
> >Out of curiosity; how does accelerated compare to non-accelerated?
>
> One quick example:
> 4-disk SATA array rebuild on iop321 without acceleration - 'top'
> reports md0_resync and md0_raid5 dueling for the CPU each at ~50%
> utilization.
>
> With acceleration - 'top' reports md0_resync cpu utilization at ~90%
> with the rest split between md0_raid5 and md0_raid5_ops.
>
> The sync speed reported by /proc/mdstat is ~40% higher in the accelerated
> case.

Ok, nice :)

>
> That being said, array resync is a special case, so your mileage may
> vary with other applications.

Every-day usage I/O performance data would be nice indeed :)

> I will put together some data from bonnie++, iozone, maybe contest,
> and post it on SourceForge.

Great!


I have posted some Iozone data and graphs showing the performance
impact of the patches across the three iop processors iop321, iop331,
and iop341.  The general takeaway from the data is that using dma
engines extends the region that Iozone calls the "buffer cache
effect".  Write performance benefited the most, as expected, but read
performance showed some modest gains as well.  There are some regions
(smaller file sizes and record lengths) that show a performance
disadvantage, but it is typically less than 5%.

The graphs map the relative performance multiplier that the raid
patches generate ('2.6.18-rc6 performance' x 'performance multiplier'
= '2.6.18-rc6-raid performance').  A value of '1' designates equal
performance.  The large cliff that drops to zero is a "not measured"
region, i.e. the record length is larger than the file size.  Iozone
outputs to Excel, but I have also made pdf's of the graphs available.
Note: Openoffice-calc can view the data but it does not support the 3D
surface graphs that Iozone uses.

Excel:
http://prdownloads.sourceforge.net/xscaleiop/iozone_raid_accel.xls?download

PDF Graphs:
http://prdownloads.sourceforge.net/xscaleiop/iop-iozone-graphs-20061010.tar.bz2?download

Regards,
Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 00/12] md raid acceleration and the async_tx api

2006-11-30 Thread Dan Williams

Here is the latest version of the raid acceleration patch set.  Since
the last release I have created the async_tx api to address the
concerns raised by Neil and Jeff.  With this api in place the raid5
asynchronous and synchronous paths are no longer separated, i.e. there
are no hardware specific concerns in the raid code.

The async_tx api is proposed as a special dmaengine management client
that allows offload engines to be used for bulk memory
transfers/transforms, with fallback to synchronous routines when an
engine is not present.
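
As a rough usage sketch (the async_memcpy argument order mirrors the call
used in the expansion patch later in this set; the async_xor form and the
page/callback variables here are placeholders rather than exact
prototypes):

        struct dma_async_tx_descriptor *tx;

        /* copy fresh data into the stripe cache */
        tx = async_memcpy(cache_page, bio_page, 0, 0, STRIPE_SIZE,
                          0, NULL, NULL, NULL);

        /* xor it into the parity block; this operation depends on the
         * copy and only starts once it completes.  With no dma engine
         * present both calls fall back to the synchronous routines. */
        tx = async_xor(parity_page, src_pages, 0, src_cnt, STRIPE_SIZE,
                       ASYNC_TX_DEP_ACK, tx, done_fn, done_arg);

        async_tx_ack(tx);
        dma_wait_for_async_tx(tx);      /* or rely on done_fn */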

This implementation has been tested on iop13xx and iop33x platforms in
both the synchronous case and the asynchronous case with the iop-adma
driver.  The changes to the ioatdma driver have only been compile
tested, and testing NET_DMA with iop-adma is pending.

Please consider for -mm.  These patches are against 2.6.19.

Dan Williams:
 dmaengine: add base support for the async_tx api
 dmaengine: add the async_tx api
 dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines
 md: add raid5_run_ops and support routines
 md: workqueue for raid5 operations
 md: move write operations to raid5_run_ops
 md: move raid5 compute block operations to raid5_run_ops
 md: move raid5 parity checks to raid5_run_ops
 md: satisfy raid5 read requests via raid5_run_ops
 md: use async_tx and raid5_run_ops for raid5 expansion operations
 md: raid5 io requests to raid5_run_ops
 md: remove raid5 compute_block and compute_parity5

Regards,
Dan
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/12] md: satisfy raid5 read requests via raid5_run_ops

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Use raid5_run_ops to carry out the memory copies for a raid5 read request.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   57 +---
 1 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1764fbb..3c793dc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2020,7 +2020,7 @@ static void handle_stripe5(struct stripe
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int compute=0, req_compute=0, non_overwrite=0;
+   int to_fill=0, compute=0, req_compute=0, non_overwrite=0;
int failed_num=0;
struct r5dev *dev;
 
@@ -2035,42 +2035,45 @@ static void handle_stripe5(struct stripe
syncing = test_bit(STRIPE_SYNCING, &sh->state);
expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
-   /* Now to look around and see what can be done */
 
+   /* clear completed biofills */
+   if (test_and_clear_bit(STRIPE_OP_BIOFILL, &sh->ops.complete)) {
+   clear_bit(STRIPE_OP_BIOFILL, &sh->ops.ack);
+   clear_bit(STRIPE_OP_BIOFILL, &sh->ops.pending);
+   }
+
+   /* Now to look around and see what can be done */
rcu_read_lock();
for (i=disks; i--; ) {
mdk_rdev_t *rdev;
dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
 
-   PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
-   i, dev->flags, dev->toread, dev->towrite, dev->written);
+   PRINTK("check %d: state 0x%lx toread %p read %p write %p 
written %p\n",
+   i, dev->flags, dev->toread, dev->read, dev->towrite, 
dev->written);
+
+   /* maybe we can acknowledge completion of a biofill operation */
+   if (test_bit(R5_Wantfill, &dev->flags) && !dev->toread)
+   clear_bit(R5_Wantfill, &dev->flags);
+
/* maybe we can reply to a read */
+   if (dev->read && !test_bit(R5_Wantfill, &dev->flags) &&
+   !test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending)) {
+   return_bi = dev->read;
+   dev->read = NULL;
+   }
+
+   /* maybe we can start a biofill operation */
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-   struct bio *rbi, *rbi2;
-   PRINTK("Return read for disc %d\n", i);
-   spin_lock_irq(&conf->device_lock);
-   rbi = dev->toread;
-   dev->toread = NULL;
-   if (test_and_clear_bit(R5_Overlap, &dev->flags))
-   wake_up(&conf->wait_for_overlap);
-   spin_unlock_irq(&conf->device_lock);
-   while (rbi && rbi->bi_sector < dev->sector + 
STRIPE_SECTORS) {
-   copy_data(0, rbi, dev->page, dev->sector);
-   rbi2 = r5_next_bio(rbi, dev->sector);
-   spin_lock_irq(&conf->device_lock);
-   if (--rbi->bi_phys_segments == 0) {
-   rbi->bi_next = return_bi;
-   return_bi = rbi;
-   }
-   spin_unlock_irq(&conf->device_lock);
-   rbi = rbi2;
-   }
+   to_read--;
+   if (!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+   set_bit(R5_Wantfill, &dev->flags);
}
 
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+   if (test_bit(R5_Wantfill, &dev->flags)) to_fill++;
if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
if (dev->toread) to_read++;
@@ -2094,9 +2097,13 @@ static void handle_stripe5(struct stripe
set_bit(R5_Insync, &dev->flags);
}
rcu_read_unlock();
+
+   if (to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+   sh->ops.count++;
+
PRINTK("locked=%d uptodate=%d to_read=%d"

[PATCH 10/12] md: use async_tx and raid5_run_ops for raid5 expansion operations

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_POSTXOR without setting STRIPE_OP_BIODRAIN to
carry out the postxor operation required by the expansion process.  This
distinction is needed since all blocks will need to be written back to disk
even though none of the blocks will have their 'written' pointer set.

The bulk copy operation to the new stripe is handled by async_tx.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   48 
 1 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 3c793dc..8b36611 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2582,18 +2582,32 @@ #endif
}
}
 
-   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
-   /* Need to write out all blocks after computing parity */
-   sh->disks = conf->raid_disks;
-   sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks);
-   compute_parity5(sh, RECONSTRUCT_WRITE);
+   /* Finish postxor operations initiated by the expansion
+* process
+*/
+   if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
+   !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
+
+   clear_bit(STRIPE_EXPANDING, &sh->state);
+
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
for (i= conf->raid_disks; i--;) {
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   locked++;
set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
}
-   clear_bit(STRIPE_EXPANDING, &sh->state);
-   } else if (expanded) {
+   }
+
+   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+   /* Need to write out all blocks after computing parity */
+   sh->disks = conf->raid_disks;
+   sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks);
+   locked += handle_write_operations5(sh, 0, 1);
+   } else if (expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
@@ -2604,6 +2618,7 @@ #endif
/* We have read all the blocks in this stripe and now we need to
 * copy some of them into a target stripe for expand.
 */
+   struct dma_async_tx_descriptor *tx = NULL;
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i=0; i< sh->disks; i++)
if (i != sh->pd_idx) {
@@ -2627,9 +2642,12 @@ #endif
release_stripe(sh2);
continue;
}
-   memcpy(page_address(sh2->dev[dd_idx].page),
-  page_address(sh->dev[i].page),
-  STRIPE_SIZE);
+
+   /* place all the copies on one channel */
+   tx = async_memcpy(sh2->dev[dd_idx].page,
+   sh->dev[i].page, 0, 0, STRIPE_SIZE,
+   ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
                                for (j=0; j<conf->raid_disks; j++)
@@ -2641,6 +2659,12 @@ #endif
set_bit(STRIPE_HANDLE, &sh2->state);
}
release_stripe(sh2);
+
+   /* done submitting copies, wait for them to 
complete */
+   if (i + 1 >= sh->disks) {
+   async_tx_ack(tx);
+   dma_wait_for_async_tx(tx);
+   }
}
}
 
-
To unsubscribe from this list: send the line "unsubscribe linux-raid" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/12] dmaengine: add the async_tx api

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

async_tx is an api to describe a series of bulk memory
transfers/transforms.  When possible these transactions are carried out by
asynchronous dma engines.  The api handles inter-transaction dependencies
and hides dma channel management from the client.  When a dma engine is not
present the transaction is carried out via synchronous software routines.

Xor operations are handled by async_tx; to this end xor.c is moved into
drivers/dma and is changed to take an explicit destination address and
a series of sources to match the hardware engine implementation.
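
To illustrate that change (the reworked routine's name and prototype are
approximations): the old xor_block() treated ptr[0] as an implicit
destination, while the new form takes the destination separately from the
source list, matching how the dma engines are programmed:

        /* old style, as in compute_parity5(): ptr[0] is both an input
         * and the destination */
        void *ptr[MAX_XOR_BLOCKS];
        ptr[0] = page_address(sh->dev[pd_idx].page);
        ptr[1] = page_address(sh->dev[0].page);
        ptr[2] = page_address(sh->dev[1].page);
        xor_block(3, STRIPE_SIZE, ptr);

        /* reworked style (approximate): explicit destination plus an
         * array of sources */
        void *dest = page_address(sh->dev[pd_idx].page);
        void *srcs[] = { page_address(sh->dev[0].page),
                         page_address(sh->dev[1].page) };
        xor_blocks(2, STRIPE_SIZE, dest, srcs);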

When CONFIG_DMA_ENGINE is not set the asynchronous path is compiled away.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/Makefile |3 
 drivers/dma/Kconfig  |   16 +
 drivers/dma/Makefile |1 
 drivers/dma/async_tx.c   |  921 ++
 drivers/dma/xor.c|  153 
 drivers/md/Kconfig   |2 
 drivers/md/Makefile  |6 
 drivers/md/raid5.c   |   18 -
 drivers/md/xor.c |  154 
 include/linux/async_tx.h |  181 +
 include/linux/raid/xor.h |5 
 11 files changed, 1287 insertions(+), 173 deletions(-)

diff --git a/drivers/Makefile b/drivers/Makefile
index 4ac14da..8b2460d 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -60,7 +60,6 @@ obj-$(CONFIG_I2C) += i2c/
 obj-$(CONFIG_W1)   += w1/
 obj-$(CONFIG_HWMON)+= hwmon/
 obj-$(CONFIG_PHONE)+= telephony/
-obj-$(CONFIG_MD)   += md/
 obj-$(CONFIG_BT)   += bluetooth/
 obj-$(CONFIG_ISDN) += isdn/
 obj-$(CONFIG_EDAC) += edac/
@@ -77,3 +76,5 @@ obj-$(CONFIG_CRYPTO)  += crypto/
 obj-$(CONFIG_SUPERH)   += sh/
 obj-$(CONFIG_GENERIC_TIME) += clocksource/
 obj-$(CONFIG_DMA_ENGINE)   += dma/
+obj-$(CONFIG_ASYNC_TX_DMA) += dma/
+obj-$(CONFIG_MD)+= md/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 30d021d..c82ed5f 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -7,8 +7,8 @@ menu "DMA Engine support"
 config DMA_ENGINE
bool "Support for DMA engines"
---help---
- DMA engines offload copy operations from the CPU to dedicated
- hardware, allowing the copies to happen asynchronously.
+  DMA engines offload bulk memory operations from the CPU to dedicated
+  hardware, allowing the operations to happen asynchronously.
 
 comment "DMA Clients"
 
@@ -22,6 +22,17 @@ config NET_DMA
  Since this is the main user of the DMA engine, it should be enabled;
  say Y here.
 
+config ASYNC_TX_DMA
+   tristate "Asynchronous Bulk Memory Transfers/Transforms API"
+   default y
+   ---help---
+ This enables the async_tx management layer for dma engines.
+ Subsystems coded to this API will use offload engines for bulk
+ memory operations where present.  Software implementations are
+ called when a dma engine is not present or fails to allocate
+ memory to carry out the transaction.
+ Current subsystems ported to async_tx: MD_RAID4,5
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
@@ -30,5 +41,4 @@ config INTEL_IOATDMA
default m
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
-
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index bdcfdbd..6a99341 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o
diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c
new file mode 100644
index 000..00f72c0
--- /dev/null
+++ b/drivers/dma/async_tx.c
@@ -0,0 +1,921 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define ASYNC_TX_DEBUG 0
+#define PRINTK(x...) ((void)(ASYNC_TX_D

[PATCH 01/12] dmaengine: add base support for the async_tx api

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

* introduce struct dma_async_tx_descriptor as a common field for all dmaengine
software descriptors
* convert the device_memcpy_* methods into separate prep, set src/dest, and
submit stages (a usage sketch of the split follows this list)
* support capabilities beyond memcpy (xor, memset, xor zero sum, completion
interrupts)
* convert ioatdma to the new semantics
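
As a usage sketch of the split (the method names match the validation
checks added in the patch below; the argument lists are abbreviated and
the dev/chan/address variables are placeholders):

        struct dma_async_tx_descriptor *tx;
        dma_cookie_t cookie;

        /* 1/ allocate and prepare a descriptor for a 'len' byte copy */
        tx = dev->device_prep_dma_memcpy(chan, len, 0);

        /* 2/ program the source and destination bus addresses */
        dev->device_set_src(src_dma, tx, 0);
        dev->device_set_dest(dest_dma, tx, 0);

        /* 3/ queue the descriptor, then kick the engine */
        cookie = dev->device_tx_submit(tx);
        dev->device_issue_pending(chan);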

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |   44 ++--
 drivers/dma/ioatdma.c |  256 ++--
 drivers/dma/ioatdma.h |8 +
 include/linux/dmaengine.h |  263 ++---
 4 files changed, 394 insertions(+), 177 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 1527804..8d203ad 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -210,7 +210,8 @@ static void dma_chans_rebalance(void)
mutex_lock(&dma_list_mutex);
 
list_for_each_entry(client, &dma_client_list, global_node) {
-   while (client->chans_desired > client->chan_count) {
+   while (client->chans_desired < 0 ||
+   client->chans_desired > client->chan_count) {
chan = dma_client_chan_alloc(client);
if (!chan)
break;
@@ -219,7 +220,8 @@ static void dma_chans_rebalance(void)
   chan,
   DMA_RESOURCE_ADDED);
}
-   while (client->chans_desired < client->chan_count) {
+   while (client->chans_desired >= 0 &&
+   client->chans_desired < client->chan_count) {
spin_lock_irqsave(&client->lock, flags);
chan = list_entry(client->channels.next,
  struct dma_chan,
@@ -294,12 +296,12 @@ void dma_async_client_unregister(struct 
  * @number: count of DMA channels requested
  *
  * Clients call dma_async_client_chan_request() to specify how many
- * DMA channels they need, 0 to free all currently allocated.
+ * DMA channels they need, 0 to free all currently allocated. A request
+ * < 0 indicates the client wants to handle all engines in the system.
  * The resulting allocations/frees are indicated to the client via the
  * event callback.
  */
-void dma_async_client_chan_request(struct dma_client *client,
-   unsigned int number)
+void dma_async_client_chan_request(struct dma_client *client, int number)
 {
client->chans_desired = number;
dma_chans_rebalance();
@@ -318,6 +320,31 @@ int dma_async_device_register(struct dma
if (!device)
return -ENODEV;
 
+   /* validate device routines */
+   BUG_ON(test_bit(DMA_MEMCPY, &device->capabilities) &&
+   !device->device_prep_dma_memcpy);
+   BUG_ON(test_bit(DMA_XOR, &device->capabilities) &&
+   !device->device_prep_dma_xor);
+   BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) &&
+   !device->device_prep_dma_zero_sum);
+   BUG_ON(test_bit(DMA_MEMSET, &device->capabilities) &&
+   !device->device_prep_dma_memset);
+   BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) &&
+   !device->device_prep_dma_interrupt);
+
+   BUG_ON(!device->device_alloc_chan_resources);
+   BUG_ON(!device->device_free_chan_resources);
+   BUG_ON(!device->device_tx_submit);
+   BUG_ON(!device->device_set_dest);
+   BUG_ON(!device->device_set_src);
+   BUG_ON(!device->device_dependency_added);
+   BUG_ON(!device->device_is_tx_complete);
+   BUG_ON(!device->map_page);
+   BUG_ON(!device->map_single);
+   BUG_ON(!device->unmap_page);
+   BUG_ON(!device->unmap_single);
+   BUG_ON(!device->device_issue_pending);
+
init_completion(&device->done);
kref_init(&device->refcount);
device->dev_id = id++;
@@ -402,11 +429,8 @@ subsys_initcall(dma_bus_init);
 EXPORT_SYMBOL(dma_async_client_register);
 EXPORT_SYMBOL(dma_async_client_unregister);
 EXPORT_SYMBOL(dma_async_client_chan_request);
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
-EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
-EXPORT_SYMBOL(dma_async_memcpy_complete);
-EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_is_tx_complete);
+EXPORT_SYMBOL(dma_async_issue_pending);
 EXPORT_SYMBOL(dma_async_device_register);
 EXPORT_SYMBOL(dma_async_device_unregister);
 EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 0358419..ff7377d 100644
--- a/drivers/dma/ioat

[PATCH 04/12] md: add raid5_run_ops and support routines

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Prepare the raid5 implementation to use async_tx and a workqueue for
running stripe operations (a rough dispatch sketch follows the list):
* biofill (copy data into request buffers to satisfy a read request)
* compute block (generate a missing block in the cache from the other
blocks)
* prexor (subtract existing data as part of the read-modify-write process)
* biodrain (copy data out of request buffers to satisfy a write request)
* postxor (recalculate parity for new data that has entered the cache)
* check (verify that the parity is correct)
* io (submit i/o to the member disks)
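
A skeletal view of how raid5_run_ops might walk these requests
(illustrative only: apart from ops_run_io and the flags named in this
series, the helper and flag names are placeholders, and the real routine
chains async_tx descriptors and does completion accounting):

        static void raid5_run_ops(void *stripe_head_ref)
        {
                struct stripe_head *sh = stripe_head_ref;
                unsigned long pending = sh->ops.pending;
                struct dma_async_tx_descriptor *tx = NULL;

                if (test_bit(STRIPE_OP_BIOFILL, &pending))
                        tx = ops_run_biofill(sh);      /* cache -> bios */
                if (test_bit(STRIPE_OP_COMPUTE_BLK, &pending))
                        tx = ops_run_compute(sh);      /* rebuild a block */
                if (test_bit(STRIPE_OP_PREXOR, &pending))
                        tx = ops_run_prexor(sh, tx);   /* subtract old data */
                if (test_bit(STRIPE_OP_BIODRAIN, &pending))
                        tx = ops_run_biodrain(sh, tx); /* bios -> cache */
                if (test_bit(STRIPE_OP_POSTXOR, &pending))
                        tx = ops_run_postxor(sh, tx);  /* recompute parity */
                if (test_bit(STRIPE_OP_CHECK, &pending))
                        ops_run_check(sh);             /* verify parity */
                if (test_bit(STRIPE_OP_IO, &pending))
                        ops_run_io(sh);                /* submit reads/writes */
        }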

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  560 
 include/linux/raid/raid5.h |   67 +
 2 files changed, 619 insertions(+), 8 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 0c8ada5..232f525 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@ #include 
 #include "raid6.h"
 
 #include 
+#include 
 
 /*
  * Stripe cache
@@ -222,7 +223,8 @@ static void init_stripe(struct stripe_he
 
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
-   
+   BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
+
CHECK_DEVLOCK();
PRINTK("init_stripe called, stripe %llu\n", 
(unsigned long long)sh->sector);
@@ -238,11 +240,11 @@ static void init_stripe(struct stripe_he
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
 
-   if (dev->toread || dev->towrite || dev->written ||
+   if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
-   printk("sector=%llx i=%d %p %p %p %d\n",
+   printk("sector=%llx i=%d %p %p %p %p %d\n",
   (unsigned long long)sh->sector, i, dev->toread,
-  dev->towrite, dev->written,
+  dev->read, dev->towrite, dev->written,
   test_bit(R5_LOCKED, &dev->flags));
BUG();
}
@@ -322,6 +324,556 @@ static struct stripe_head *get_active_st
return sh;
 }
 
+static int
+raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error);
+static int
+raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+
+static void ops_run_io(struct stripe_head *sh)
+{
+   raid5_conf_t *conf = sh->raid_conf;
+   int i;
+
+   might_sleep();
+
+   for (i = sh->disks; i-- ;) {
+   int rw;
+   struct bio *bi;
+   mdk_rdev_t *rdev;
+   if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
+   rw = 1;
+   else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+   rw = 0;
+   else
+   continue;
+
+   bi = &sh->dev[i].req;
+
+   bi->bi_rw = rw;
+   if (rw)
+   bi->bi_end_io = raid5_end_write_request;
+   else
+   bi->bi_end_io = raid5_end_read_request;
+
+   rcu_read_lock();
+   rdev = rcu_dereference(conf->disks[i].rdev);
+   if (rdev && test_bit(Faulty, &rdev->flags))
+   rdev = NULL;
+   if (rdev)
+   atomic_inc(&rdev->nr_pending);
+   rcu_read_unlock();
+
+   if (rdev) {
+   if (test_bit(STRIPE_SYNCING, &sh->state) ||
+   test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
+   test_bit(STRIPE_EXPAND_READY, &sh->state))
+   md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+   bi->bi_bdev = rdev->bdev;
+   PRINTK("%s: stripe %llu schedule op %ld on disc %d\n",
+   __FUNCTION__,
+   (unsigned long long)sh->sector, bi->bi_rw, i);
+   atomic_inc(&sh->count);
+   bi->bi_sector = sh->sector + rdev->data_offset;
+   bi->bi_flags = 1 << BIO_UPTODATE;
+   bi->bi_vcnt = 1;
+   bi->bi_max_vecs = 1;
+   bi->bi_idx = 0;
+   bi->bi_io_vec = &sh->dev[i].vec;
+   bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+   bi->bi_io_vec[0].bv_offset = 0;
+   bi->bi_size 

[PATCH 05/12] md: workqueue for raid5 operations

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Each raid5 device gets its own queue, and each stripe has its own
work_struct.  The goal is to have a free-running raid5d thread, i.e. reduce
the time the stripe lock is held by removing bulk memory operations, and
removing the sleeping path in generic_make_request.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   37 +
 include/linux/raid/raid5.h |6 ++
 2 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 232f525..c2312d1 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -126,6 +126,7 @@ static void __release_stripe(raid5_conf_
}
md_wakeup_thread(conf->mddev->thread);
} else {
+   BUG_ON(sh->ops.pending);
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state)) {
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) 
< IO_THRESHOLD)
@@ -324,6 +325,15 @@ static struct stripe_head *get_active_st
return sh;
 }
 
+static inline void issue_raid_ops(struct stripe_head *sh)
+{
+   raid5_conf_t *conf = sh->raid_conf;
+
+   atomic_inc(&sh->count);
+   conf->workqueue_stripes++;
+   queue_work(sh->raid_conf->workqueue, &sh->ops.work);
+}
+
 static int
 raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error);
 static int
@@ -868,6 +878,10 @@ static void raid5_run_ops(void *stripe_h
} else if (sh->ops.count < 0)
BUG();
 
+   /* we kick off work to the engines in batches */
+   if (--(conf->workqueue_stripes) == 0)
+   async_tx_issue_pending_all();
+
spin_unlock(&sh->lock);
 
set_bit(STRIPE_HANDLE, &sh->state);
@@ -883,6 +897,7 @@ static int grow_one_stripe(raid5_conf_t 
memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev));
sh->raid_conf = conf;
spin_lock_init(&sh->lock);
+   INIT_WORK(&sh->ops.work, raid5_run_ops, sh);
 
if (grow_buffers(sh, conf->raid_disks)) {
shrink_buffers(sh, conf->raid_disks);
@@ -1923,7 +1938,6 @@ static int stripe_to_pdidx(sector_t stri
  *schedule a write of some buffers
  *return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
  * get BH_Lock set before the stripe lock is released.
  *
@@ -1942,9 +1956,9 @@ static void handle_stripe5(struct stripe
int failed_num=0;
struct r5dev *dev;
 
-   PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
-   (unsigned long long)sh->sector, atomic_read(&sh->count),
-   sh->pd_idx);
+   PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d 
ops=%lx:%lx:%lx\n",
+  (unsigned long long)sh->sector, sh->state, 
atomic_read(&sh->count),
+  sh->pd_idx, sh->ops.pending, sh->ops.ack, sh->ops.complete);
 
spin_lock(&sh->lock);
clear_bit(STRIPE_HANDLE, &sh->state);
@@ -2409,6 +2423,10 @@ #endif
}
}
 
+   if (sh->ops.count && !test_and_set_bit(STRIPE_OPSQUEUE_ACTIVE, 
&sh->state)) {
+   issue_raid_ops(sh);
+   }
+
spin_unlock(&sh->lock);
 
while ((bi=return_bi)) {
@@ -3717,6 +3735,13 @@ static int run(mddev_t *mddev)
if (!conf->spare_page)
goto abort;
}
+
+   sprintf(conf->workqueue_name, "%s_raid5_ops",
+   mddev->gendisk->disk_name);
+
+   if ((conf->workqueue = create_workqueue(conf->workqueue_name)) == NULL)
+   goto abort;
+
spin_lock_init(&conf->device_lock);
init_waitqueue_head(&conf->wait_for_stripe);
init_waitqueue_head(&conf->wait_for_overlap);
@@ -3726,6 +3751,7 @@ static int run(mddev_t *mddev)
INIT_LIST_HEAD(&conf->inactive_list);
atomic_set(&conf->active_stripes, 0);
atomic_set(&conf->preread_active_stripes, 0);
+   conf->workqueue_stripes = 0;
 
PRINTK("raid5: run(%s) called.\n", mdname(mddev));
 
@@ -3879,6 +3905,8 @@ abort:
safe_put_page(conf->spare_page);
kfree(conf->disks);
kfree(conf->stripe_hashtbl);
+   if (conf->workqueue)
+   destroy_workqueue(conf->workqueue);
kfree(conf);
}
mddev->private = NULL;
@@ -3899,6 +3927,7 @@ static int stop(mdd

[PATCH 08/12] md: move raid5 parity checks to raid5_run_ops

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_CHECK to request a check operation in
raid5_run_ops.  If raid5_run_ops is able to perform the check with a
dma engine the parity will be preserved and not re-read from disk.

Check operations re-use the compute block facility to repair the parity.
However since repairing the parity implies a write-back to disk the
STRIPE_OP_MOD_REPAIR_PD flag is added to distinguish it from other compute
block operations.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   81 
 1 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8510183..1764fbb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2465,32 +2465,75 @@ #endif
locked += handle_write_operations5(sh, rcw, 0);
}
 
-   /* maybe we need to check and possibly fix the parity for this stripe
-* Any reads will already have been scheduled, so we just see if enough 
data
-* is available
+   /* 1/ Maybe we need to check and possibly fix the parity for this 
stripe.
+*Any reads will already have been scheduled, so we just see if 
enough data
+*is available.
+* 2/ Hold off parity checks while parity dependent operations are in 
flight
+*(conflicting writes are protected by the 'locked' variable)
 */
-   if (syncing && locked == 0 &&
-   !test_bit(STRIPE_INSYNC, &sh->state)) {
+   if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, 
&sh->ops.pending) &&
+   !test_bit(STRIPE_INSYNC, &sh->state)) ||
+   test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
+   test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+
set_bit(STRIPE_HANDLE, &sh->state);
-   if (failed == 0) {
-   BUG_ON(uptodate != disks);
-   compute_parity5(sh, CHECK_PARITY);
-   uptodate--;
-   if (page_is_zero(sh->dev[sh->pd_idx].page)) {
-   /* parity is correct (on disc, not in buffer 
any more) */
-   set_bit(STRIPE_INSYNC, &sh->state);
-   } else {
-   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
-   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
-   /* don't try to repair!! */
+   /* Take one of the following actions:
+* 1/ start a check parity operation if (uptodate == disks)
+* 2/ finish a check parity operation and act on the result
+* 3/ skip to the writeback section if we previously
+*initiated a recovery operation
+*/
+   if (failed == 0 && !test_bit(STRIPE_OP_MOD_REPAIR_PD, 
&sh->ops.pending)) {
+   if (!test_and_set_bit(STRIPE_OP_CHECK, 
&sh->ops.pending)) {
+   BUG_ON(uptodate != disks);
+   clear_bit(R5_UPTODATE, 
&sh->dev[sh->pd_idx].flags);
+   sh->ops.count++;
+   uptodate--;
+   } else if (test_and_clear_bit(STRIPE_OP_CHECK, 
&sh->ops.complete)) {
+   clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+   clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+
+   if (sh->ops.zero_sum_result == 0)
+   /* parity is correct (on disc, not in 
buffer any more) */
set_bit(STRIPE_INSYNC, &sh->state);
else {
-   compute_block(sh, sh->pd_idx);
-   uptodate++;
+   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
+   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
+   /* don't try to repair!! */
+   set_bit(STRIPE_INSYNC, 
&sh->state);
+   else {
+   BUG_ON(test_and_set_bit(
+   STRIPE_OP_COMPUTE_BLK,
+   &sh->ops.pending));
+   set_bit(STRIPE_OP_MOD_REPAIR_PD,
+

[PATCH 03/12] dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor,
pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy
operations.

Changelog:
* fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few
slots to be requested eventually leading to data corruption
* enabled the slot allocation routine to attempt to free slots before
returning -ENOMEM
* switched the cleanup routine to solely use the software chain and the
status register to determine if a descriptor is complete.  This is
necessary to support other IOP engines that do not have status writeback
capability
* make the driver iop generic
* modified the allocation routines to understand allocating a group of
slots for a single operation
* added a null xor initialization operation for the xor only channel on
iop3xx
* support xor operations on buffers larger than the hardware maximum
* split the do_* routines into separate prep, src/dest set, submit stages
* added async_tx support (dependent operations initiation at cleanup time)
* simplified group handling
* added interrupt support (callbacks via tasklets)

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/Kconfig |8 
 drivers/dma/Makefile|1 
 drivers/dma/iop-adma.c  | 1522 +++
 include/asm-arm/hardware/iop_adma.h |  116 +++
 4 files changed, 1647 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index c82ed5f..d61e3e5 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -41,4 +41,12 @@ config INTEL_IOATDMA
default m
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
+
+config INTEL_IOP_ADMA
+tristate "Intel IOP ADMA support"
+depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX)
+default m
+---help---
+  Enable support for the Intel(R) IOP Series RAID engines.
+
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6a99341..8ebf10d 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
 obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
new file mode 100644
index 000..18fd7e3
--- /dev/null
+++ b/drivers/dma/iop-adma.c
@@ -0,0 +1,1522 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports the asynchronous DMA copy and RAID engines available
+ * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x)
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common)
+#define to_iop_adma_device(dev) container_of(dev, struct iop_adma_device, 
common)
+#define to_iop_adma_slot(lh) container_of(lh, struct iop_adma_desc_slot, 
slot_node)
+#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, 
async_tx)
+
+#define IOP_ADMA_DEBUG 0
+#define PRINTK(x...) ((void)(IOP_ADMA_DEBUG && printk(x)))
+
+/* software zero sum implementation bits for iop32x */
+#ifdef CONFIG_ARCH_IOP32X
+char iop32x_zero_result_buffer[PAGE_SIZE] __attribute__((aligned(256)));
+u32 *iop32x_zero_sum_output;
+#endif
+
+/**
+ * iop_adma_free_slots - flags descriptor slots for reuse
+ * @slot: Slot to free
+ * Caller must hold &iop_chan->lock while calling this function
+ */
+static inline void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
+{
+   int stride = slot->stride;
+
+   while (stride--) {
+   slot->stride = 0;
+   slot = list_entry(slot->slot_node.next,
+   struct iop_adma_desc_slot,
+   slot_node);
+   }
+}
+
+static inline dma_cookie_t
+iop_adma_run_tx_complete_a

[PATCH 12/12] md: remove raid5 compute_block and compute_parity5

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

replaced by raid5_run_ops

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  124 
 1 files changed, 0 insertions(+), 124 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7d75fbe..478741e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1522,130 +1522,6 @@ #define check_xor() do {
   \
   }   \
} while(0)
 
-
-static void compute_block(struct stripe_head *sh, int dd_idx)
-{
-   int i, count, disks = sh->disks;
-   void *ptr[MAX_XOR_BLOCKS], *p;
-
-   PRINTK("compute_block, stripe %llu, idx %d\n", 
-   (unsigned long long)sh->sector, dd_idx);
-
-   ptr[0] = page_address(sh->dev[dd_idx].page);
-   memset(ptr[0], 0, STRIPE_SIZE);
-   count = 1;
-   for (i = disks ; i--; ) {
-   if (i == dd_idx)
-   continue;
-   p = page_address(sh->dev[i].page);
-   if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-   ptr[count++] = p;
-   else
-   printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
-   " not present\n", dd_idx,
-   (unsigned long long)sh->sector, i);
-
-   check_xor();
-   }
-   if (count != 1)
-   xor_block(count, STRIPE_SIZE, ptr[0], &ptr[1]);
-   set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-}
-
-static void compute_parity5(struct stripe_head *sh, int method)
-{
-   raid5_conf_t *conf = sh->raid_conf;
-   int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-   void *ptr[MAX_XOR_BLOCKS];
-   struct bio *chosen;
-
-   PRINTK("compute_parity5, stripe %llu, method %d\n",
-   (unsigned long long)sh->sector, method);
-
-   count = 1;
-   ptr[0] = page_address(sh->dev[pd_idx].page);
-   switch(method) {
-   case READ_MODIFY_WRITE:
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
-   for (i=disks ; i-- ;) {
-   if (i==pd_idx)
-   continue;
-   if (sh->dev[i].towrite &&
-   test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   check_xor();
-   }
-   }
-   break;
-   case RECONSTRUCT_WRITE:
-   memset(ptr[0], 0, STRIPE_SIZE);
-   for (i= disks; i-- ;)
-   if (i!=pd_idx && sh->dev[i].towrite) {
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   }
-   break;
-   case CHECK_PARITY:
-   break;
-   }
-   if (count>1) {
-   xor_block(count, STRIPE_SIZE, ptr[0], &ptr[1]);
-   count = 1;
-   }
-   
-   for (i = disks; i--;)
-   if (sh->dev[i].written) {
-   sector_t sector = sh->dev[i].sector;
-   struct bio *wbi = sh->dev[i].written;
-   while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) 
{
-   copy_data(1, wbi, sh->dev[i].page, sector);
-   wbi = r5_next_bio(wbi, sector);
-   }
-
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   set_bit(R5_UPTODATE, &sh->dev[i].flags);
-   }
-
-   switch(method) {
-   case RECONSTRUCT_WRITE:
-   case CHECK_PARITY:
-   for (i=disks; i--;)
-   if (i != pd_idx) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   che

[PATCH 07/12] md: move raid5 compute block operations to raid5_run_ops

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_COMPUTE_BLK to request servicing from
raid5_run_ops.  It also sets a flag for the block being computed to let
other parts of handle_stripe submit dependent operations.  raid5_run_ops
guarantees that the compute operation completes before any dependent
operation starts.
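
For context, computing a block is just the RAID5 XOR identity: the missing
member equals the XOR of all the others.  A tiny userspace sketch (sizes and
fill values are arbitrary, and none of this is the kernel implementation):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define NDISKS     4
#define BLOCK_SIZE 16	/* tiny, for illustration */

/* regenerate the block at index 'missing' from all other members */
static void compute_block(uint8_t blocks[NDISKS][BLOCK_SIZE], int missing)
{
	int d, i;

	memset(blocks[missing], 0, BLOCK_SIZE);
	for (d = 0; d < NDISKS; d++) {
		if (d == missing)
			continue;
		for (i = 0; i < BLOCK_SIZE; i++)
			blocks[missing][i] ^= blocks[d][i];
	}
}

int main(void)
{
	uint8_t blocks[NDISKS][BLOCK_SIZE];
	uint8_t saved[BLOCK_SIZE];
	int d;

	for (d = 0; d < NDISKS - 1; d++)
		memset(blocks[d], 0x10 * (d + 1), BLOCK_SIZE);
	compute_block(blocks, NDISKS - 1);	/* generate the parity block */

	memcpy(saved, blocks[1], BLOCK_SIZE);	/* "lose" a data block */
	memset(blocks[1], 0xaa, BLOCK_SIZE);
	compute_block(blocks, 1);		/* recompute it from the rest */

	printf("recovered block matches: %s\n",
	       memcmp(saved, blocks[1], BLOCK_SIZE) == 0 ? "yes" : "no");
	return 0;
}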

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  139 +---
 1 files changed, 100 insertions(+), 39 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 74516ef..8510183 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2020,7 +2020,7 @@ static void handle_stripe5(struct stripe
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int non_overwrite = 0;
+   int compute=0, req_compute=0, non_overwrite=0;
int failed_num=0;
struct r5dev *dev;
 
@@ -2071,8 +2071,8 @@ static void handle_stripe5(struct stripe
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+   if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
-   
if (dev->toread) to_read++;
if (dev->towrite) {
to_write++;
@@ -2227,40 +2227,91 @@ static void handle_stripe5(struct stripe
 * parity, or to satisfy requests
 * or to load a block that is being partially written.
 */
-   if (to_read || non_overwrite || (syncing && (uptodate < disks)) || 
expanding) {
-   for (i=disks; i--;) {
-   dev = &sh->dev[i];
-   if (!test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
-   (dev->toread ||
-(dev->towrite && !test_bit(R5_OVERWRITE, 
&dev->flags)) ||
-syncing ||
-expanding ||
-(failed && (sh->dev[failed_num].toread ||
-(sh->dev[failed_num].towrite && 
!test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags
-   )
-   ) {
-   /* we would like to get this block, possibly
-* by computing it, but we might not be able to
+   if (to_read || non_overwrite || (syncing && (uptodate + compute < 
disks)) || expanding ||
+   test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+
+   /* Clear completed compute operations.  Parity recovery
+* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is 
handled
+* later on in this routine
+*/
+   if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+   !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+   }
+
+   /* look for blocks to read/compute, skip this if a compute
+* is already in flight, or if the stripe contents are in the
+* midst of changing due to a write
+*/
+   if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+   !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+   for (i=disks; i--;) {
+   dev = &sh->dev[i];
+
+   /* don't schedule compute operations or reads on
+* the parity block while a check is in flight
 */
-   if (uptodate == disks-1) {
-   PRINTK("Computing block %d\n", i);
-   compute_block(sh, i);
-   uptodate++;
-   } else if (test_bit(R5_Insync, &dev->flags)) {
-   set_bit(R5_LOCKED, &dev->flags);
-   set_bit(R5_Wantread, &dev->flags);
+   if ((i == sh->pd_idx) && 
test_bit(STRIPE_OP_CHECK, &sh->ops.pending))
+

[PATCH 06/12] md: move write operations to raid5_run_ops

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_PREXOR, STRIPE_OP_BIODRAIN, STRIPE_OP_POSTXOR
to request a write to the stripe cache.  raid5_run_ops is triggered to run
and executes the request outside the stripe lock.
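
The prexor/postxor split follows the usual read-modify-write identity: XOR the
old data out of the existing parity (prexor), then XOR the new data in
(postxor).  A one-byte-per-block sketch of why that matches a full
reconstruct write (purely illustrative):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* three data "blocks" reduced to one byte each, plus their parity */
	uint8_t d0 = 0x11, d1 = 0x22, d2 = 0x33;
	uint8_t parity = d0 ^ d1 ^ d2;
	uint8_t new_d1 = 0x77;

	/* read-modify-write: prexor removes old d1, postxor adds new d1 */
	uint8_t rmw_parity = (parity ^ d1) ^ new_d1;

	/* reconstruct write: recompute parity from all (new) data */
	uint8_t rcw_parity = d0 ^ new_d1 ^ d2;

	printf("rmw parity %#x, rcw parity %#x, equal: %s\n",
	       rmw_parity, rcw_parity, rmw_parity == rcw_parity ? "yes" : "no");
	return 0;
}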

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  152 +---
 1 files changed, 131 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index c2312d1..74516ef 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1830,7 +1830,75 @@ static void compute_block_2(struct strip
}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw, int 
expand)
+{
+   int i, pd_idx = sh->pd_idx, disks = sh->disks;
+   int locked=0;
+
+   if (rcw == 0) {
+   /* skip the drain operation on an expand */
+   if (!expand) {
+   BUG_ON(test_and_set_bit(STRIPE_OP_BIODRAIN,
+   &sh->ops.pending));
+   sh->ops.count++;
+   }
+
+   BUG_ON(test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending));
+   sh->ops.count++;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   if (!expand)
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   } else {
+   BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+   test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+
+   BUG_ON(test_and_set_bit(STRIPE_OP_PREXOR, &sh->ops.pending) ||
+   test_and_set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) 
||
+   test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending));
+
+   sh->ops.count += 3;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (i==pd_idx)
+   continue;
 
+   /* For a read-modify write there may be blocks that are
+* locked for reading while others are ready to be 
written
+* so we distinguish these blocks by the R5_Wantprexor 
bit
+*/
+   if (dev->towrite &&
+   (test_bit(R5_UPTODATE, &dev->flags) ||
+   test_bit(R5_Wantcompute, &dev->flags))) {
+   set_bit(R5_Wantprexor, &dev->flags);
+   set_bit(R5_LOCKED, &dev->flags);
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   }
+
+   /* keep the parity disk locked while asynchronous operations
+* are in flight
+*/
+   set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+   clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+   locked++;
+
+   PRINTK("%s: stripe %llu locked: %d pending: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   locked, sh->ops.pending);
+
+   return locked;
+}
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -2199,8 +2267,67 @@ #endif
set_bit(STRIPE_HANDLE, &sh->state);
}
 
-   /* now to consider writing and what else, if anything should be read */
-   if (to_write) {
+   /* Now we check to see if any write operations have recently
+* completed
+*/
+
+   /* leave prexor set until postxor is done, allows us to distinguish
+* a rmw from a rcw during biodrain
+*/
+   if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
+   test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+
+   for (i=disks; i--;)
+   clear_bit(R5_Wantprexor, &sh->dev[i].flags);
+   }
+
+   /* if only POSTXOR is set then this is an 'expand' postxor */
+   if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
+   test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+
+   clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete);
+   clear_bit(STRIPE_OP_BIODRAIN, &sh->ops.ack);
+  

[PATCH 11/12] md: raid5 io requests to raid5_run_ops

2006-11-30 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

generic_make_request() may sleep, so moving I/O submission to raid5_run_ops
allows raid5d to run freely.  Since raid5_run_ops runs from a workqueue,
other cpus can make forward progress on other stripes.
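
The same idea in miniature, with plain pthreads standing in for the
workqueue; this only models "queue the blocking submission to a worker so the
caller never sleeps", and every name below is invented:

/* build: cc -pthread sketch.c */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int pending;			/* queued "stripes" */

/* worker: performs the potentially sleeping submission */
static void *worker(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&lock);
		while (pending == 0)
			pthread_cond_wait(&cond, &lock);
		pending--;
		pthread_mutex_unlock(&lock);

		usleep(1000);	/* stand-in for a sleeping generic_make_request() */
		printf("worker: submitted one stripe\n");
	}
	return NULL;
}

/* the "raid5d" side: just queues work and keeps going */
static void queue_stripe(void)
{
	pthread_mutex_lock(&lock);
	pending++;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t t;
	int i;

	pthread_create(&t, NULL, worker, NULL);
	for (i = 0; i < 4; i++)
		queue_stripe();		/* never blocks on the slow submission */
	sleep(1);			/* crude: let the worker drain the queue */
	return 0;
}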

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   68 
 1 files changed, 10 insertions(+), 58 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8b36611..7d75fbe 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2431,6 +2431,8 @@ #endif
PRINTK("Read_old block %d for 
r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, 
&dev->flags);
+   if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
locked++;
} else {
set_bit(STRIPE_DELAYED, 
&sh->state);
@@ -2451,6 +2453,8 @@ #endif
PRINTK("Read_old block %d for 
Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, 
&dev->flags);
+   if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
locked++;
} else {
set_bit(STRIPE_DELAYED, 
&sh->state);
@@ -2550,6 +2554,8 @@ #endif
 
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
clear_bit(STRIPE_DEGRADED, &sh->state);
locked++;
set_bit(STRIPE_INSYNC, &sh->state);
@@ -2571,12 +2577,16 @@ #endif
dev = &sh->dev[failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
set_bit(R5_LOCKED, &dev->flags);
locked++;
}
@@ -2682,64 +2692,6 @@ #endif
bi->bi_size = 0;
bi->bi_end_io(bi, bytes, 0);
}
-   for (i=disks; i-- ;) {
-   int rw;
-   struct bio *bi;
-   mdk_rdev_t *rdev;
-   if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-   rw = 1;
-   else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-   rw = 0;
-   else
-   continue;
- 
-   bi = &sh->dev[i].req;
- 
-   bi->bi_rw = rw;
-   if (rw)
-   bi->bi_end_io = raid5_end_write_request;
-   else
-   bi->bi_end_io = raid5_end_read_request;
- 
-   rcu_read_lock();
-   rdev = rcu_dereference(conf->disks[i].rdev);
-   if (rdev && test_bit(Faulty, &rdev->flags))
-   rdev = NULL;
-   if (rdev)
-   atomic_inc(&rdev->nr_pending);
-   rcu_read_unlock();
- 
-   if (rdev) {
-   if (syncing || expanding || expanded)
-   md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-   bi->bi_bdev = rdev->bdev;
-   PRINTK("for %llu schedule op %ld on disc %d\n",
-   (unsigned long long)sh->sector, bi->bi_rw, i);
-   atomic_inc(&sh->count);
-   bi->bi_sector = sh->sector + rdev->d

Re: [PATCH 02/12] dmaengine: add the async_tx api

2006-11-30 Thread Dan Williams

+static inline void
+do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
+   unsigned int src_cnt, size_t len, enum async_tx_flags flags,
+   struct dma_async_tx_descriptor *depend_tx,
+   dma_async_tx_callback callback, void *callback_param)
+{
+   void *_dest;
+   int start_idx, i;
+
+   printk("%s: len: %u\n", __FUNCTION__, len);

Sorry, this should be PRINTK.

Dan


Re: Odd (slow) RAID performance

2006-12-04 Thread Dan Williams

On 12/1/06, Bill Davidsen <[EMAIL PROTECTED]> wrote:

Thank you so much for verifying this. I do keep enough room on my drives
to run tests by creating any kind of whatever I need, but the point is
clear: with N drives striped the transfer rate is N x base rate of one
drive; with RAID-5 it is about the speed of one drive, suggesting that
the md code serializes writes.

If true, BOO, HISS!

Can you explain and educate us, Neal? This looks like terrible performance.


Just curious, what is your stripe_cache_size setting in sysfs?

Neil, please include me in the education if what follows is incorrect:

Read performance in kernels up to and including 2.6.19 is hindered by
needing to go through the stripe cache.  This situation should improve
with the stripe-cache-bypass patches currently in -mm.  As Raz
reported, in some cases the performance increase of this approach is
30%, which is roughly equivalent to the performance difference I see
between a 4-disk raid5 and a 3-disk raid0.

For the write case I can say that MD does not serialize writes, if by
serialize you mean a 1:1 correlation between writes to the parity disk
and writes to a data disk.  To illustrate, I instrumented
MD to count how many times it issued a write to the parity disk and
compared that to how many writes it performed to the member disks for
the workload "dd if=/dev/zero of=/dev/md0 bs=1024k count=100".  I
recorded 8544 parity writes and 25600 member disk writes which is
about 3 member disk writes per parity write, or pretty close to
optimal for a 4-disk array.  So, serialization is not the cause,
performing sub-stripe width writes is not the cause as >98% of the
writes happened without needing to read old data from the disks.
However, I see the same performance on my system, about equal to a
single disk.
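
A quick back-of-the-envelope check of those numbers, assuming 4k stripe pages
and the 4-disk array described above (nothing here is measured, it just
reproduces the expected ratio):

#include <stdio.h>

int main(void)
{
	const long io_bytes   = 100L * 1024 * 1024;	/* dd bs=1024k count=100 */
	const long page       = 4096;			/* assumed STRIPE_SIZE */
	const long data_disks = 3;			/* 4-disk raid5: 3 data + 1 parity */
	long data_pages       = io_bytes / page;	/* 25600 */
	long parity_writes    = data_pages / data_disks;

	printf("data page writes:       %ld\n", data_pages);
	printf("expected parity writes: %ld (observed: 8544)\n", parity_writes);
	return 0;
}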

Here is where I step into supposition territory.  Perhaps the
discrepancy is related to the size of the requests going to the block
layer.  raid5 always makes page sized requests with the expectation
that they will coalesce into larger requests in the block layer.
Maybe we are missing coalescing opportunities in raid5 compared to
what happens in the raid0 case?  Are there any io scheduler knobs to
turn along these lines?

Dan


Re: raid5 software vs hardware: parity calculations?

2007-01-13 Thread Dan Williams

On 1/12/07, James Ralston <[EMAIL PROTECTED]> wrote:

On 2007-01-12 at 09:39-08 dean gaudet <[EMAIL PROTECTED]> wrote:

> On Thu, 11 Jan 2007, James Ralston wrote:
>
> > I'm having a discussion with a coworker concerning the cost of
> > md's raid5 implementation versus hardware raid5 implementations.
> >
> > Specifically, he states:
> >
> > > The performance [of raid5 in hardware] is so much better with
> > > the write-back caching on the card and the offload of the
> > > parity, it seems to me that the minor increase in work of having
> > > to upgrade the firmware if there's a buggy one is a highly
> > > acceptable trade-off to the increased performance.  The md
> > > driver still commits you to longer run queues since IO calls to
> > > disk, parity calculator and the subsequent kflushd operations
> > > are non-interruptible in the CPU.  A RAID card with write-back
> > > cache releases the IO operation virtually instantaneously.
> >
> > It would seem that his comments have merit, as there appears to be
> > work underway to move stripe operations outside of the spinlock:
> >
> > http://lwn.net/Articles/184102/
> >
> > What I'm curious about is this: for real-world situations, how
> > much does this matter?  In other words, how hard do you have to
> > push md raid5 before doing dedicated hardware raid5 becomes a real
> > win?
>
> hardware with battery backed write cache is going to beat the
> software at small write traffic latency essentially all the time but
> it's got nothing to do with the parity computation.

I'm not convinced that's true.

No, it's true.  md implements a write-through cache to ensure that
data reaches the disk.


What my coworker is arguing is that md
raid5 code spinlocks while it is performing this sequence of
operations:

1.  executing the write

not performed under the lock

2.  reading the blocks necessary for recalculating the parity

not performed under the lock

3.  recalculating the parity
4.  updating the parity block

My [admittedly cursory] read of the code, coupled with the link above,
leads me to believe that my coworker is correct, which is why I was
for trolling for [informed] opinions about how much of a performance
hit the spinlock causes.


The spinlock is not a source of performance loss, the reason for
moving parity calculations outside the lock is to maximize the benefit
of using asynchronous xor+copy engines.

The hardware vs software raid trade-offs are well documented here:
http://linux.yyz.us/why-software-raid.html

Regards,
Dan


Re: [PATCH 00/12] md raid acceleration and the async_tx api

2007-01-18 Thread Dan Williams

On 1/18/07, Yuri Tikhonov <[EMAIL PROTECTED]> wrote:


 Hello, Dan.

Hello.


 It seems there is a bug in your 06.11.30 raid acceleration patch-set. I tried
to run the Linux s/w RAID-5 driver patched with your 06.11.30 patch-set and
found that it fails during write operations when the RAID-5 array consists of
6 or more drives (I tested up to 8 drives). For 5 or fewer drives everything
works as expected. There are no such problems with your 06.09.12 set of
patches. Do you have any assumptions about the reasons of this fault?


Yes, sorry, there were bugs in the synchronous path around handling >
MAX_XOR_BLOCKS that I have fixed for the next rev of the patches.
I'll be releasing them shortly, but attached is a patch to address the
issue you are seeing.


The kernel I used was 2.6.19, your 06.11.30 patch-set was applied without any
warnings/errors. Here is the kernel Oops report:

Oops: kernel access of bad area, sig: 11 [#1]
NIP: C014F980 LR: C014FD0C CTR: 0080
REGS: eee49d40 TRAP: 0300   Not tainted  (2.6.19-g0726acdc-dirty)
MSR: 00029000   CR: 44002042  XER: 2000
DAR: 17970004, DSISR: 
TASK = eed5a7d0[280] 'md0_raid5_ops/0' THREAD: eee48000
GPR00: 007F EEE49DF0 EED5A7D0 0080 EEDFC000  19D7 1787
GPR08: 1000 C02B EEDFC000 C014F950 EEDFC000 3000 C015B8D8 1797
GPR16: C08AC180 C02B EEE0CB48 003A 000C 1000  0001
GPR24:  C015B8D8 0004 003A EEDFC000 0004 19D7 1787
NIP [C014F980] xor_32regs_4+0x30/0x158
LR [C014FD0C] xor_block+0xc4/0x12c
Call Trace:
[EEE49E40] [EEE49E58] 0xeee49e58
[EEE49E50] [C014EFAC] async_xor+0x134/0x200
[EEE49EB0] [C015A960] ops_run_postxor+0xf8/0x198
[EEE49F00] [C0162458] raid5_run_ops+0x8dc/0x994
[EEE49F50] [C0029F7C] run_workqueue+0xa4/0x118
[EEE49F70] [C002A198] worker_thread+0xf8/0x13c
[EEE49FC0] [C002E20C] kthread+0xf8/0x100
[EEE49FF0] [C0003DA0] kernel_thread+0x44/0x60
Instruction dump:
5463d97e 7c601b78 3400 9421ffb0 bde1000c 7c6903a6 7c8c2378 7caf2b78
7cde3378 7cff3b78 41800124 80ac <82ef0004> 82cf0008 82af000c 828f0010

 Regards, Yuri.


Thanks for testing the patches.

Regards,
Dan
diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c
index d918cc3..eee208d 100644
--- a/drivers/dma/async_tx.c
+++ b/drivers/dma/async_tx.c
@@ -324,9 +324,6 @@ async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx,
 }
 #endif
 
-#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common)
-#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, async_tx)
-
 static inline void
 async_tx_submit(struct dma_chan *chan, struct dma_async_tx_descriptor *tx,
 	enum async_tx_flags flags, struct dma_async_tx_descriptor *depend_tx,
@@ -423,17 +420,12 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	dma_async_tx_callback callback, void *callback_param)
 {
 	void *_dest;
-	int start_idx, i;
+	int i;
 
 	PRINTK("%s: len: %u\n", __FUNCTION__, len);
 
 	/* reuse the 'src_list' array to convert to buffer pointers */
-	if (flags & ASYNC_TX_XOR_DROP_DST)
-		start_idx = 1;
-	else
-		start_idx = 0;
-
-	for (i = start_idx; i < src_cnt; i++)
+	for (i = 0; i < src_cnt; i++)
 		src_list[i] = (struct page *)
 			(page_address(src_list[i]) + offset);
 
@@ -443,8 +435,8 @@ do_sync_xor(struct page *dest, struct page **src_list, unsigned int offset,
 	if (flags & ASYNC_TX_XOR_ZERO_DST)
 		memset(_dest, 0, len);
 
-	xor_block(src_cnt - start_idx, len, _dest,
-		(void **) &src_list[start_idx]);
+	xor_block(src_cnt, len, _dest,
+		(void **) src_list);
 
 	sync_epilog(flags, depend_tx, callback, callback_param);
 }
@@ -514,7 +506,15 @@ async_xor(struct page *dest, struct page **src_list, unsigned int offset,
 goto xor_sync;
 		} else { /* run the xor synchronously */
 xor_sync:
-			/* process up to 'max_xor_blocks' sources */
+			/* in the sync case the dest is an implied source
+			 * (assumes the dest is at the src_off index)
+			 */
+			if (flags & ASYNC_TX_XOR_DROP_DST) {
+src_cnt--;
+src_off++;
+			}
+
+			/* process up to 'MAX_XOR_BLOCKS' sources */
 			xor_src_cnt = min(src_cnt, (unsigned int) MAX_XOR_BLOCKS);
 
 			/* if we are submitting additional xors
@@ -540,9 +540,9 @@ xor_sync:
 		__FUNCTION__);
 			}
 
-			do_sync_xor(dest, &src_list[src_off], offset, src_cnt,
-len, local_flags, depend_tx, _callback,
-_callback_param);
+			do_sync_xor(dest, &src_list[src_off], offset,
+xor_src_cnt, len, local_flags, depend_tx,
+_callback, _callback_param);
 		}
 
 		/* the previous tx is hidden from the client,
@@ -556,13 +556,15 @@ xor_sync:
 		if (src_cnt > xor_src_cnt) {
 			/* drop completed sources */
 			src_cnt -= xor_src_cnt;
+			src_off += xor_src_cnt;
 
 			/* unconditionally preserve the destination */
 			flags &= ~ASYNC_TX_XOR_ZERO_DST;
 
-			/* use the intermediate result a source */
-			src_off = xor_src_cnt - 1;
-			src_list[src_off] = dest;
+			/* use the i

Re: What is the exacting meaning of Striped_Cache_Size?

2007-01-21 Thread Dan Williams

On 1/21/07, Liang Yang <[EMAIL PROTECTED]> wrote:

Hello,

I have tried to increase the Striped_Cache_Size from 256 (the default for my
MD-RAID5 array) to 8192, and it does improve MD-RAID5 write performance,
which varies with the size of the I/O packet.

However, I'm still not very clear on the meaning and the potential performance
impact of this Striped_Cache_Size. Is the unit for this parameter bytes or
kilobytes?

Could anyone here explain in a bit more detail?


stripe_cache_size is the number of stripes in the cache.  Each stripe
(strip) is composed of one PAGE_SIZE block per disk.  If your page size
is 4k and you have 4 disks in your array, then a stripe_cache_size of
256 is 4k * 4 * 256 = 4MB.

Increasing this number increases the chances that a write to the array
will not generate reads to satisfy the parity calculation.
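
For reference, the arithmetic above as a trivial program (page size, disk
count, and stripe_cache_size are just the example values from this thread):

#include <stdio.h>

int main(void)
{
	const long page_size         = 4096;	/* 4k pages */
	const long disks             = 4;	/* 4-member array */
	const long stripe_cache_size = 256;	/* the default in this example */
	long bytes = page_size * disks * stripe_cache_size;

	printf("stripe cache footprint: %ld bytes (%ld MiB)\n",
	       bytes, bytes >> 20);		/* 4k * 4 * 256 = 4 MiB */
	return 0;
}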


Thanks,

Liang


Regards,

Dan


Re: What is the exacting meaning of Striped_Cache_Size?

2007-01-22 Thread Dan Williams

On 1/21/07, Liang Yang <[EMAIL PROTECTED]> wrote:

Dan,

Thanks for your reply. I still have two questions left.

Suppose I have an MD-RAID5 array which consists of 8 disks.
1. Do we need to consider the chunk size of the RAID array when we set the
value of Striped_Cache_Size? For example, if the chunk size is changed from
64k to 256k, do we need to adjust the Striped_Cache_Size accordingly?


stripe_cache_size and the chunk size are completely independent
settings.  The chunk size only determines how much data will be
accessed from one disk before proceeding to the next disk.


2. The performance improvement for large I/O packets (128k, 256k) is
larger than for small I/O packets (512B, 1KB) when I change the
Striped_Cache_Size. How do you explain the difference here?


With smaller I/Os the chances that you are staying within one stripe
are higher, so you would see less benefit from having more stripes in
the cache.
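
Rough arithmetic behind that claim, using the page-per-disk definition of a
cache stripe from my earlier reply (alignment is ignored and all numbers are
only illustrative):

#include <stdio.h>

int main(void)
{
	const long page        = 4096;			/* assumed page size */
	const long data_disks  = 7;			/* e.g. 8-disk raid5 */
	const long stripe_span = page * data_disks;	/* array bytes per cache stripe */
	const long sizes[]     = { 512, 1024, 128 * 1024, 256 * 1024 };
	int i;

	for (i = 0; i < 4; i++)
		printf("%7ld-byte I/O touches roughly %ld cache stripe(s)\n",
		       sizes[i], sizes[i] / stripe_span + 1);
	return 0;
}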

Liang



Dan


Re: Kernel 2.6.19.2 New RAID 5 Bug (oops when writing Samba -> RAID5)

2007-01-22 Thread Dan Williams

On 1/22/07, Neil Brown <[EMAIL PROTECTED]> wrote:

On Monday January 22, [EMAIL PROTECTED] wrote:
> Justin Piszcz wrote:
> > My .config is attached, please let me know if any other information is
> > needed and please CC (lkml) as I am not on the list, thanks!
> >
> > Running Kernel 2.6.19.2 on a MD RAID5 volume.  Copying files over Samba to
> > the RAID5 running XFS.
> >
> > Any idea what happened here?

> >
> Without digging too deeply, I'd say you've hit the same bug Sami Farin
> and others
> have reported starting with 2.6.19: pages mapped with kmap_atomic()
> become unmapped
> during memcpy() or similar operations.  Try disabling preempt -- that
> seems to be the
> common factor.

That is exactly the conclusion I had just come to (a kmap_atomic page
must be being unmapped during memcpy).  I wasn't aware that others had
reported it - thanks for that.

Turning off CONFIG_PREEMPT certainly seems like a good idea.


Coming from an ARM background I am not yet versed in the inner
workings of kmap_atomic, but if you have time for a question I am
curious as to why spin_lock(&sh->lock)  is not sufficient pre-emption
protection for copy_data() in this case?


NeilBrown


Regards,
Dan


[PATCH 01/12] dmaengine: add base support for the async_tx api

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

* introduce struct dma_async_tx_descriptor as a common field for all dmaengine
software descriptors
* convert the device_memcpy_* methods into separate prep, set src/dest, and
submit stages (see the sketch after this list)
* support capabilities beyond memcpy (xor, memset, xor zero sum, completion
interrupts)
* convert ioatdma to the new semantics
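
For a sense of what the staged interface looks like, here is a toy,
synchronous mock-up of the flow; the function names only echo the real ops
validated in dma_async_device_register(), and every signature plus the fake
"engine" below is invented for illustration:

#include <stdio.h>
#include <string.h>

/* toy descriptor used to show the prep / set src,dest / submit split */
struct toy_tx {
	const void *src;
	void *dest;
	size_t len;
};

static struct toy_tx *toy_prep_memcpy(struct toy_tx *tx, size_t len)
{
	tx->len = len;				/* stage 1: describe the operation */
	return tx;
}

static void toy_set_src(struct toy_tx *tx, const void *src) { tx->src = src; }
static void toy_set_dest(struct toy_tx *tx, void *dest)     { tx->dest = dest; }

static int toy_tx_submit(struct toy_tx *tx)
{
	memcpy(tx->dest, tx->src, tx->len);	/* stage 3: hand off to the "engine" */
	return 0;				/* a real submit would return a cookie */
}

int main(void)
{
	char src[16] = "staged memcpy", dst[16] = "";
	struct toy_tx tx;

	toy_prep_memcpy(&tx, sizeof(src));	/* prep */
	toy_set_src(&tx, src);			/* set source */
	toy_set_dest(&tx, dst);			/* set destination */
	toy_tx_submit(&tx);			/* submit */
	printf("%s\n", dst);
	return 0;
}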

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |   44 ++--
 drivers/dma/ioatdma.c |  256 ++--
 drivers/dma/ioatdma.h |8 +
 include/linux/dmaengine.h |  263 ++---
 4 files changed, 394 insertions(+), 177 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 1527804..8d203ad 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -210,7 +210,8 @@ static void dma_chans_rebalance(void)
mutex_lock(&dma_list_mutex);
 
list_for_each_entry(client, &dma_client_list, global_node) {
-   while (client->chans_desired > client->chan_count) {
+   while (client->chans_desired < 0 ||
+   client->chans_desired > client->chan_count) {
chan = dma_client_chan_alloc(client);
if (!chan)
break;
@@ -219,7 +220,8 @@ static void dma_chans_rebalance(void)
   chan,
   DMA_RESOURCE_ADDED);
}
-   while (client->chans_desired < client->chan_count) {
+   while (client->chans_desired >= 0 &&
+   client->chans_desired < client->chan_count) {
spin_lock_irqsave(&client->lock, flags);
chan = list_entry(client->channels.next,
  struct dma_chan,
@@ -294,12 +296,12 @@ void dma_async_client_unregister(struct dma_client 
*client)
  * @number: count of DMA channels requested
  *
  * Clients call dma_async_client_chan_request() to specify how many
- * DMA channels they need, 0 to free all currently allocated.
+ * DMA channels they need, 0 to free all currently allocated. A request
+ * < 0 indicates the client wants to handle all engines in the system.
  * The resulting allocations/frees are indicated to the client via the
  * event callback.
  */
-void dma_async_client_chan_request(struct dma_client *client,
-   unsigned int number)
+void dma_async_client_chan_request(struct dma_client *client, int number)
 {
client->chans_desired = number;
dma_chans_rebalance();
@@ -318,6 +320,31 @@ int dma_async_device_register(struct dma_device *device)
if (!device)
return -ENODEV;
 
+   /* validate device routines */
+   BUG_ON(test_bit(DMA_MEMCPY, &device->capabilities) &&
+   !device->device_prep_dma_memcpy);
+   BUG_ON(test_bit(DMA_XOR, &device->capabilities) &&
+   !device->device_prep_dma_xor);
+   BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) &&
+   !device->device_prep_dma_zero_sum);
+   BUG_ON(test_bit(DMA_MEMSET, &device->capabilities) &&
+   !device->device_prep_dma_memset);
+   BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) &&
+   !device->device_prep_dma_interrupt);
+
+   BUG_ON(!device->device_alloc_chan_resources);
+   BUG_ON(!device->device_free_chan_resources);
+   BUG_ON(!device->device_tx_submit);
+   BUG_ON(!device->device_set_dest);
+   BUG_ON(!device->device_set_src);
+   BUG_ON(!device->device_dependency_added);
+   BUG_ON(!device->device_is_tx_complete);
+   BUG_ON(!device->map_page);
+   BUG_ON(!device->map_single);
+   BUG_ON(!device->unmap_page);
+   BUG_ON(!device->unmap_single);
+   BUG_ON(!device->device_issue_pending);
+
init_completion(&device->done);
kref_init(&device->refcount);
device->dev_id = id++;
@@ -402,11 +429,8 @@ subsys_initcall(dma_bus_init);
 EXPORT_SYMBOL(dma_async_client_register);
 EXPORT_SYMBOL(dma_async_client_unregister);
 EXPORT_SYMBOL(dma_async_client_chan_request);
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
-EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
-EXPORT_SYMBOL(dma_async_memcpy_complete);
-EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_is_tx_complete);
+EXPORT_SYMBOL(dma_async_issue_pending);
 EXPORT_SYMBOL(dma_async_device_register);
 EXPORT_SYMBOL(dma_async_device_unregister);
 EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 8e87261..70b

[PATCH 08/12] md: satisfy raid5 read requests via raid5_run_ops

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Use raid5_run_ops to carry out the memory copies for a raid5 read request.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   40 +++-
 1 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2422253..db8925f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh)
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int compute=0, req_compute=0, non_overwrite=0;
+   int to_fill=0, compute=0, req_compute=0, non_overwrite=0;
int failed_num=0;
struct r5dev *dev;
unsigned long pending=0;
@@ -2004,34 +2004,20 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
 
-   PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
-   i, dev->flags, dev->toread, dev->towrite, dev->written);
-   /* maybe we can reply to a read */
+   PRINTK("check %d: state 0x%lx toread %p read %p write %p 
written %p\n",
+   i, dev->flags, dev->toread, dev->read, dev->towrite, 
dev->written);
+
+   /* maybe we can start a biofill operation */
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-   struct bio *rbi, *rbi2;
-   PRINTK("Return read for disc %d\n", i);
-   spin_lock_irq(&conf->device_lock);
-   rbi = dev->toread;
-   dev->toread = NULL;
-   if (test_and_clear_bit(R5_Overlap, &dev->flags))
-   wake_up(&conf->wait_for_overlap);
-   spin_unlock_irq(&conf->device_lock);
-   while (rbi && rbi->bi_sector < dev->sector + 
STRIPE_SECTORS) {
-   copy_data(0, rbi, dev->page, dev->sector);
-   rbi2 = r5_next_bio(rbi, dev->sector);
-   spin_lock_irq(&conf->device_lock);
-   if (--rbi->bi_phys_segments == 0) {
-   rbi->bi_next = return_bi;
-   return_bi = rbi;
-   }
-   spin_unlock_irq(&conf->device_lock);
-   rbi = rbi2;
-   }
+   to_read--;
+   if (!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+   set_bit(R5_Wantfill, &dev->flags);
}
 
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+   if (test_bit(R5_Wantfill, &dev->flags)) to_fill++;
if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
if (dev->toread) to_read++;
@@ -2055,9 +2041,13 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(R5_Insync, &dev->flags);
}
rcu_read_unlock();
+
+   if (to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+   sh->ops.count++;
+
PRINTK("locked=%d uptodate=%d to_read=%d"
-   " to_write=%d failed=%d failed_num=%d\n",
-   locked, uptodate, to_read, to_write, failed, failed_num);
+   " to_write=%d to_fill=%d failed=%d failed_num=%d\n",
+   locked, uptodate, to_read, to_write, to_fill, failed, 
failed_num);
/* check if the array has lost two devices and, if so, some requests 
might
 * need to be failed
 */


[PATCH 03/12] md: add raid5_run_ops and support routines

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Prepare the raid5 implementation to use async_tx for running stripe
operations:
* biofill (copy data into request buffers to satisfy a read request)
* compute block (generate a missing block in the cache from the other
blocks)
* prexor (subtract existing data as part of the read-modify-write process)
* biodrain (copy data out of request buffers to satisfy a write request)
* postxor (recalculate parity for new data that has entered the cache)
* check (verify that the parity is correct)
* io (submit i/o to the member disks)

Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
ops_complete_write.
* removed the workqueue
* call bi_end_io for reads in ops_complete_biofill

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  520 
 include/linux/raid/raid5.h |   63 +
 2 files changed, 580 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 68b6fea..e70ee17 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
 #include "raid6.h"
 
 #include 
+#include 
 
 /*
  * Stripe cache
@@ -324,6 +325,525 @@ static struct stripe_head *get_active_stripe(raid5_conf_t 
*conf, sector_t sector
return sh;
 }
 
+static int
+raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error);
+static int
+raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+
+static void ops_run_io(struct stripe_head *sh)
+{
+   raid5_conf_t *conf = sh->raid_conf;
+   int i, disks = sh->disks;
+
+   might_sleep();
+
+   for (i=disks; i-- ;) {
+   int rw;
+   struct bio *bi;
+   mdk_rdev_t *rdev;
+   if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
+   rw = WRITE;
+   else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+   rw = READ;
+   else
+   continue;
+
+   bi = &sh->dev[i].req;
+
+   bi->bi_rw = rw;
+   if (rw == WRITE)
+   bi->bi_end_io = raid5_end_write_request;
+   else
+   bi->bi_end_io = raid5_end_read_request;
+
+   rcu_read_lock();
+   rdev = rcu_dereference(conf->disks[i].rdev);
+   if (rdev && test_bit(Faulty, &rdev->flags))
+   rdev = NULL;
+   if (rdev)
+   atomic_inc(&rdev->nr_pending);
+   rcu_read_unlock();
+
+   if (rdev) {
+   if (test_bit(STRIPE_SYNCING, &sh->state) ||
+   test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
+   test_bit(STRIPE_EXPAND_READY, &sh->state))
+   md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+   bi->bi_bdev = rdev->bdev;
+   PRINTK("%s: for %llu schedule op %ld on disc %d\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   bi->bi_rw, i);
+   atomic_inc(&sh->count);
+   bi->bi_sector = sh->sector + rdev->data_offset;
+   bi->bi_flags = 1 << BIO_UPTODATE;
+   bi->bi_vcnt = 1;
+   bi->bi_max_vecs = 1;
+   bi->bi_idx = 0;
+   bi->bi_io_vec = &sh->dev[i].vec;
+   bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+   bi->bi_io_vec[0].bv_offset = 0;
+   bi->bi_size = STRIPE_SIZE;
+   bi->bi_next = NULL;
+   if (rw == WRITE &&
+   test_bit(R5_ReWrite, &sh->dev[i].flags))
+   atomic_add(STRIPE_SECTORS, 
&rdev->corrected_errors);
+   generic_make_request(bi);
+   } else {
+   if (rw == WRITE)
+   set_bit(STRIPE_DEGRADED, &sh->state);
+   PRINTK("skip op %ld on disc %d for sector %llu\n",
+   bi->bi_rw, i, (unsigned long long)sh->sector);
+   clear_bit(R5_LOCKED, &sh->dev[i].flags);
+   set_bit(STRIPE_HANDLE, &sh->state);
+   }
+   }
+}
+
+static struct dma_async_tx_descriptor *
+async_copy_data(int frombio, struct bio *bio, struct page *page, sector_t 
sector,
+   struct dma_async_tx_descriptor *tx)
+{
+   struct bio_vec *bvl;
+   struct page *bio_page;
+   int i;
+   int page_

[PATCH 04/12] md: use raid5_run_ops for stripe cache operations

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Each stripe has three flag variables to reflect the state of operations
(pending, ack, and complete).
-pending: set to request servicing in raid5_run_ops
-ack: set to reflect that raid5_run_ops has seen this request
-complete: set when the operation is complete and it is ok for handle_stripe5
to clear 'pending' and 'ack'.
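
A minimal userspace model of that three-flag handshake for a single
operation bit; handle_stripe5 and raid5_run_ops are collapsed into plain
functions, so this is a sketch of the protocol only, not the kernel code:

#include <stdio.h>

#define OP_CHECK 0			/* one example operation bit */

struct ops_state {
	unsigned long pending, ack, complete;
};

/* handle_stripe side: request servicing of an operation */
static void request_op(struct ops_state *s, int op)
{
	s->pending |= 1UL << op;
}

/* raid5_run_ops side: only service ops that are pending but not yet acked */
static void service_ops(struct ops_state *s)
{
	unsigned long todo = s->pending & ~s->ack;
	int op;

	for (op = 0; op < 8; op++)
		if (todo & (1UL << op)) {
			s->ack |= 1UL << op;		/* mark the request as seen */
			/* ... the actual work would run here ... */
			s->complete |= 1UL << op;	/* mark it done */
		}
}

/* handle_stripe side: reap a finished op and clear all three bits */
static void reap_op(struct ops_state *s, int op)
{
	if (s->complete & (1UL << op)) {
		s->complete &= ~(1UL << op);
		s->ack      &= ~(1UL << op);
		s->pending  &= ~(1UL << op);
		printf("op %d finished and cleared\n", op);
	}
}

int main(void)
{
	struct ops_state s = { 0, 0, 0 };

	request_op(&s, OP_CHECK);
	service_ops(&s);
	reap_op(&s, OP_CHECK);
	return 0;
}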

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   65 +---
 1 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e70ee17..2c74f9b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -126,6 +126,7 @@ static void __release_stripe(raid5_conf_t *conf, struct 
stripe_head *sh)
}
md_wakeup_thread(conf->mddev->thread);
} else {
+   BUG_ON(sh->ops.pending);
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state)) {
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) 
< IO_THRESHOLD)
@@ -225,7 +226,8 @@ static void init_stripe(struct stripe_head *sh, sector_t 
sector, int pd_idx, int
 
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
-   
+   BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
+
CHECK_DEVLOCK();
PRINTK("init_stripe called, stripe %llu\n", 
(unsigned long long)sh->sector);
@@ -241,11 +243,11 @@ static void init_stripe(struct stripe_head *sh, sector_t 
sector, int pd_idx, int
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
 
-   if (dev->toread || dev->towrite || dev->written ||
+   if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
-   printk("sector=%llx i=%d %p %p %p %d\n",
+   printk("sector=%llx i=%d %p %p %p %p %d\n",
   (unsigned long long)sh->sector, i, dev->toread,
-  dev->towrite, dev->written,
+  dev->read, dev->towrite, dev->written,
   test_bit(R5_LOCKED, &dev->flags));
BUG();
}
@@ -325,6 +327,43 @@ static struct stripe_head *get_active_stripe(raid5_conf_t 
*conf, sector_t sector
return sh;
 }
 
+/* check_op() ensures that we only dequeue an operation once */
+#define check_op(op) do {\
+   if (test_bit(op, &sh->ops.pending) &&\
+   !test_bit(op, &sh->ops.complete)) {\
+   if (test_and_set_bit(op, &sh->ops.ack))\
+   clear_bit(op, &pending);\
+   else\
+   ack++;\
+   } else\
+   clear_bit(op, &pending);\
+} while(0)
+
+/* find new work to run, do not resubmit work that is already
+ * in flight
+ */
+static unsigned long get_stripe_work(struct stripe_head *sh)
+{
+   unsigned long pending;
+   int ack = 0;
+
+   pending = sh->ops.pending;
+
+   check_op(STRIPE_OP_BIOFILL);
+   check_op(STRIPE_OP_COMPUTE_BLK);
+   check_op(STRIPE_OP_PREXOR);
+   check_op(STRIPE_OP_BIODRAIN);
+   check_op(STRIPE_OP_POSTXOR);
+   check_op(STRIPE_OP_CHECK);
+   if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
+   ack++;
+
+   sh->ops.count -= ack;
+   BUG_ON(sh->ops.count < 0);
+
+   return pending;
+}
+
 static int
 raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error);
 static int
@@ -1859,7 +1898,6 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t 
*conf, int disks)
  *schedule a write of some buffers
  *return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
  * get BH_Lock set before the stripe lock is released.
  *
@@ -1877,10 +1915,11 @@ static void handle_stripe5(struct stripe_head *sh)
int non_overwrite = 0;
int failed_num=0;
struct r5dev *dev;
+   unsigned long pending=0;
 
-   PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
-   (unsigned long long)sh->sector, atomic_read(&sh->count),
-   sh->pd_idx);
+   PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d 
ops=%lx:%lx:%lx\n",
+  (unsigned long long)sh->sector, sh->state, 
atomic_read(&sh->count),
+  sh->pd_id

[PATCH 12/12] dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor,
pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy
operations.

Changelog:
* fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few
slots to be requested eventually leading to data corruption
* enabled the slot allocation routine to attempt to free slots before
returning -ENOMEM
* switched the cleanup routine to solely use the software chain and the
status register to determine if a descriptor is complete.  This is
necessary to support other IOP engines that do not have status writeback
capability
* make the driver iop generic
* modified the allocation routines to understand allocating a group of
slots for a single operation
* added a null xor initialization operation for the xor only channel on
iop3xx
* support xor operations on buffers larger than the hardware maximum
* split the do_* routines into separate prep, src/dest set, submit stages
* added async_tx support (dependent operations initiation at cleanup time)
* simplified group handling
* added interrupt support (callbacks via tasklets)
* brought the pending depth inline with ioat (i.e. 4 descriptors)

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/Kconfig |8 
 drivers/dma/Makefile|1 
 drivers/dma/iop-adma.c  | 1511 +++
 include/asm-arm/hardware/iop_adma.h |  116 +++
 4 files changed, 1636 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index c82ed5f..d61e3e5 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -41,4 +41,12 @@ config INTEL_IOATDMA
default m
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
+
+config INTEL_IOP_ADMA
+tristate "Intel IOP ADMA support"
+depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX)
+default m
+---help---
+  Enable support for the Intel(R) IOP Series RAID engines.
+
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6a99341..8ebf10d 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
 obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
new file mode 100644
index 000..77f859e
--- /dev/null
+++ b/drivers/dma/iop-adma.c
@@ -0,0 +1,1511 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports the asynchronous DMA copy and RAID engines available
+ * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x)
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common)
+#define to_iop_adma_device(dev) container_of(dev, struct iop_adma_device, 
common)
+#define to_iop_adma_slot(lh) container_of(lh, struct iop_adma_desc_slot, 
slot_node)
+#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, 
async_tx)
+
+#define IOP_ADMA_DEBUG 0
+#define PRINTK(x...) ((void)(IOP_ADMA_DEBUG && printk(x)))
+
+/**
+ * iop_adma_free_slots - flags descriptor slots for reuse
+ * @slot: Slot to free
+ * Caller must hold &iop_chan->lock while calling this function
+ */
+static inline void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
+{
+   int stride = slot->stride;
+
+   while (stride--) {
+   slot->stride = 0;
+   slot = list_entry(slot->slot_node.next,
+   struct iop_adma_desc_slot,
+   slot_node);
+   }
+}
+
+static inline dma_cookie_t
+iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
+   struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
+{
+   BUG_ON(

[PATCH 07/12] md: move raid5 parity checks to raid5_run_ops

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_CHECK to request a check operation in
raid5_run_ops.  If raid5_run_ops is able to perform the check with a
dma engine, the parity will be preserved in memory, removing the need to
re-read it from disk as is necessary in the synchronous case.

'Repair' operations re-use the same logic as compute block, with the caveat
that the results of the compute block are immediately written back to the
parity disk.  To differentiate these operations the STRIPE_OP_MOD_REPAIR_PD
flag is added.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   81 
 1 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 279a30c..2422253 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2411,32 +2411,75 @@ static void handle_stripe5(struct stripe_head *sh)
locked += handle_write_operations5(sh, rcw, 0);
}
 
-   /* maybe we need to check and possibly fix the parity for this stripe
-* Any reads will already have been scheduled, so we just see if enough 
data
-* is available
+   /* 1/ Maybe we need to check and possibly fix the parity for this 
stripe.
+*Any reads will already have been scheduled, so we just see if 
enough data
+*is available.
+* 2/ Hold off parity checks while parity dependent operations are in 
flight
+*(conflicting writes are protected by the 'locked' variable)
 */
-   if (syncing && locked == 0 &&
-   !test_bit(STRIPE_INSYNC, &sh->state)) {
+   if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, 
&sh->ops.pending) &&
+   !test_bit(STRIPE_INSYNC, &sh->state)) ||
+   test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
+   test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+
set_bit(STRIPE_HANDLE, &sh->state);
-   if (failed == 0) {
-   BUG_ON(uptodate != disks);
-   compute_parity5(sh, CHECK_PARITY);
-   uptodate--;
-   if (page_is_zero(sh->dev[sh->pd_idx].page)) {
-   /* parity is correct (on disc, not in buffer 
any more) */
-   set_bit(STRIPE_INSYNC, &sh->state);
-   } else {
-   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
-   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
-   /* don't try to repair!! */
+   /* Take one of the following actions:
+* 1/ start a check parity operation if (uptodate == disks)
+* 2/ finish a check parity operation and act on the result
+* 3/ skip to the writeback section if we previously
+*initiated a recovery operation
+*/
+   if (failed == 0 && !test_bit(STRIPE_OP_MOD_REPAIR_PD, 
&sh->ops.pending)) {
+   if (!test_and_set_bit(STRIPE_OP_CHECK, 
&sh->ops.pending)) {
+   BUG_ON(uptodate != disks);
+   clear_bit(R5_UPTODATE, 
&sh->dev[sh->pd_idx].flags);
+   sh->ops.count++;
+   uptodate--;
+   } else if (test_and_clear_bit(STRIPE_OP_CHECK, 
&sh->ops.complete)) {
+   clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+   clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+
+   if (sh->ops.zero_sum_result == 0)
+   /* parity is correct (on disc, not in 
buffer any more) */
set_bit(STRIPE_INSYNC, &sh->state);
else {
-   compute_block(sh, sh->pd_idx);
-   uptodate++;
+   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
+   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
+   /* don't try to repair!! */
+   set_bit(STRIPE_INSYNC, 
&sh->state);
+   else {
+   BUG_ON(test_and_set_bit(
+   STRIPE_OP_COMPUTE_BLK,
+   &sh->ops.pe

[PATCH 11/12] md: remove raid5 compute_block and compute_parity5

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

replaced by raid5_run_ops

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  124 
 1 files changed, 0 insertions(+), 124 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8af084f..a981c35 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1480,130 +1480,6 @@ static void copy_data(int frombio, struct bio *bio,
   }   \
} while(0)
 
-
-static void compute_block(struct stripe_head *sh, int dd_idx)
-{
-   int i, count, disks = sh->disks;
-   void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-
-   PRINTK("compute_block, stripe %llu, idx %d\n", 
-   (unsigned long long)sh->sector, dd_idx);
-
-   dest = page_address(sh->dev[dd_idx].page);
-   memset(dest, 0, STRIPE_SIZE);
-   count = 0;
-   for (i = disks ; i--; ) {
-   if (i == dd_idx)
-   continue;
-   p = page_address(sh->dev[i].page);
-   if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-   ptr[count++] = p;
-   else
-   printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
-   " not present\n", dd_idx,
-   (unsigned long long)sh->sector, i);
-
-   check_xor();
-   }
-   if (count)
-   xor_block(count, STRIPE_SIZE, dest, ptr);
-   set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-}
-
-static void compute_parity5(struct stripe_head *sh, int method)
-{
-   raid5_conf_t *conf = sh->raid_conf;
-   int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-   void *ptr[MAX_XOR_BLOCKS], *dest;
-   struct bio *chosen;
-
-   PRINTK("compute_parity5, stripe %llu, method %d\n",
-   (unsigned long long)sh->sector, method);
-
-   count = 0;
-   dest = page_address(sh->dev[pd_idx].page);
-   switch(method) {
-   case READ_MODIFY_WRITE:
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
-   for (i=disks ; i-- ;) {
-   if (i==pd_idx)
-   continue;
-   if (sh->dev[i].towrite &&
-   test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   check_xor();
-   }
-   }
-   break;
-   case RECONSTRUCT_WRITE:
-   memset(dest, 0, STRIPE_SIZE);
-   for (i= disks; i-- ;)
-   if (i!=pd_idx && sh->dev[i].towrite) {
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   }
-   break;
-   case CHECK_PARITY:
-   break;
-   }
-   if (count) {
-   xor_block(count, STRIPE_SIZE, dest, ptr);
-   count = 0;
-   }
-   
-   for (i = disks; i--;)
-   if (sh->dev[i].written) {
-   sector_t sector = sh->dev[i].sector;
-   struct bio *wbi = sh->dev[i].written;
-   while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) 
{
-   copy_data(1, wbi, sh->dev[i].page, sector);
-   wbi = r5_next_bio(wbi, sector);
-   }
-
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   set_bit(R5_UPTODATE, &sh->dev[i].flags);
-   }
-
-   switch(method) {
-   case RECONSTRUCT_WRITE:
-   case CHECK_PARITY:
-   for (i=disks; i--;)
-   if (i != pd_idx) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   check_xor();
- 

[PATCH 10/12] md: move raid5 io requests to raid5_run_ops

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe now only updates the state of stripes.  All execution of
operations is moved to raid5_run_ops.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   68 
 1 files changed, 10 insertions(+), 58 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1956b3c..8af084f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2360,6 +2360,8 @@ static void handle_stripe5(struct stripe_head *sh)
PRINTK("Read_old block %d for 
r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, 
&dev->flags);
+   if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
locked++;
} else {
set_bit(STRIPE_DELAYED, 
&sh->state);
@@ -2380,6 +2382,8 @@ static void handle_stripe5(struct stripe_head *sh)
PRINTK("Read_old block %d for 
Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, 
&dev->flags);
+   if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
locked++;
} else {
set_bit(STRIPE_DELAYED, 
&sh->state);
@@ -2479,6 +2483,8 @@ static void handle_stripe5(struct stripe_head *sh)
 
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
clear_bit(STRIPE_DEGRADED, &sh->state);
locked++;
set_bit(STRIPE_INSYNC, &sh->state);
@@ -2500,12 +2506,16 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
set_bit(R5_LOCKED, &dev->flags);
locked++;
}
@@ -2615,64 +2625,6 @@ static void handle_stripe5(struct stripe_head *sh)
  test_bit(BIO_UPTODATE, &bi->bi_flags)
? 0 : -EIO);
}
-   for (i=disks; i-- ;) {
-   int rw;
-   struct bio *bi;
-   mdk_rdev_t *rdev;
-   if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-   rw = WRITE;
-   else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-   rw = READ;
-   else
-   continue;
- 
-   bi = &sh->dev[i].req;
- 
-   bi->bi_rw = rw;
-   if (rw == WRITE)
-   bi->bi_end_io = raid5_end_write_request;
-   else
-   bi->bi_end_io = raid5_end_read_request;
- 
-   rcu_read_lock();
-   rdev = rcu_dereference(conf->disks[i].rdev);
-   if (rdev && test_bit(Faulty, &rdev->flags))
-   rdev = NULL;
-   if (rdev)
-   atomic_inc(&rdev->nr_pending);
-   rcu_read_unlock();
- 
-   if (rdev) {
-   if (syncing || expanding || expanded)
-   md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-   bi->bi_bdev = rdev->bdev;
-   PRINTK("for %llu schedule op %ld on disc %d\n",
-

[PATCH 09/12] md: use async_tx and raid5_run_ops for raid5 expansion operations

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

The parity calculation for an expansion operation is the same as the
calculation performed at the end of a write with the caveat that all blocks
in the stripe are scheduled to be written.  An expansion operation is
identified as a stripe with the POSTXOR flag set and the BIODRAIN flag not
set.

The bulk copy operation to the new stripe is handled inline by async_tx.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   48 
 1 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index db8925f..1956b3c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2511,18 +2511,32 @@ static void handle_stripe5(struct stripe_head *sh)
}
}
 
-   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
-   /* Need to write out all blocks after computing parity */
-   sh->disks = conf->raid_disks;
-   sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks);
-   compute_parity5(sh, RECONSTRUCT_WRITE);
+   /* Finish postxor operations initiated by the expansion
+* process
+*/
+   if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
+   !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
+
+   clear_bit(STRIPE_EXPANDING, &sh->state);
+
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
for (i= conf->raid_disks; i--;) {
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   locked++;
set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
}
-   clear_bit(STRIPE_EXPANDING, &sh->state);
-   } else if (expanded) {
+   }
+
+   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+   /* Need to write out all blocks after computing parity */
+   sh->disks = conf->raid_disks;
+   sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks);
+   locked += handle_write_operations5(sh, 0, 1);
+   } else if (expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
@@ -2533,6 +2547,7 @@ static void handle_stripe5(struct stripe_head *sh)
/* We have read all the blocks in this stripe and now we need to
 * copy some of them into a target stripe for expand.
 */
+   struct dma_async_tx_descriptor *tx = NULL;
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i=0; i< sh->disks; i++)
if (i != sh->pd_idx) {
@@ -2556,9 +2571,12 @@ static void handle_stripe5(struct stripe_head *sh)
release_stripe(sh2);
continue;
}
-   memcpy(page_address(sh2->dev[dd_idx].page),
-  page_address(sh->dev[i].page),
-  STRIPE_SIZE);
+
+   /* place all the copies on one channel */
+   tx = async_memcpy(sh2->dev[dd_idx].page,
+   sh->dev[i].page, 0, 0, STRIPE_SIZE,
+   ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
for (j=0; j<conf->raid_disks; j++)
@@ -2570,6 +2588,12 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(STRIPE_HANDLE, &sh2->state);
}
release_stripe(sh2);
+
+   /* done submitting copies, wait for them to 
complete */
+   if (i + 1 >= sh->disks) {
+   async_tx_ack(tx);
+   dma_wait_for_async_tx(tx);
+   }
}
}
 


[PATCH 02/12] dmaengine: add the async_tx api

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

async_tx is an api to describe a series of bulk memory
transfers/transforms.  When possible these transactions are carried out by
asynchronous dma engines.  The api handles inter-transaction dependencies
and hides dma channel management from the client.  When a dma engine is not
present the transaction is carried out via synchronous software routines.

Xor operations are handled by async_tx, to this end xor.c is moved into
drivers/dma and is changed to take an explicit destination address and
a series of sources to match the hardware engine implementation.

When CONFIG_DMA_ENGINE is not set the asynchronous path is compiled away.
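
The core idea is small enough to model in userspace: try an offload channel, fall back to a synchronous software routine when none is available, and hand back a token that a later operation can depend on. The names below are made up for illustration and are not the kernel api:

#include <string.h>

/* Userspace model of the async_tx fallback idea (hypothetical names,
 * not the kernel interface).  A small token stands in for a dma
 * descriptor so a later operation can express a dependency on this one. */
struct tx_token {
	int submitted;
};

static int offload_channel_available(void)
{
	return 0;	/* this model has no engine, so always fall back */
}

static struct tx_token *model_memcpy(struct tx_token *tok, void *dst,
				     const void *src, size_t len,
				     struct tx_token *depend)
{
	/* a real implementation would defer until 'depend' completes;
	 * in the synchronous model the dependency has already run */
	(void)depend;

	if (offload_channel_available()) {
		/* prep + submit a hardware descriptor here */
	} else {
		memcpy(dst, src, len);		/* software path */
	}
	tok->submitted = 1;
	return tok;
}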

Changelog:
* fixed a leftover debug print
* don't allow callbacks in async_interrupt_cond
* fixed xor_block changes
* fixed usage of ASYNC_TX_XOR_DROP_DEST

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/Makefile |1 
 drivers/dma/Kconfig  |   16 +
 drivers/dma/Makefile |1 
 drivers/dma/async_tx.c   |  910 ++
 drivers/dma/xor.c|  153 
 drivers/md/Kconfig   |2 
 drivers/md/Makefile  |6 
 drivers/md/raid5.c   |   52 +--
 drivers/md/xor.c |  154 
 include/linux/async_tx.h |  180 +
 include/linux/raid/xor.h |5 
 11 files changed, 1291 insertions(+), 189 deletions(-)

diff --git a/drivers/Makefile b/drivers/Makefile
index 0dd96d1..7d55837 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_I2C) += i2c/
 obj-$(CONFIG_W1)   += w1/
 obj-$(CONFIG_HWMON)+= hwmon/
 obj-$(CONFIG_PHONE)+= telephony/
+obj-$(CONFIG_ASYNC_TX_DMA) += dma/
 obj-$(CONFIG_MD)   += md/
 obj-$(CONFIG_BT)   += bluetooth/
 obj-$(CONFIG_ISDN) += isdn/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 30d021d..c82ed5f 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -7,8 +7,8 @@ menu "DMA Engine support"
 config DMA_ENGINE
bool "Support for DMA engines"
---help---
- DMA engines offload copy operations from the CPU to dedicated
- hardware, allowing the copies to happen asynchronously.
+  DMA engines offload bulk memory operations from the CPU to dedicated
+  hardware, allowing the operations to happen asynchronously.
 
 comment "DMA Clients"
 
@@ -22,6 +22,17 @@ config NET_DMA
  Since this is the main user of the DMA engine, it should be enabled;
  say Y here.
 
+config ASYNC_TX_DMA
+   tristate "Asynchronous Bulk Memory Transfers/Transforms API"
+   default y
+   ---help---
+ This enables the async_tx management layer for dma engines.
+ Subsystems coded to this API will use offload engines for bulk
+ memory operations where present.  Software implementations are
+ called when a dma engine is not present or fails to allocate
+ memory to carry out the transaction.
+ Current subsystems ported to async_tx: MD_RAID4,5
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
@@ -30,5 +41,4 @@ config INTEL_IOATDMA
default m
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
-
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index bdcfdbd..6a99341 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o
diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c
new file mode 100644
index 000..eee208d
--- /dev/null
+++ b/drivers/dma/async_tx.c
@@ -0,0 +1,910 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define ASYNC_TX_DEBUG 0
+#define PRINTK(x...) ((void)(ASYNC_TX_DEBUG && printk(x)))
+
+#ifdef CONFIG_DMA_ENGINE
+static struct dma_client *a

[PATCH 06/12] md: move raid5 compute block operations to raid5_run_ops

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_COMPUTE_BLK to request servicing from
raid5_run_ops.  It also sets a flag for the block being computed to let
other parts of handle_stripe submit dependent operations.  raid5_run_ops
guarantees that the compute operation completes before any dependent
operation starts.
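
In RAID5 terms the compute operation is just the XOR of every other block in the stripe, parity included. A rough userspace illustration, with an arbitrary stripe width and flat buffers instead of cache pages:

#include <string.h>

#define BLOCKS     4		/* data blocks plus parity (illustrative) */
#define BLOCK_SIZE 4096

/* rebuild the missing block at index 'missing' from the others:
 * target = XOR of all remaining blocks in the stripe */
static void compute_missing_block(unsigned char blk[BLOCKS][BLOCK_SIZE],
				  int missing)
{
	size_t i;
	int b;

	memset(blk[missing], 0, BLOCK_SIZE);
	for (b = 0; b < BLOCKS; b++) {
		if (b == missing)
			continue;
		for (i = 0; i < BLOCK_SIZE; i++)
			blk[missing][i] ^= blk[b][i];
	}
}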

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  125 +++-
 1 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2390657..279a30c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh)
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int non_overwrite = 0;
+   int compute=0, req_compute=0, non_overwrite=0;
int failed_num=0;
struct r5dev *dev;
unsigned long pending=0;
@@ -2032,8 +2032,8 @@ static void handle_stripe5(struct stripe_head *sh)
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+   if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
-   
if (dev->toread) to_read++;
if (dev->towrite) {
to_write++;
@@ -2188,31 +2188,82 @@ static void handle_stripe5(struct stripe_head *sh)
 * parity, or to satisfy requests
 * or to load a block that is being partially written.
 */
-   if (to_read || non_overwrite || (syncing && (uptodate < disks)) || 
expanding) {
-   for (i=disks; i--;) {
-   dev = &sh->dev[i];
-   if (!test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
-   (dev->toread ||
-(dev->towrite && !test_bit(R5_OVERWRITE, 
&dev->flags)) ||
-syncing ||
-expanding ||
-(failed && (sh->dev[failed_num].toread ||
-(sh->dev[failed_num].towrite && 
!test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags
-   )
-   ) {
-   /* we would like to get this block, possibly
-* by computing it, but we might not be able to
+   if (to_read || non_overwrite || (syncing && (uptodate + compute < 
disks)) || expanding ||
+   test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+
+   /* Clear completed compute operations.  Parity recovery
+* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is 
handled
+* later on in this routine
+*/
+   if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+   !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+   }
+
+   /* look for blocks to read/compute, skip this if a compute
+* is already in flight, or if the stripe contents are in the
+* midst of changing due to a write
+*/
+   if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+   !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+   for (i=disks; i--;) {
+   dev = &sh->dev[i];
+
+   /* don't schedule compute operations or reads on
+* the parity block while a check is in flight
 */
-   if (uptodate == disks-1) {
-   PRINTK("Computing block %d\n", i);
-   compute_block(sh, i);
-   uptodate++;
-   } else if (test_bit(R5_Insync, &dev->flags)) {
-   set_bit(R5_LOCKED, &dev->flags);
-   set_bit(R5_Wantread, &dev->flags);
-   locked++;
-  

[PATCH 05/12] md: move write operations to raid5_run_ops

2007-01-22 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_PREXOR, STRIPE_OP_BIODRAIN, STRIPE_OP_POSTXOR
to request a write to the stripe cache.  raid5_run_ops is triggered to run
and executes the request outside the stripe lock.
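
The prexor step is what makes a read-modify-write cheap: XORing the old contents of the rewritten blocks out of the parity leaves a partial parity, and the biodrain plus postxor steps fold the new data back in, i.e. new_parity = old_parity ^ old_data ^ new_data. A one-block userspace sketch of that arithmetic (no locking, no bios):

#include <stddef.h>

#define BLOCK_SIZE 4096

/* read-modify-write parity update for a single rewritten block:
 * step 1 (prexor)  - xor the old data out of the parity
 * step 2 (postxor) - xor the new data into the parity */
static void rmw_parity_update(unsigned char parity[BLOCK_SIZE],
			      const unsigned char old_data[BLOCK_SIZE],
			      const unsigned char new_data[BLOCK_SIZE])
{
	size_t i;

	for (i = 0; i < BLOCK_SIZE; i++)
		parity[i] ^= old_data[i];	/* prexor */
	for (i = 0; i < BLOCK_SIZE; i++)
		parity[i] ^= new_data[i];	/* postxor, after biodrain */
}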

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  152 +---
 1 files changed, 131 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2c74f9b..2390657 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1788,7 +1788,75 @@ static void compute_block_2(struct stripe_head *sh, int 
dd_idx1, int dd_idx2)
}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw, int 
expand)
+{
+   int i, pd_idx = sh->pd_idx, disks = sh->disks;
+   int locked=0;
+
+   if (rcw == 0) {
+   /* skip the drain operation on an expand */
+   if (!expand) {
+   BUG_ON(test_and_set_bit(STRIPE_OP_BIODRAIN,
+   &sh->ops.pending));
+   sh->ops.count++;
+   }
+
+   BUG_ON(test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending));
+   sh->ops.count++;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   if (!expand)
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   } else {
+   BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+   test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+
+   BUG_ON(test_and_set_bit(STRIPE_OP_PREXOR, &sh->ops.pending) ||
+   test_and_set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) 
||
+   test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending));
+
+   sh->ops.count += 3;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (i==pd_idx)
+   continue;
 
+   /* For a read-modify write there may be blocks that are
+* locked for reading while others are ready to be 
written
+* so we distinguish these blocks by the R5_Wantprexor 
bit
+*/
+   if (dev->towrite &&
+   (test_bit(R5_UPTODATE, &dev->flags) ||
+   test_bit(R5_Wantcompute, &dev->flags))) {
+   set_bit(R5_Wantprexor, &dev->flags);
+   set_bit(R5_LOCKED, &dev->flags);
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   }
+
+   /* keep the parity disk locked while asynchronous operations
+* are in flight
+*/
+   set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+   clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+   locked++;
+
+   PRINTK("%s: stripe %llu locked: %d pending: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   locked, sh->ops.pending);
+
+   return locked;
+}
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -2151,8 +2219,67 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(STRIPE_HANDLE, &sh->state);
}
 
-   /* now to consider writing and what else, if anything should be read */
-   if (to_write) {
+   /* Now we check to see if any write operations have recently
+* completed
+*/
+
+   /* leave prexor set until postxor is done, allows us to distinguish
+* a rmw from a rcw during biodrain
+*/
+   if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
+   test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+
+   for (i=disks; i--;)
+   clear_bit(R5_Wantprexor, &sh->dev[i].flags);
+   }
+
+   /* if only POSTXOR is set then this is an 'expand' postxor */
+   if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
+   test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+
+   clear_bit(STRIPE_OP_BIODRAIN

[PATCH 2.6.20-rc5 01/12] dmaengine: add base support for the async_tx api

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

* introduce struct dma_async_tx_descriptor as a common field for all dmaengine
software descriptors
* convert the device_memcpy_* methods into separate prep, set src/dest, and
submit stages (a sketch of this three-stage flow follows below this list)
* support capabilities beyond memcpy (xor, memset, xor zero sum, completion
interrupts)
* convert ioatdma to the new semantics
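
As referenced above, the split into stages maps to a descriptor lifecycle like the one sketched here. The names are stand-ins for illustration; the real hooks added by this patch are device_prep_dma_memcpy, device_set_src, device_set_dest and device_tx_submit on struct dma_device:

#include <stdlib.h>

typedef int dma_cookie_t;

struct model_desc {
	void *src;
	void *dest;
	size_t len;
	dma_cookie_t cookie;
};

/* stage 1: allocate and size the descriptor, no addresses yet */
static struct model_desc *model_prep_memcpy(size_t len)
{
	struct model_desc *d = calloc(1, sizeof(*d));

	if (d)
		d->len = len;
	return d;
}

/* stage 2: fill in the addresses once the caller has mapped them */
static void model_set_src(struct model_desc *d, void *src)   { d->src = src; }
static void model_set_dest(struct model_desc *d, void *dest) { d->dest = dest; }

/* stage 3: hand the descriptor to the channel and get a cookie back */
static dma_cookie_t model_submit(struct model_desc *d)
{
	static dma_cookie_t next_cookie = 1;

	d->cookie = next_cookie++;
	/* a real driver would append to the hardware chain here */
	return d->cookie;
}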

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/dmaengine.c   |   44 ++--
 drivers/dma/ioatdma.c |  256 ++--
 drivers/dma/ioatdma.h |8 +
 include/linux/dmaengine.h |  263 ++---
 4 files changed, 394 insertions(+), 177 deletions(-)

diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c
index 1527804..8d203ad 100644
--- a/drivers/dma/dmaengine.c
+++ b/drivers/dma/dmaengine.c
@@ -210,7 +210,8 @@ static void dma_chans_rebalance(void)
mutex_lock(&dma_list_mutex);
 
list_for_each_entry(client, &dma_client_list, global_node) {
-   while (client->chans_desired > client->chan_count) {
+   while (client->chans_desired < 0 ||
+   client->chans_desired > client->chan_count) {
chan = dma_client_chan_alloc(client);
if (!chan)
break;
@@ -219,7 +220,8 @@ static void dma_chans_rebalance(void)
   chan,
   DMA_RESOURCE_ADDED);
}
-   while (client->chans_desired < client->chan_count) {
+   while (client->chans_desired >= 0 &&
+   client->chans_desired < client->chan_count) {
spin_lock_irqsave(&client->lock, flags);
chan = list_entry(client->channels.next,
  struct dma_chan,
@@ -294,12 +296,12 @@ void dma_async_client_unregister(struct dma_client 
*client)
  * @number: count of DMA channels requested
  *
  * Clients call dma_async_client_chan_request() to specify how many
- * DMA channels they need, 0 to free all currently allocated.
+ * DMA channels they need, 0 to free all currently allocated. A request
+ * < 0 indicates the client wants to handle all engines in the system.
  * The resulting allocations/frees are indicated to the client via the
  * event callback.
  */
-void dma_async_client_chan_request(struct dma_client *client,
-   unsigned int number)
+void dma_async_client_chan_request(struct dma_client *client, int number)
 {
client->chans_desired = number;
dma_chans_rebalance();
@@ -318,6 +320,31 @@ int dma_async_device_register(struct dma_device *device)
if (!device)
return -ENODEV;
 
+   /* validate device routines */
+   BUG_ON(test_bit(DMA_MEMCPY, &device->capabilities) &&
+   !device->device_prep_dma_memcpy);
+   BUG_ON(test_bit(DMA_XOR, &device->capabilities) &&
+   !device->device_prep_dma_xor);
+   BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) &&
+   !device->device_prep_dma_zero_sum);
+   BUG_ON(test_bit(DMA_MEMSET, &device->capabilities) &&
+   !device->device_prep_dma_memset);
+   BUG_ON(test_bit(DMA_ZERO_SUM, &device->capabilities) &&
+   !device->device_prep_dma_interrupt);
+
+   BUG_ON(!device->device_alloc_chan_resources);
+   BUG_ON(!device->device_free_chan_resources);
+   BUG_ON(!device->device_tx_submit);
+   BUG_ON(!device->device_set_dest);
+   BUG_ON(!device->device_set_src);
+   BUG_ON(!device->device_dependency_added);
+   BUG_ON(!device->device_is_tx_complete);
+   BUG_ON(!device->map_page);
+   BUG_ON(!device->map_single);
+   BUG_ON(!device->unmap_page);
+   BUG_ON(!device->unmap_single);
+   BUG_ON(!device->device_issue_pending);
+
init_completion(&device->done);
kref_init(&device->refcount);
device->dev_id = id++;
@@ -402,11 +429,8 @@ subsys_initcall(dma_bus_init);
 EXPORT_SYMBOL(dma_async_client_register);
 EXPORT_SYMBOL(dma_async_client_unregister);
 EXPORT_SYMBOL(dma_async_client_chan_request);
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_buf);
-EXPORT_SYMBOL(dma_async_memcpy_buf_to_pg);
-EXPORT_SYMBOL(dma_async_memcpy_pg_to_pg);
-EXPORT_SYMBOL(dma_async_memcpy_complete);
-EXPORT_SYMBOL(dma_async_memcpy_issue_pending);
+EXPORT_SYMBOL(dma_async_is_tx_complete);
+EXPORT_SYMBOL(dma_async_issue_pending);
 EXPORT_SYMBOL(dma_async_device_register);
 EXPORT_SYMBOL(dma_async_device_unregister);
 EXPORT_SYMBOL(dma_chan_cleanup);
diff --git a/drivers/dma/ioatdma.c b/drivers/dma/ioatdma.c
index 8e87261..70b

[PATCH 2.6.20-rc5 02/12] dmaengine: add the async_tx api

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

async_tx is an api to describe a series of bulk memory
transfers/transforms.  When possible these transactions are carried out by
asynchronous dma engines.  The api handles inter-transaction dependencies
and hides dma channel management from the client.  When a dma engine is not
present the transaction is carried out via synchronous software routines.

Xor operations are handled by async_tx, to this end xor.c is moved into
drivers/dma and is changed to take an explicit destination address and
a series of sources to match the hardware engine implementation.

When CONFIG_DMA_ENGINE is not set the asynchronous path is compiled away.

Changelog:
* fixed a leftover debug print
* don't allow callbacks in async_interrupt_cond
* fixed xor_block changes
* fixed usage of ASYNC_TX_XOR_DROP_DEST

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/Makefile |1 
 drivers/dma/Kconfig  |   16 +
 drivers/dma/Makefile |1 
 drivers/dma/async_tx.c   |  910 ++
 drivers/dma/xor.c|  153 
 drivers/md/Kconfig   |2 
 drivers/md/Makefile  |6 
 drivers/md/raid5.c   |   52 +--
 drivers/md/xor.c |  154 
 include/linux/async_tx.h |  180 +
 include/linux/raid/xor.h |5 
 11 files changed, 1291 insertions(+), 189 deletions(-)

diff --git a/drivers/Makefile b/drivers/Makefile
index 0dd96d1..7d55837 100644
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -61,6 +61,7 @@ obj-$(CONFIG_I2C) += i2c/
 obj-$(CONFIG_W1)   += w1/
 obj-$(CONFIG_HWMON)+= hwmon/
 obj-$(CONFIG_PHONE)+= telephony/
+obj-$(CONFIG_ASYNC_TX_DMA) += dma/
 obj-$(CONFIG_MD)   += md/
 obj-$(CONFIG_BT)   += bluetooth/
 obj-$(CONFIG_ISDN) += isdn/
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 30d021d..c82ed5f 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -7,8 +7,8 @@ menu "DMA Engine support"
 config DMA_ENGINE
bool "Support for DMA engines"
---help---
- DMA engines offload copy operations from the CPU to dedicated
- hardware, allowing the copies to happen asynchronously.
+  DMA engines offload bulk memory operations from the CPU to dedicated
+  hardware, allowing the operations to happen asynchronously.
 
 comment "DMA Clients"
 
@@ -22,6 +22,17 @@ config NET_DMA
  Since this is the main user of the DMA engine, it should be enabled;
  say Y here.
 
+config ASYNC_TX_DMA
+   tristate "Asynchronous Bulk Memory Transfers/Transforms API"
+   default y
+   ---help---
+ This enables the async_tx management layer for dma engines.
+ Subsystems coded to this API will use offload engines for bulk
+ memory operations where present.  Software implementations are
+ called when a dma engine is not present or fails to allocate
+ memory to carry out the transaction.
+ Current subsystems ported to async_tx: MD_RAID4,5
+
 comment "DMA Devices"
 
 config INTEL_IOATDMA
@@ -30,5 +41,4 @@ config INTEL_IOATDMA
default m
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
-
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index bdcfdbd..6a99341 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,3 +1,4 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o
diff --git a/drivers/dma/async_tx.c b/drivers/dma/async_tx.c
new file mode 100644
index 000..eee208d
--- /dev/null
+++ b/drivers/dma/async_tx.c
@@ -0,0 +1,910 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define ASYNC_TX_DEBUG 0
+#define PRINTK(x...) ((void)(ASYNC_TX_DEBUG && printk(x)))
+
+#ifdef CONFIG_DMA_ENGINE
+static struct dma_client *a

[PATCH 2.6.20-rc5 06/12] md: move raid5 compute block operations to raid5_run_ops

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_COMPUTE_BLK to request servicing from
raid5_run_ops.  It also sets a flag for the block being computed to let
other parts of handle_stripe submit dependent operations.  raid5_run_ops
guarantees that the compute operation completes before any dependent
operation starts.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  125 +++-
 1 files changed, 93 insertions(+), 32 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2390657..279a30c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh)
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int non_overwrite = 0;
+   int compute=0, req_compute=0, non_overwrite=0;
int failed_num=0;
struct r5dev *dev;
unsigned long pending=0;
@@ -2032,8 +2032,8 @@ static void handle_stripe5(struct stripe_head *sh)
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+   if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
-   
if (dev->toread) to_read++;
if (dev->towrite) {
to_write++;
@@ -2188,31 +2188,82 @@ static void handle_stripe5(struct stripe_head *sh)
 * parity, or to satisfy requests
 * or to load a block that is being partially written.
 */
-   if (to_read || non_overwrite || (syncing && (uptodate < disks)) || 
expanding) {
-   for (i=disks; i--;) {
-   dev = &sh->dev[i];
-   if (!test_bit(R5_LOCKED, &dev->flags) && 
!test_bit(R5_UPTODATE, &dev->flags) &&
-   (dev->toread ||
-(dev->towrite && !test_bit(R5_OVERWRITE, 
&dev->flags)) ||
-syncing ||
-expanding ||
-(failed && (sh->dev[failed_num].toread ||
-(sh->dev[failed_num].towrite && 
!test_bit(R5_OVERWRITE, &sh->dev[failed_num].flags
-   )
-   ) {
-   /* we would like to get this block, possibly
-* by computing it, but we might not be able to
+   if (to_read || non_overwrite || (syncing && (uptodate + compute < 
disks)) || expanding ||
+   test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending)) {
+
+   /* Clear completed compute operations.  Parity recovery
+* (STRIPE_OP_MOD_REPAIR_PD) implies a write-back which is 
handled
+* later on in this routine
+*/
+   if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) &&
+   !test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete);
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack);
+   clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending);
+   }
+
+   /* look for blocks to read/compute, skip this if a compute
+* is already in flight, or if the stripe contents are in the
+* midst of changing due to a write
+*/
+   if (!test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending) &&
+   !test_bit(STRIPE_OP_PREXOR, &sh->ops.pending) &&
+   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+   for (i=disks; i--;) {
+   dev = &sh->dev[i];
+
+   /* don't schedule compute operations or reads on
+* the parity block while a check is in flight
 */
-   if (uptodate == disks-1) {
-   PRINTK("Computing block %d\n", i);
-   compute_block(sh, i);
-   uptodate++;
-   } else if (test_bit(R5_Insync, &dev->flags)) {
-   set_bit(R5_LOCKED, &dev->flags);
-   set_bit(R5_Wantread, &dev->flags);
-   locked++;
-  

[PATCH 2.6.20-rc5 03/12] md: add raid5_run_ops and support routines

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Prepare the raid5 implementation to use async_tx for running stripe
operations (a small dispatch sketch follows the list below):
* biofill (copy data into request buffers to satisfy a read request)
* compute block (generate a missing block in the cache from the other
blocks)
* prexor (subtract existing data as part of the read-modify-write process)
* biodrain (copy data out of request buffers to satisfy a write request)
* postxor (recalculate parity for new data that has entered the cache)
* check (verify that the parity is correct)
* io (submit i/o to the member disks)
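
As noted above, a rough stand-in for the dispatch order is sketched here: consume the requested bits in dependency order, one pass per service request. The placeholder functions are not the driver's; the real raid5_run_ops works from a snapshot of sh->ops.pending and issues async_tx operations instead:

enum stripe_op {
	OP_BIOFILL,
	OP_COMPUTE_BLK,
	OP_PREXOR,
	OP_BIODRAIN,
	OP_POSTXOR,
	OP_CHECK,
	OP_IO,
};

static void run_op(enum stripe_op op)
{
	/* placeholder: copy, xor, check or submit i/o for this stripe */
	(void)op;
}

static void model_run_ops(unsigned long pending)
{
	static const enum stripe_op order[] = {
		OP_BIOFILL, OP_COMPUTE_BLK, OP_PREXOR,
		OP_BIODRAIN, OP_POSTXOR, OP_CHECK, OP_IO,
	};
	unsigned int i;

	for (i = 0; i < sizeof(order) / sizeof(order[0]); i++)
		if (pending & (1UL << order[i]))
			run_op(order[i]);
}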

Changelog:
* removed ops_complete_biodrain in favor of ops_complete_postxor and
ops_complete_write.
* removed the workqueue
* call bi_end_io for reads in ops_complete_biofill

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  520 
 include/linux/raid/raid5.h |   63 +
 2 files changed, 580 insertions(+), 3 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 68b6fea..e70ee17 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -52,6 +52,7 @@
 #include "raid6.h"
 
 #include 
+#include 
 
 /*
  * Stripe cache
@@ -324,6 +325,525 @@ static struct stripe_head *get_active_stripe(raid5_conf_t 
*conf, sector_t sector
return sh;
 }
 
+static int
+raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error);
+static int
+raid5_end_write_request (struct bio *bi, unsigned int bytes_done, int error);
+
+static void ops_run_io(struct stripe_head *sh)
+{
+   raid5_conf_t *conf = sh->raid_conf;
+   int i, disks = sh->disks;
+
+   might_sleep();
+
+   for (i=disks; i-- ;) {
+   int rw;
+   struct bio *bi;
+   mdk_rdev_t *rdev;
+   if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
+   rw = WRITE;
+   else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
+   rw = READ;
+   else
+   continue;
+
+   bi = &sh->dev[i].req;
+
+   bi->bi_rw = rw;
+   if (rw == WRITE)
+   bi->bi_end_io = raid5_end_write_request;
+   else
+   bi->bi_end_io = raid5_end_read_request;
+
+   rcu_read_lock();
+   rdev = rcu_dereference(conf->disks[i].rdev);
+   if (rdev && test_bit(Faulty, &rdev->flags))
+   rdev = NULL;
+   if (rdev)
+   atomic_inc(&rdev->nr_pending);
+   rcu_read_unlock();
+
+   if (rdev) {
+   if (test_bit(STRIPE_SYNCING, &sh->state) ||
+   test_bit(STRIPE_EXPAND_SOURCE, &sh->state) ||
+   test_bit(STRIPE_EXPAND_READY, &sh->state))
+   md_sync_acct(rdev->bdev, STRIPE_SECTORS);
+
+   bi->bi_bdev = rdev->bdev;
+   PRINTK("%s: for %llu schedule op %ld on disc %d\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   bi->bi_rw, i);
+   atomic_inc(&sh->count);
+   bi->bi_sector = sh->sector + rdev->data_offset;
+   bi->bi_flags = 1 << BIO_UPTODATE;
+   bi->bi_vcnt = 1;
+   bi->bi_max_vecs = 1;
+   bi->bi_idx = 0;
+   bi->bi_io_vec = &sh->dev[i].vec;
+   bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
+   bi->bi_io_vec[0].bv_offset = 0;
+   bi->bi_size = STRIPE_SIZE;
+   bi->bi_next = NULL;
+   if (rw == WRITE &&
+   test_bit(R5_ReWrite, &sh->dev[i].flags))
+   atomic_add(STRIPE_SECTORS, 
&rdev->corrected_errors);
+   generic_make_request(bi);
+   } else {
+   if (rw == WRITE)
+   set_bit(STRIPE_DEGRADED, &sh->state);
+   PRINTK("skip op %ld on disc %d for sector %llu\n",
+   bi->bi_rw, i, (unsigned long long)sh->sector);
+   clear_bit(R5_LOCKED, &sh->dev[i].flags);
+   set_bit(STRIPE_HANDLE, &sh->state);
+   }
+   }
+}
+
+static struct dma_async_tx_descriptor *
+async_copy_data(int frombio, struct bio *bio, struct page *page, sector_t 
sector,
+   struct dma_async_tx_descriptor *tx)
+{
+   struct bio_vec *bvl;
+   struct page *bio_page;
+   int i;
+   int page_

[PATCH 2.6.20-rc5 08/12] md: satisfy raid5 read requests via raid5_run_ops

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Use raid5_run_ops to carry out the memory copies for a raid5 read request.
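
Conceptually biofill is a scatter copy from the stripe cache pages into the buffers of the waiting read bios. A simplified model with flat buffers instead of bios and pages (a very loose analogue of the biofill operation, not the driver code):

#include <string.h>

#define BLOCK_SIZE 4096

struct model_dev {
	unsigned char cache[BLOCK_SIZE];	/* stripe cache page   */
	unsigned char *toread;			/* pending read buffer */
};

/* copy cached data out to every device with an outstanding read,
 * then clear the request (bi_end_io would run at this point) */
static void model_biofill(struct model_dev *devs, int ndevs)
{
	int i;

	for (i = 0; i < ndevs; i++) {
		if (!devs[i].toread)
			continue;
		memcpy(devs[i].toread, devs[i].cache, BLOCK_SIZE);
		devs[i].toread = NULL;
	}
}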

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   40 +++-
 1 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2422253..db8925f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1980,7 +1980,7 @@ static void handle_stripe5(struct stripe_head *sh)
int i;
int syncing, expanding, expanded;
int locked=0, uptodate=0, to_read=0, to_write=0, failed=0, written=0;
-   int compute=0, req_compute=0, non_overwrite=0;
+   int to_fill=0, compute=0, req_compute=0, non_overwrite=0;
int failed_num=0;
struct r5dev *dev;
unsigned long pending=0;
@@ -2004,34 +2004,20 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[i];
clear_bit(R5_Insync, &dev->flags);
 
-   PRINTK("check %d: state 0x%lx read %p write %p written %p\n",
-   i, dev->flags, dev->toread, dev->towrite, dev->written);
-   /* maybe we can reply to a read */
+   PRINTK("check %d: state 0x%lx toread %p read %p write %p 
written %p\n",
+   i, dev->flags, dev->toread, dev->read, dev->towrite, 
dev->written);
+
+   /* maybe we can start a biofill operation */
if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread) {
-   struct bio *rbi, *rbi2;
-   PRINTK("Return read for disc %d\n", i);
-   spin_lock_irq(&conf->device_lock);
-   rbi = dev->toread;
-   dev->toread = NULL;
-   if (test_and_clear_bit(R5_Overlap, &dev->flags))
-   wake_up(&conf->wait_for_overlap);
-   spin_unlock_irq(&conf->device_lock);
-   while (rbi && rbi->bi_sector < dev->sector + 
STRIPE_SECTORS) {
-   copy_data(0, rbi, dev->page, dev->sector);
-   rbi2 = r5_next_bio(rbi, dev->sector);
-   spin_lock_irq(&conf->device_lock);
-   if (--rbi->bi_phys_segments == 0) {
-   rbi->bi_next = return_bi;
-   return_bi = rbi;
-   }
-   spin_unlock_irq(&conf->device_lock);
-   rbi = rbi2;
-   }
+   to_read--;
+   if (!test_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+   set_bit(R5_Wantfill, &dev->flags);
}
 
/* now count some things */
if (test_bit(R5_LOCKED, &dev->flags)) locked++;
if (test_bit(R5_UPTODATE, &dev->flags)) uptodate++;
+   if (test_bit(R5_Wantfill, &dev->flags)) to_fill++;
if (test_bit(R5_Wantcompute, &dev->flags)) BUG_ON(++compute > 
1);
 
if (dev->toread) to_read++;
@@ -2055,9 +2041,13 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(R5_Insync, &dev->flags);
}
rcu_read_unlock();
+
+   if (to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
+   sh->ops.count++;
+
PRINTK("locked=%d uptodate=%d to_read=%d"
-   " to_write=%d failed=%d failed_num=%d\n",
-   locked, uptodate, to_read, to_write, failed, failed_num);
+   " to_write=%d to_fill=%d failed=%d failed_num=%d\n",
+   locked, uptodate, to_read, to_write, to_fill, failed, 
failed_num);
/* check if the array has lost two devices and, if so, some requests 
might
 * need to be failed
 */


[PATCH 2.6.20-rc5 10/12] md: move raid5 io requests to raid5_run_ops

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe now only updates the state of stripes.  All execution of
operations is moved to raid5_run_ops.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   68 
 1 files changed, 10 insertions(+), 58 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1956b3c..8af084f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2360,6 +2360,8 @@ static void handle_stripe5(struct stripe_head *sh)
PRINTK("Read_old block %d for 
r-m-w\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, 
&dev->flags);
+   if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
locked++;
} else {
set_bit(STRIPE_DELAYED, 
&sh->state);
@@ -2380,6 +2382,8 @@ static void handle_stripe5(struct stripe_head *sh)
PRINTK("Read_old block %d for 
Reconstruct\n", i);
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantread, 
&dev->flags);
+   if 
(!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
locked++;
} else {
set_bit(STRIPE_DELAYED, 
&sh->state);
@@ -2479,6 +2483,8 @@ static void handle_stripe5(struct stripe_head *sh)
 
set_bit(R5_LOCKED, &dev->flags);
set_bit(R5_Wantwrite, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
clear_bit(STRIPE_DEGRADED, &sh->state);
locked++;
set_bit(STRIPE_INSYNC, &sh->state);
@@ -2500,12 +2506,16 @@ static void handle_stripe5(struct stripe_head *sh)
dev = &sh->dev[failed_num];
if (!test_bit(R5_ReWrite, &dev->flags)) {
set_bit(R5_Wantwrite, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
set_bit(R5_ReWrite, &dev->flags);
set_bit(R5_LOCKED, &dev->flags);
locked++;
} else {
/* let's read it back */
set_bit(R5_Wantread, &dev->flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
set_bit(R5_LOCKED, &dev->flags);
locked++;
}
@@ -2615,64 +2625,6 @@ static void handle_stripe5(struct stripe_head *sh)
  test_bit(BIO_UPTODATE, &bi->bi_flags)
? 0 : -EIO);
}
-   for (i=disks; i-- ;) {
-   int rw;
-   struct bio *bi;
-   mdk_rdev_t *rdev;
-   if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
-   rw = WRITE;
-   else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
-   rw = READ;
-   else
-   continue;
- 
-   bi = &sh->dev[i].req;
- 
-   bi->bi_rw = rw;
-   if (rw == WRITE)
-   bi->bi_end_io = raid5_end_write_request;
-   else
-   bi->bi_end_io = raid5_end_read_request;
- 
-   rcu_read_lock();
-   rdev = rcu_dereference(conf->disks[i].rdev);
-   if (rdev && test_bit(Faulty, &rdev->flags))
-   rdev = NULL;
-   if (rdev)
-   atomic_inc(&rdev->nr_pending);
-   rcu_read_unlock();
- 
-   if (rdev) {
-   if (syncing || expanding || expanded)
-   md_sync_acct(rdev->bdev, STRIPE_SECTORS);
-
-   bi->bi_bdev = rdev->bdev;
-   PRINTK("for %llu schedule op %ld on disc %d\n",
-

[PATCH 2.6.20-rc5 09/12] md: use async_tx and raid5_run_ops for raid5 expansion operations

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

The parity calculation for an expansion operation is the same as the
calculation performed at the end of a write with the caveat that all blocks
in the stripe are scheduled to be written.  An expansion operation is
identified as a stripe with the POSTXOR flag set and the BIODRAIN flag not
set.

The bulk copy operation to the new stripe is handled inline by async_tx.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   48 
 1 files changed, 36 insertions(+), 12 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index db8925f..1956b3c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2511,18 +2511,32 @@ static void handle_stripe5(struct stripe_head *sh)
}
}
 
-   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state)) {
-   /* Need to write out all blocks after computing parity */
-   sh->disks = conf->raid_disks;
-   sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks);
-   compute_parity5(sh, RECONSTRUCT_WRITE);
+   /* Finish postxor operations initiated by the expansion
+* process
+*/
+   if (test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete) &&
+   !test_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending)) {
+
+   clear_bit(STRIPE_EXPANDING, &sh->state);
+
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.pending);
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.ack);
+   clear_bit(STRIPE_OP_POSTXOR, &sh->ops.complete);
+
for (i= conf->raid_disks; i--;) {
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   locked++;
set_bit(R5_Wantwrite, &sh->dev[i].flags);
+   if (!test_and_set_bit(STRIPE_OP_IO, &sh->ops.pending))
+   sh->ops.count++;
}
-   clear_bit(STRIPE_EXPANDING, &sh->state);
-   } else if (expanded) {
+   }
+
+   if (expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
+   !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
+   /* Need to write out all blocks after computing parity */
+   sh->disks = conf->raid_disks;
+   sh->pd_idx = stripe_to_pdidx(sh->sector, conf, 
conf->raid_disks);
+   locked += handle_write_operations5(sh, 0, 1);
+   } else if (expanded && !test_bit(STRIPE_OP_POSTXOR, &sh->ops.pending)) {
clear_bit(STRIPE_EXPAND_READY, &sh->state);
atomic_dec(&conf->reshape_stripes);
wake_up(&conf->wait_for_overlap);
@@ -2533,6 +2547,7 @@ static void handle_stripe5(struct stripe_head *sh)
/* We have read all the blocks in this stripe and now we need to
 * copy some of them into a target stripe for expand.
 */
+   struct dma_async_tx_descriptor *tx = NULL;
clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
for (i=0; i< sh->disks; i++)
if (i != sh->pd_idx) {
@@ -2556,9 +2571,12 @@ static void handle_stripe5(struct stripe_head *sh)
release_stripe(sh2);
continue;
}
-   memcpy(page_address(sh2->dev[dd_idx].page),
-  page_address(sh->dev[i].page),
-  STRIPE_SIZE);
+
+   /* place all the copies on one channel */
+   tx = async_memcpy(sh2->dev[dd_idx].page,
+   sh->dev[i].page, 0, 0, STRIPE_SIZE,
+   ASYNC_TX_DEP_ACK, tx, NULL, NULL);
+
set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
for (j=0; j<conf->raid_disks; j++)
@@ -2570,6 +2588,12 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(STRIPE_HANDLE, &sh2->state);
}
release_stripe(sh2);
+
+   /* done submitting copies, wait for them to 
complete */
+   if (i + 1 >= sh->disks) {
+   async_tx_ack(tx);
+   dma_wait_for_async_tx(tx);
+   }
}
}
 


[PATCH 2.6.20-rc5 12/12] dmaengine: driver for the iop32x, iop33x, and iop13xx raid engines

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

This is a driver for the iop DMA/AAU/ADMA units which are capable of pq_xor,
pq_update, pq_zero_sum, xor, dual_xor, xor_zero_sum, fill, copy+crc, and copy
operations.

Changelog:
* fixed a slot allocation bug in do_iop13xx_adma_xor that caused too few
slots to be requested eventually leading to data corruption
* enabled the slot allocation routine to attempt to free slots before
returning -ENOMEM
* switched the cleanup routine to solely use the software chain and the
status register to determine if a descriptor is complete.  This is
necessary to support other IOP engines that do not have status writeback
capability
* make the driver iop generic
* modified the allocation routines to understand allocating a group of
slots for a single operation
* added a null xor initialization operation for the xor only channel on
iop3xx
* support xor operations on buffers larger than the hardware maximum (see the
chunking sketch after this list)
* split the do_* routines into separate prep, src/dest set, submit stages
* added async_tx support (dependent operations initiation at cleanup time)
* simplified group handling
* added interrupt support (callbacks via tasklets)
* brought the pending depth inline with ioat (i.e. 4 descriptors)
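
As referenced above, operations larger than the engine's per-descriptor limit are split into a group of slots. A toy illustration of the chunk count only; the limit value here is made up and the real slot accounting in the driver is more involved:

#include <stddef.h>

#define HW_MAX_BYTES (1U << 16)	/* illustrative per-descriptor limit */

/* one descriptor slot per HW_MAX_BYTES-sized chunk of the operation */
static int model_count_slots(size_t len)
{
	int slots = 0;

	while (len) {
		size_t this_len = len > HW_MAX_BYTES ? HW_MAX_BYTES : len;

		slots++;
		len -= this_len;
	}
	return slots;
}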

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/dma/Kconfig |8 
 drivers/dma/Makefile|1 
 drivers/dma/iop-adma.c  | 1511 +++
 include/asm-arm/hardware/iop_adma.h |  116 +++
 4 files changed, 1636 insertions(+), 0 deletions(-)

diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index c82ed5f..d61e3e5 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -41,4 +41,12 @@ config INTEL_IOATDMA
default m
---help---
  Enable support for the Intel(R) I/OAT DMA engine.
+
+config INTEL_IOP_ADMA
+tristate "Intel IOP ADMA support"
+depends on DMA_ENGINE && (ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX)
+default m
+---help---
+  Enable support for the Intel(R) IOP Series RAID engines.
+
 endmenu
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index 6a99341..8ebf10d 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -1,4 +1,5 @@
 obj-$(CONFIG_DMA_ENGINE) += dmaengine.o
 obj-$(CONFIG_NET_DMA) += iovlock.o
 obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o
+obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
 obj-$(CONFIG_ASYNC_TX_DMA) += async_tx.o xor.o
diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c
new file mode 100644
index 000..77f859e
--- /dev/null
+++ b/drivers/dma/iop-adma.c
@@ -0,0 +1,1511 @@
+/*
+ * Copyright(c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ * This driver supports the asynchronous DMA copy and RAID engines available
+ * on the Intel Xscale(R) family of I/O Processors (IOP 32x, 33x, 134x)
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define to_iop_adma_chan(chan) container_of(chan, struct iop_adma_chan, common)
+#define to_iop_adma_device(dev) container_of(dev, struct iop_adma_device, 
common)
+#define to_iop_adma_slot(lh) container_of(lh, struct iop_adma_desc_slot, 
slot_node)
+#define tx_to_iop_adma_slot(tx) container_of(tx, struct iop_adma_desc_slot, 
async_tx)
+
+#define IOP_ADMA_DEBUG 0
+#define PRINTK(x...) ((void)(IOP_ADMA_DEBUG && printk(x)))
+
+/**
+ * iop_adma_free_slots - flags descriptor slots for reuse
+ * @slot: Slot to free
+ * Caller must hold &iop_chan->lock while calling this function
+ */
+static inline void iop_adma_free_slots(struct iop_adma_desc_slot *slot)
+{
+   int stride = slot->stride;
+
+   while (stride--) {
+   slot->stride = 0;
+   slot = list_entry(slot->slot_node.next,
+   struct iop_adma_desc_slot,
+   slot_node);
+   }
+}
+
+static inline dma_cookie_t
+iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc,
+   struct iop_adma_chan *iop_chan, dma_cookie_t cookie)
+{
+   BUG_ON(

[PATCH 2.6.20-rc5 11/12] md: remove raid5 compute_block and compute_parity5

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

replaced by raid5_run_ops

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  124 
 1 files changed, 0 insertions(+), 124 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8af084f..a981c35 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1480,130 +1480,6 @@ static void copy_data(int frombio, struct bio *bio,
   }   \
} while(0)
 
-
-static void compute_block(struct stripe_head *sh, int dd_idx)
-{
-   int i, count, disks = sh->disks;
-   void *ptr[MAX_XOR_BLOCKS], *dest, *p;
-
-   PRINTK("compute_block, stripe %llu, idx %d\n", 
-   (unsigned long long)sh->sector, dd_idx);
-
-   dest = page_address(sh->dev[dd_idx].page);
-   memset(dest, 0, STRIPE_SIZE);
-   count = 0;
-   for (i = disks ; i--; ) {
-   if (i == dd_idx)
-   continue;
-   p = page_address(sh->dev[i].page);
-   if (test_bit(R5_UPTODATE, &sh->dev[i].flags))
-   ptr[count++] = p;
-   else
-   printk(KERN_ERR "compute_block() %d, stripe %llu, %d"
-   " not present\n", dd_idx,
-   (unsigned long long)sh->sector, i);
-
-   check_xor();
-   }
-   if (count)
-   xor_block(count, STRIPE_SIZE, dest, ptr);
-   set_bit(R5_UPTODATE, &sh->dev[dd_idx].flags);
-}
-
-static void compute_parity5(struct stripe_head *sh, int method)
-{
-   raid5_conf_t *conf = sh->raid_conf;
-   int i, pd_idx = sh->pd_idx, disks = sh->disks, count;
-   void *ptr[MAX_XOR_BLOCKS], *dest;
-   struct bio *chosen;
-
-   PRINTK("compute_parity5, stripe %llu, method %d\n",
-   (unsigned long long)sh->sector, method);
-
-   count = 0;
-   dest = page_address(sh->dev[pd_idx].page);
-   switch(method) {
-   case READ_MODIFY_WRITE:
-   BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags));
-   for (i=disks ; i-- ;) {
-   if (i==pd_idx)
-   continue;
-   if (sh->dev[i].towrite &&
-   test_bit(R5_UPTODATE, &sh->dev[i].flags)) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   check_xor();
-   }
-   }
-   break;
-   case RECONSTRUCT_WRITE:
-   memset(dest, 0, STRIPE_SIZE);
-   for (i= disks; i-- ;)
-   if (i!=pd_idx && sh->dev[i].towrite) {
-   chosen = sh->dev[i].towrite;
-   sh->dev[i].towrite = NULL;
-
-   if (test_and_clear_bit(R5_Overlap, 
&sh->dev[i].flags))
-   wake_up(&conf->wait_for_overlap);
-
-   BUG_ON(sh->dev[i].written);
-   sh->dev[i].written = chosen;
-   }
-   break;
-   case CHECK_PARITY:
-   break;
-   }
-   if (count) {
-   xor_block(count, STRIPE_SIZE, dest, ptr);
-   count = 0;
-   }
-   
-   for (i = disks; i--;)
-   if (sh->dev[i].written) {
-   sector_t sector = sh->dev[i].sector;
-   struct bio *wbi = sh->dev[i].written;
-   while (wbi && wbi->bi_sector < sector + STRIPE_SECTORS) 
{
-   copy_data(1, wbi, sh->dev[i].page, sector);
-   wbi = r5_next_bio(wbi, sector);
-   }
-
-   set_bit(R5_LOCKED, &sh->dev[i].flags);
-   set_bit(R5_UPTODATE, &sh->dev[i].flags);
-   }
-
-   switch(method) {
-   case RECONSTRUCT_WRITE:
-   case CHECK_PARITY:
-   for (i=disks; i--;)
-   if (i != pd_idx) {
-   ptr[count++] = page_address(sh->dev[i].page);
-   check_xor();
- 

[PATCH 2.6.20-rc5 04/12] md: use raid5_run_ops for stripe cache operations

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

Each stripe has three flag variables to reflect the state of operations
(pending, ack, and complete); a toy model of the handshake follows the list.
-pending: set to request servicing in raid5_run_ops
-ack: set to reflect that raid5_run_ops has seen this request
-complete: set when the operation is complete and it is ok for handle_stripe5
to clear 'pending' and 'ack'.
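
As mentioned above, a minimal userspace model of that lifecycle for a single operation (illustrative only; the real flags are bits in sh->ops.pending/ack/complete and are manipulated under the stripe lock):

#include <stdio.h>

struct op_state {
	int pending;	/* handle_stripe5 requested the operation  */
	int ack;	/* raid5_run_ops has picked the request up */
	int complete;	/* the operation finished                  */
};

static void request_op(struct op_state *op)
{
	op->pending = 1;
}

/* raid5_run_ops side: take ownership of a not-yet-acked request */
static int take_op(struct op_state *op)
{
	if (op->pending && !op->ack) {
		op->ack = 1;
		return 1;
	}
	return 0;
}

/* completion callback side */
static void finish_op(struct op_state *op)
{
	op->complete = 1;
}

/* handle_stripe5 side: only now is it safe to drop pending and ack */
static void retire_op(struct op_state *op)
{
	if (op->complete) {
		op->pending = 0;
		op->ack = 0;
		op->complete = 0;
	}
}

int main(void)
{
	struct op_state check = { 0, 0, 0 };

	request_op(&check);
	if (take_op(&check))
		finish_op(&check);
	retire_op(&check);
	printf("pending=%d ack=%d complete=%d\n",
	       check.pending, check.ack, check.complete);
	return 0;
}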

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   65 +---
 1 files changed, 56 insertions(+), 9 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e70ee17..2c74f9b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -126,6 +126,7 @@ static void __release_stripe(raid5_conf_t *conf, struct 
stripe_head *sh)
}
md_wakeup_thread(conf->mddev->thread);
} else {
+   BUG_ON(sh->ops.pending);
if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state)) {
atomic_dec(&conf->preread_active_stripes);
if (atomic_read(&conf->preread_active_stripes) 
< IO_THRESHOLD)
@@ -225,7 +226,8 @@ static void init_stripe(struct stripe_head *sh, sector_t 
sector, int pd_idx, int
 
BUG_ON(atomic_read(&sh->count) != 0);
BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
-   
+   BUG_ON(sh->ops.pending || sh->ops.ack || sh->ops.complete);
+
CHECK_DEVLOCK();
PRINTK("init_stripe called, stripe %llu\n", 
(unsigned long long)sh->sector);
@@ -241,11 +243,11 @@ static void init_stripe(struct stripe_head *sh, sector_t 
sector, int pd_idx, int
for (i = sh->disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
 
-   if (dev->toread || dev->towrite || dev->written ||
+   if (dev->toread || dev->read || dev->towrite || dev->written ||
test_bit(R5_LOCKED, &dev->flags)) {
-   printk("sector=%llx i=%d %p %p %p %d\n",
+   printk("sector=%llx i=%d %p %p %p %p %d\n",
   (unsigned long long)sh->sector, i, dev->toread,
-  dev->towrite, dev->written,
+  dev->read, dev->towrite, dev->written,
   test_bit(R5_LOCKED, &dev->flags));
BUG();
}
@@ -325,6 +327,43 @@ static struct stripe_head *get_active_stripe(raid5_conf_t 
*conf, sector_t sector
return sh;
 }
 
+/* check_op() ensures that we only dequeue an operation once */
+#define check_op(op) do {\
+   if (test_bit(op, &sh->ops.pending) &&\
+   !test_bit(op, &sh->ops.complete)) {\
+   if (test_and_set_bit(op, &sh->ops.ack))\
+   clear_bit(op, &pending);\
+   else\
+   ack++;\
+   } else\
+   clear_bit(op, &pending);\
+} while(0)
+
+/* find new work to run, do not resubmit work that is already
+ * in flight
+ */
+static unsigned long get_stripe_work(struct stripe_head *sh)
+{
+   unsigned long pending;
+   int ack = 0;
+
+   pending = sh->ops.pending;
+
+   check_op(STRIPE_OP_BIOFILL);
+   check_op(STRIPE_OP_COMPUTE_BLK);
+   check_op(STRIPE_OP_PREXOR);
+   check_op(STRIPE_OP_BIODRAIN);
+   check_op(STRIPE_OP_POSTXOR);
+   check_op(STRIPE_OP_CHECK);
+   if (test_and_clear_bit(STRIPE_OP_IO, &sh->ops.pending))
+   ack++;
+
+   sh->ops.count -= ack;
+   BUG_ON(sh->ops.count < 0);
+
+   return pending;
+}
+
 static int
 raid5_end_read_request(struct bio * bi, unsigned int bytes_done, int error);
 static int
@@ -1859,7 +1898,6 @@ static int stripe_to_pdidx(sector_t stripe, raid5_conf_t 
*conf, int disks)
  *schedule a write of some buffers
  *return confirmation of parity correctness
  *
- * Parity calculations are done inside the stripe lock
  * buffers are taken off read_list or write_list, and bh_cache buffers
  * get BH_Lock set before the stripe lock is released.
  *
@@ -1877,10 +1915,11 @@ static void handle_stripe5(struct stripe_head *sh)
int non_overwrite = 0;
int failed_num=0;
struct r5dev *dev;
+   unsigned long pending=0;
 
-   PRINTK("handling stripe %llu, cnt=%d, pd_idx=%d\n",
-   (unsigned long long)sh->sector, atomic_read(&sh->count),
-   sh->pd_idx);
+   PRINTK("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d 
ops=%lx:%lx:%lx\n",
+  (unsigned long long)sh->sector, sh->state, 
atomic_read(&sh->count),
+  sh->pd_id

[PATCH 2.6.20-rc5 05/12] md: move write operations to raid5_run_ops

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_PREXOR, STRIPE_OP_BIODRAIN, STRIPE_OP_POSTXOR
to request a write to the stripe cache.  raid5_run_ops is triggered to run
and executes the request outside the stripe lock.

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  152 +---
 1 files changed, 131 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 2c74f9b..2390657 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1788,7 +1788,75 @@ static void compute_block_2(struct stripe_head *sh, int 
dd_idx1, int dd_idx2)
}
 }
 
+static int handle_write_operations5(struct stripe_head *sh, int rcw, int 
expand)
+{
+   int i, pd_idx = sh->pd_idx, disks = sh->disks;
+   int locked=0;
+
+   if (rcw == 0) {
+   /* skip the drain operation on an expand */
+   if (!expand) {
+   BUG_ON(test_and_set_bit(STRIPE_OP_BIODRAIN,
+   &sh->ops.pending));
+   sh->ops.count++;
+   }
+
+   BUG_ON(test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending));
+   sh->ops.count++;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+
+   if (dev->towrite) {
+   set_bit(R5_LOCKED, &dev->flags);
+   if (!expand)
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   } else {
+   BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
+   test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
+
+   BUG_ON(test_and_set_bit(STRIPE_OP_PREXOR, &sh->ops.pending) ||
+   test_and_set_bit(STRIPE_OP_BIODRAIN, &sh->ops.pending) 
||
+   test_and_set_bit(STRIPE_OP_POSTXOR, &sh->ops.pending));
+
+   sh->ops.count += 3;
+
+   for (i=disks ; i-- ;) {
+   struct r5dev *dev = &sh->dev[i];
+   if (i==pd_idx)
+   continue;
 
+   /* For a read-modify write there may be blocks that are
+* locked for reading while others are ready to be 
written
+* so we distinguish these blocks by the R5_Wantprexor 
bit
+*/
+   if (dev->towrite &&
+   (test_bit(R5_UPTODATE, &dev->flags) ||
+   test_bit(R5_Wantcompute, &dev->flags))) {
+   set_bit(R5_Wantprexor, &dev->flags);
+   set_bit(R5_LOCKED, &dev->flags);
+   clear_bit(R5_UPTODATE, &dev->flags);
+   locked++;
+   }
+   }
+   }
+
+   /* keep the parity disk locked while asynchronous operations
+* are in flight
+*/
+   set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
+   clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
+   locked++;
+
+   PRINTK("%s: stripe %llu locked: %d pending: %lx\n",
+   __FUNCTION__, (unsigned long long)sh->sector,
+   locked, sh->ops.pending);
+
+   return locked;
+}
 
 /*
  * Each stripe/dev can have one or more bion attached.
@@ -2151,8 +2219,67 @@ static void handle_stripe5(struct stripe_head *sh)
set_bit(STRIPE_HANDLE, &sh->state);
}
 
-   /* now to consider writing and what else, if anything should be read */
-   if (to_write) {
+   /* Now we check to see if any write operations have recently
+* completed
+*/
+
+   /* leave prexor set until postxor is done, allows us to distinguish
+* a rmw from a rcw during biodrain
+*/
+   if (test_bit(STRIPE_OP_PREXOR, &sh->ops.complete) &&
+   test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.complete);
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.ack);
+   clear_bit(STRIPE_OP_PREXOR, &sh->ops.pending);
+
+   for (i=disks; i--;)
+   clear_bit(R5_Wantprexor, &sh->dev[i].flags);
+   }
+
+   /* if only POSTXOR is set then this is an 'expand' postxor */
+   if (test_bit(STRIPE_OP_BIODRAIN, &sh->ops.complete) &&
+   test_bit(STRIPE_OP_POSTXOR, &sh->ops.complete)) {
+
+   clear_bit(STRIPE_OP_BIODRAIN

[PATCH 2.6.20-rc5 07/12] md: move raid5 parity checks to raid5_run_ops

2007-01-23 Thread Dan Williams
From: Dan Williams <[EMAIL PROTECTED]>

handle_stripe sets STRIPE_OP_CHECK to request a check operation in
raid5_run_ops.  If raid5_run_ops is able to perform the check with a
dma engine, the parity will be preserved in memory, removing the need to
re-read it from disk as is necessary in the synchronous case.

'Repair' operations re-use the same logic as compute block, with the caveat
that the results of the compute block are immediately written back to the
parity disk.  To differentiate these operations the STRIPE_OP_MOD_REPAIR_PD
flag is added.
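
At a high level the flow being added looks roughly like this (a hedged,
self-contained sketch with stand-in names; the real logic is spread across
handle_stripe5 and raid5_run_ops in the diff below):

#include <linux/bitops.h>

enum { OP_CHECK, OP_COMPUTE_BLK, OP_MOD_REPAIR_PD };

struct check_state {
	unsigned long pending;
	unsigned long complete;
	int zero_sum_result;	/* 0 => xor of data blocks matched parity */
	int insync;
};

static void advance_check(struct check_state *st, int repair_allowed)
{
	if (!test_bit(OP_CHECK, &st->pending)) {
		/* 1) ask the engine to xor the data blocks against parity */
		set_bit(OP_CHECK, &st->pending);
	} else if (test_and_clear_bit(OP_CHECK, &st->complete)) {
		clear_bit(OP_CHECK, &st->pending);
		if (st->zero_sum_result == 0) {
			st->insync = 1;		/* 2) parity is correct */
		} else if (repair_allowed) {
			/* 3) recompute parity and write it straight back */
			set_bit(OP_COMPUTE_BLK, &st->pending);
			set_bit(OP_MOD_REPAIR_PD, &st->pending);
		}
	}
}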

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |   81 
 1 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 279a30c..2422253 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2411,32 +2411,75 @@ static void handle_stripe5(struct stripe_head *sh)
locked += handle_write_operations5(sh, rcw, 0);
}
 
-   /* maybe we need to check and possibly fix the parity for this stripe
-* Any reads will already have been scheduled, so we just see if enough 
data
-* is available
+   /* 1/ Maybe we need to check and possibly fix the parity for this 
stripe.
+*Any reads will already have been scheduled, so we just see if 
enough data
+*is available.
+* 2/ Hold off parity checks while parity dependent operations are in 
flight
+*(conflicting writes are protected by the 'locked' variable)
 */
-   if (syncing && locked == 0 &&
-   !test_bit(STRIPE_INSYNC, &sh->state)) {
+   if ((syncing && locked == 0 && !test_bit(STRIPE_OP_COMPUTE_BLK, 
&sh->ops.pending) &&
+   !test_bit(STRIPE_INSYNC, &sh->state)) ||
+   test_bit(STRIPE_OP_CHECK, &sh->ops.pending) ||
+   test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) {
+
set_bit(STRIPE_HANDLE, &sh->state);
-   if (failed == 0) {
-   BUG_ON(uptodate != disks);
-   compute_parity5(sh, CHECK_PARITY);
-   uptodate--;
-   if (page_is_zero(sh->dev[sh->pd_idx].page)) {
-   /* parity is correct (on disc, not in buffer 
any more) */
-   set_bit(STRIPE_INSYNC, &sh->state);
-   } else {
-   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
-   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
-   /* don't try to repair!! */
+   /* Take one of the following actions:
+* 1/ start a check parity operation if (uptodate == disks)
+* 2/ finish a check parity operation and act on the result
+* 3/ skip to the writeback section if we previously
+*initiated a recovery operation
+*/
+   if (failed == 0 && !test_bit(STRIPE_OP_MOD_REPAIR_PD, 
&sh->ops.pending)) {
+   if (!test_and_set_bit(STRIPE_OP_CHECK, 
&sh->ops.pending)) {
+   BUG_ON(uptodate != disks);
+   clear_bit(R5_UPTODATE, 
&sh->dev[sh->pd_idx].flags);
+   sh->ops.count++;
+   uptodate--;
+   } else if (test_and_clear_bit(STRIPE_OP_CHECK, 
&sh->ops.complete)) {
+   clear_bit(STRIPE_OP_CHECK, &sh->ops.ack);
+   clear_bit(STRIPE_OP_CHECK, &sh->ops.pending);
+
+   if (sh->ops.zero_sum_result == 0)
+   /* parity is correct (on disc, not in 
buffer any more) */
set_bit(STRIPE_INSYNC, &sh->state);
else {
-   compute_block(sh, sh->pd_idx);
-   uptodate++;
+   conf->mddev->resync_mismatches += 
STRIPE_SECTORS;
+   if (test_bit(MD_RECOVERY_CHECK, 
&conf->mddev->recovery))
+   /* don't try to repair!! */
+   set_bit(STRIPE_INSYNC, 
&sh->state);
+   else {
+   BUG_ON(test_and_set_bit(
+   STRIPE_OP_COMPUTE_BLK,
+   &sh->ops.pe

Re: [RFC][PATCH 00/12] md raid acceleration and performance analysis

2007-02-07 Thread Dan Williams

On 2/6/07, Leech, Christopher <[EMAIL PROTECTED]> wrote:

Hi Dan,

I've been looking over how your patches change the ioatdma driver.  I
like the idea of removing the multiple entry points for virtual address
vs. page struct arguments, and just using dma_addr_t for the driver
interfaces.

But, I don't think having both ioatdma and iop-adma implement map_page,
map_single, unmap_page, and unmap_single entry points is much better.
Do you see a reason why it wouldn't work to expose the generic device
for a DMA channel, and replace instances of

dma_device->map_single(dma_chan, src, len, DMA_TO_DEVICE)

with

dma_map_single(dma_device->dev, src, len, DMA_TO_DEVICE)



I was initially concerned about a case where dma_map_single was not
equivalent to pci_map_single.  Looking now, it appears that case would
be a bug, so I will integrate this change.


I am a little concerned about having the DMA mapping happen outside of
the driver, but the unmapping is still in the driver cleanup routine.
I'm not sure if it's really a problem, or how I'd change it though.

- Chris


Thanks,
Dan


Re: 2.6.20: stripe_cache_size goes boom with 32mb

2007-02-23 Thread Dan Williams

On 2/23/07, Justin Piszcz <[EMAIL PROTECTED]> wrote:

I have 2GB On this machine.  For me, 8192 seems to be the sweet spot, I
will probably keep it at 8mb.


Just a note: stripe_cache_size = 8192 works out to 192MB with six disks.

The calculation is:
stripe_cache_size * num_disks * PAGE_SIZE = stripe_cache_size_bytes
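
For example, with the numbers above and assuming 4KiB pages:

8192 stripes * 6 disks * 4096 bytes per page = 201326592 bytes, i.e. roughly 192MB.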

--
Dan


Re: DMRAID feature direction?

2007-02-27 Thread Dan Williams

On 2/27/07, Gaston, Jason D <[EMAIL PROTECTED]> wrote:

Hello,

Can someone point me to where I can search a linux-raid mailing list
archive?


I use:
http://marc.theaimsgroup.com/?l=linux-raid&r=1&w=2


I am looking for information about where things are going with DMRAID
features and any discussion on where things stand in regards to the
possibility of merging MD and DMRAID.  My guess, from what I have found
in google, is that this was a heated discussion.  Mainly I am looking at
what direction people think is correct for getting more functionality to
support "fakeraid" volumes.


Also interesting is the discussion surrounding the proposed EMD solution.


I appreciate any information!

Thanks,

Jason
-


Regards,
Dan


[RFC, PATCH] raid456: replace the handle_list with a multi-threaded workqueue

2007-02-27 Thread Dan Williams
Currently raid456 queues up work to a single raid5d thread per array.
Since there are no dependencies between operations on different stripes
I believed a speed up could be obtained by spreading the handle_stripe
load across all available CPUs.  However I am not seeing a speed up, as
measured by tiobench.  I think the reason is that multi-processor
effects will only show up when data is already in the cache.  In this
case the work is already spread out per client thread.  Also work
submitted to workqueues is sticky to the CPU where queue_work() was
called, not load balanced amongst the available CPUs.  I'm posting it
anyway to see if I am overlooking a case where it would be helpful, and
from a cosmetic standpoint it separates raid5d housekeeping work from
handle_stripe work.
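
For reference, the general shape of the conversion as a standalone sketch
with hypothetical names (the actual changes are in the diff below):

#include <linux/kernel.h>
#include <linux/workqueue.h>

struct stripe_sketch {
	struct work_struct work;	/* one work item per stripe */
	/* ... stripe state ... */
};

static void stripe_work_fn(struct work_struct *work)
{
	struct stripe_sketch *sh =
		container_of(work, struct stripe_sketch, work);

	/* handle_stripe(sh) runs here, potentially on any CPU */
	(void)sh;
}

static void stripe_init(struct stripe_sketch *sh)
{
	INIT_WORK(&sh->work, stripe_work_fn);	/* at stripe allocation time */
}

/* replaces list_add_tail(&sh->lru, &conf->handle_list) */
static void stripe_dispatch(struct workqueue_struct *wq,
			    struct stripe_sketch *sh)
{
	queue_work(wq, &sh->work);	/* note: sticky to the queueing CPU */
}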

Signed-off-by: Dan Williams <[EMAIL PROTECTED]>
---

 drivers/md/raid5.c |  108 ++--
 include/linux/raid/raid5.h |6 ++
 2 files changed, 68 insertions(+), 46 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 11c3d7b..e54310c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -121,7 +121,10 @@ static void __release_stripe(raid5_conf_t *conf, struct 
stripe_head *sh)
blk_plug_device(conf->mddev->queue);
} else {
clear_bit(STRIPE_BIT_DELAY, &sh->state);
-   list_add_tail(&sh->lru, &conf->handle_list);
+   conf->workqueue_stripes++;
+   atomic_inc(&sh->count);
+   BUG_ON(queue_work(conf->workqueue,
+   &sh->work) == 0);
}
md_wakeup_thread(conf->mddev->thread);
} else {
@@ -310,6 +313,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t 
*conf, sector_t sector
if (!test_bit(STRIPE_HANDLE, &sh->state))
atomic_inc(&conf->active_stripes);
if (list_empty(&sh->lru) &&
+   !work_pending(&sh->work) &&
!test_bit(STRIPE_EXPANDING, &sh->state))
BUG();
list_del_init(&sh->lru);
@@ -324,6 +328,7 @@ static struct stripe_head *get_active_stripe(raid5_conf_t 
*conf, sector_t sector
return sh;
 }
 
+static void raid456_workqueue(struct work_struct *work);
 static int grow_one_stripe(raid5_conf_t *conf)
 {
struct stripe_head *sh;
@@ -343,6 +348,7 @@ static int grow_one_stripe(raid5_conf_t *conf)
/* we just created an active stripe so... */
atomic_set(&sh->count, 1);
atomic_inc(&conf->active_stripes);
+   INIT_WORK(&sh->work, raid456_workqueue);
INIT_LIST_HEAD(&sh->lru);
release_stripe(sh);
return 1;
@@ -2448,7 +2454,9 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
clear_bit(STRIPE_DELAYED, &sh->state);
if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 
&sh->state))
atomic_inc(&conf->preread_active_stripes);
-   list_add_tail(&sh->lru, &conf->handle_list);
+   conf->workqueue_stripes++;
+   atomic_inc(&sh->count);
+   BUG_ON(queue_work(conf->workqueue, &sh->work) == 0);
}
}
 }
@@ -3181,7 +3189,6 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct 
bio *raid_bio)
 }
 
 
-
 /*
  * This is our raid5 kernel thread.
  *
@@ -3191,9 +3198,9 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct 
bio *raid_bio)
  */
 static void raid5d (mddev_t *mddev)
 {
-   struct stripe_head *sh;
raid5_conf_t *conf = mddev_to_conf(mddev);
int handled;
+   struct bio *bio;
 
PRINTK("+++ raid5d active\n");
 
@@ -3201,51 +3208,30 @@ static void raid5d (mddev_t *mddev)
 
handled = 0;
spin_lock_irq(&conf->device_lock);
-   while (1) {
-   struct list_head *first;
-   struct bio *bio;
-
-   if (conf->seq_flush != conf->seq_write) {
-   int seq = conf->seq_flush;
-   spin_unlock_irq(&conf->device_lock);
-   bitmap_unplug(mddev->bitmap);
-   spin_lock_irq(&conf->device_lock);
-   conf->seq_write = seq;
-   activate_bit_delay(conf);
-   }
-
-   if (list_empty(&conf->handle_list) &&
-  

Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.

2007-03-15 Thread Dan Williams

On 3/15/07, Paul Mackerras <[EMAIL PROTECTED]> wrote:

Wolfgang Denk writes:

> This patch is based on and requires a set of patches posted to the
> linux-raid mailing list by Dan Williams on 2007-01-23:

Those patches don't seem to be upstream in Linus' tree.  Are they in
-mm, or is anyone pushing for them to be?


They are in -mm (git-md-accel.patch).  I'll review this driver and
integrate it into my next push to Andrew, along with some further
cleanups.


Paul.


Dan


Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.

2007-03-16 Thread Dan Williams

On 3/16/07, Wolfgang Denk <[EMAIL PROTECTED]> wrote:

In message <[EMAIL PROTECTED]> you wrote:
>
> They are in -mm (git-md-accel.patch).  I'll review this driver and and
> integrate it into my next push to Andrew, along with some further
> cleanups.

Thanks.

We're doing some cleanup now based on the feedback we receive.

What is easier for you to handle - a complete new patch, or an
incremental one on top of what we submitted now?  (I'd prefer
incremental, but will do whatever works better for you).


I can handle incremental, but I will probably fold everything together
in the patch that goes to -mm.

Check out Stacked GIT (http://www.procode.org/stgit/) when you get a
chance, it handles this situation well.


Best regards,

Wolfgang Denk



Dan


Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.

2007-03-16 Thread Dan Williams

On 3/16/07, Benjamin Herrenschmidt <[EMAIL PROTECTED]> wrote:

> + PRINTK("\tfree slot %x: %d stride: %d\n", desc->phys, desc->idx, 
desc->stride);

Why don't you use the kernel's existing debugging facilities, like
pr_debug, or dev_dbg if you have a proper struct device (which you
should have with an arch/powerpc port hopefully using
of_platform_device).


This came from the the iop-adma driver.  I blindly copied it from
drivers/md/raid5.c, but yes it should change to dev_dbg.


> + spin_lock_bh(&spe_chan->lock);
> + /* Allocate descriptor slots */
> + i = spe_chan->slots_allocated;
> + if (spe_chan->device->id != PPC440SPE_XOR_ID)
> + db_sz = sizeof (dma_cdb_t);
> + else
> + db_sz = sizeof (xor_cb_t);
> +
> + for (; i < (plat_data->pool_size/db_sz); i++) {
> + slot = kzalloc(sizeof(struct spe_adma_desc_slot), GFP_KERNEL);

GFP_KERNEL within spin_lock_bh is no good...


This is an iop-adma wart... will fix.
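
One common way to handle this (a hedged sketch of the pattern, not the
actual fix): do the sleeping allocation before taking the lock and only do
the list manipulation under it, or fall back to GFP_ATOMIC if the
allocation really must happen with the lock held.

#include <linux/errno.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct slot_sketch {
	struct list_head node;
	/* ... descriptor fields ... */
};

static int add_slot(spinlock_t *lock, struct list_head *pool)
{
	/* may sleep, so no spinlock is held here */
	struct slot_sketch *s = kzalloc(sizeof(*s), GFP_KERNEL);

	if (!s)
		return -ENOMEM;

	spin_lock_bh(lock);
	list_add_tail(&s->node, pool);	/* only the list update is locked */
	spin_unlock_bh(lock);
	return 0;
}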


> diff --git a/include/asm-ppc/adma.h b/include/asm-ppc/adma.h
> new file mode 100644
> index 000..0be88f1
> --- /dev/null
> +++ b/include/asm-ppc/adma.h

There's way too many code in this .h file, too big inline functions. It
should mostly be moved to a .c file


The iop-adma driver uses separate .h files because the driver is
shared between iop3xx and iop13xx implementations and I did not want
the overhead of another indirect-branch layer.  In this case the
hardware specific routines can be written inline since the driver is
only supporting one architecture... other suggestions?


Cheers,
Ben.



Regards,
Dan


Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.

2007-03-16 Thread Dan Williams

Here are some additional comments/nits:


+/*
+ *  Init DMA0/1 and XOR engines; allocate memory for DMAx FIFOs; set 
platform_device
+ * memory resources addresses
+ */
+static void ppc440spe_configure_raid_devices(void)


Any reason not to move most of this function into spe_adma_probe?  The
"set resource address" section is the only piece that spe_adma_probe
should not handle.


+++ b/drivers/dma/spe-adma.c
@@ -0,0 +1,1071 @@
+/*
+ * Copyright(c) 2006 DENX Engineering. All rights reserved.
+ *
+ * Author: Yuri Tikhonov <[EMAIL PROTECTED]>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59
+ * Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ *
+ * The full GNU General Public License is included in this distribution in the
+ * file called COPYING.
+ */
+
+/*
+ *  This driver supports the asynchronous DMA copy and RAID engines available
+ * on the AMCC PPC440SPe Processors.
+ *  Based on the Intel Xscale(R) family of I/O Processors (SPE 32x, 33x, 134x)

SPE should be IOP on this line.

../..


+static inline void
+spe_adma_slot_cleanup(struct spe_adma_chan *spe_chan)
+{
+   spin_lock_bh(&spe_chan->lock);
+   __spe_adma_slot_cleanup(spe_chan);
+   spin_unlock_bh(&spe_chan->lock);
+}
+
+static struct spe_adma_chan *spe_adma_chan_array[3];
+static void spe_adma0_task(unsigned long data)
+{
+   __spe_adma_slot_cleanup(spe_adma_chan_array[0]);
+}
+
+static void spe_adma1_task(unsigned long data)
+{
+   __spe_adma_slot_cleanup(spe_adma_chan_array[1]);
+}
+
+static void spe_adma2_task(unsigned long data)
+{
+   __spe_adma_slot_cleanup(spe_adma_chan_array[2]);
+}
+
+DECLARE_TASKLET(spe_adma0_tasklet, spe_adma0_task, 0);
+DECLARE_TASKLET(spe_adma1_tasklet, spe_adma1_task, 0);
+DECLARE_TASKLET(spe_adma2_tasklet, spe_adma2_task, 0);
+struct tasklet_struct *spe_adma_tasklet[] = {
+   &spe_adma0_tasklet,
+   &spe_adma1_tasklet,
+   &spe_adma2_tasklet,
+};
+

This is something I am cleaning up in iop-adma by adding a struct
tasklet * to each channel.  I'll post an incremental diff of my
iop-adma changes so you can see what I have cleaned up since the
2.6.20-rc5 posting.
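
A hedged sketch of that direction (hypothetical names; the idea is that
each channel owns its tasklet and passes itself as the tasklet data,
instead of the three fixed spe_adma*_task handlers quoted above):

#include <linux/interrupt.h>
#include <linux/spinlock.h>

struct adma_chan_sketch {
	spinlock_t lock;
	struct tasklet_struct irq_tasklet;	/* one tasklet per channel */
	/* ... descriptor slot lists, register base, etc ... */
};

static void adma_chan_tasklet(unsigned long data)
{
	struct adma_chan_sketch *chan = (struct adma_chan_sketch *)data;

	/* __slot_cleanup(chan) would run here */
	(void)chan;
}

static void adma_chan_init(struct adma_chan_sketch *chan)
{
	spin_lock_init(&chan->lock);
	tasklet_init(&chan->irq_tasklet, adma_chan_tasklet,
		     (unsigned long)chan);	/* the channel is the data */
}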


+static dma_addr_t spe_adma_map_page(struct dma_chan *chan, struct page *page,
+   unsigned long offset, size_t size,
+   int direction)
+{
+   struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan);
+   return dma_map_page(&spe_chan->device->pdev->dev, page, offset, size,
+   direction);
+}
+
+static dma_addr_t spe_adma_map_single(struct dma_chan *chan, void *cpu_addr,
+   size_t size, int direction)
+{
+   struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan);
+   return dma_map_single(&spe_chan->device->pdev->dev, cpu_addr, size,
+   direction);
+}
+
+static void spe_adma_unmap_page(struct dma_chan *chan, dma_addr_t handle,
+   size_t size, int direction)
+{
+   struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan);
+   dma_unmap_page(&spe_chan->device->pdev->dev, handle, size, direction);
+}
+
+static void spe_adma_unmap_single(struct dma_chan *chan, dma_addr_t handle,
+   size_t size, int direction)
+{
+   struct spe_adma_chan *spe_chan = to_spe_adma_chan(chan);
+   dma_unmap_single(&spe_chan->device->pdev->dev, handle, size, direction);
+}
+

...these are gone as well in the latest code.


+static int __devinit spe_adma_probe(struct platform_device *pdev)

../..


+   printk(KERN_INFO "Intel(R) SPE ADMA Engine found [%d]: "

Intel(R)? :-)

Regards,
Dan


Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.

2007-03-17 Thread Dan Williams

On 3/17/07, Stefan Roese <[EMAIL PROTECTED]> wrote:

Dan,

I just noticed that your patch "dmaengine: add the async_tx api":

@@ -22,6 +22,17 @@ config NET_DMA
  Since this is the main user of the DMA engine, it should be enabled;
  say Y here.

+config ASYNC_TX_DMA
+   tristate "Asynchronous Bulk Memory Transfers/Transforms API"
+   default y
+   ---help---
+ This enables the async_tx management layer for dma engines.
+ Subsystems coded to this API will use offload engines for bulk
+ memory operations where present.  Software implementations are
+ called when a dma engine is not present or fails to allocate
+ memory to carry out the transaction.
+ Current subsystems ported to async_tx: MD_RAID4,5
+

adds ASYNC_TX_DMA unconditionally to _all_ platforms. You might want to bundle
this with something like DMA_ENGINE.


Yes, defaulting to 'y' is not necessary, but ASYNC_TX_DMA=y &&
DMA_ENGINE=n is an explicit feature of the interface.  When DMA_ENGINE
is not selected all the asynchronous paths in the API are compiled
out.  This allows subsystems, like md-raid5, to be written in an
asynchronous fashion without regard for the architecture[1] or
availability of offload engines.
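
A minimal sketch of the compile-out idea (a generic pattern with made-up
names, not the actual async_tx interface):

/* hypothetical_offload.h */
#include <linux/string.h>
#include <linux/types.h>

#ifdef CONFIG_DMA_ENGINE
/* the real implementation would submit the copy to a dma channel and
 * return a handle the caller can chain dependent operations on
 */
int offload_copy(void *dest, void *src, size_t len);
#else
/* no engines configured: the "asynchronous" path compiles down to a
 * plain synchronous memcpy, so callers need no #ifdefs of their own
 */
static inline int offload_copy(void *dest, void *src, size_t len)
{
	memcpy(dest, src, len);
	return 0;
}
#endif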


Best regards,
Stefan


Regards,
Dan

[1] The API implicitly handles channel switching depending on the
offload engine architecture.  Where an iop13xx engine can handle a
copy+xor sequence on one channel, a 440sp or iop3xx platform will need
to switch between copy and xor capable engines.  Resolving operation
dependencies and channel switching is handled behind the scenes.


Re: [PATCH] [PPC32] ADMA support for PPC 440SPe processors.

2007-03-17 Thread Dan Williams

The current implementation builds the objects async_tx.o and xor.o into
the kernel on my embedded PPC4xx system without any disks, which I
definitely don't need or want.  And I get something like:

async_tx: api initialized (sync-only)
xor: measuring software checksumming speed
   8regs :   145.000 MB/sec
   8regs_prefetch:   115.000 MB/sec
   32regs:   176.000 MB/sec
   32regs_prefetch:   135.000 MB/sec
xor: using function: 32regs (176.000 MB/sec)

upon bootup.

Understood, I'll change it so that xor.o and async_tx.o are off by default.



Best regards,
Stefan


Dan


Re: 2.6.20.3 AMD64 oops in CFQ code

2007-03-22 Thread Dan Williams

On 3/22/07, Neil Brown <[EMAIL PROTECTED]> wrote:

On Thursday March 22, [EMAIL PROTECTED] wrote:
> On Thu, Mar 22 2007, [EMAIL PROTECTED] wrote:
> > > 3 (I think) seperate instances of this, each involving raid5. Is your
> > > array degraded or fully operational?
> >
> > Ding! A drive fell out the other day, which is why the problems only
> > appeared recently.
> >
> > md5 : active raid5 sdf4[5] sdd4[3] sdc4[2] sdb4[1] sda4[0]
> >   1719155200 blocks level 5, 64k chunk, algorithm 2 [6/5] [_U]
> >   bitmap: 149/164 pages [596KB], 1024KB chunk
> >
> > H'm... this means that my alarm scripts aren't working.  Well, that's
> > good to know.  The drive is being re-integrated now.
>
> Heh, at least something good came out of this bug then :-)
> But that's reaffirming. Neil, are you following this? It smells somewhat
> fishy wrt raid5.

Yes, I've been trying to pay attention.

The evidence does seem to point to raid5 and degraded arrays being
implicated.  However I'm having trouble finding how the fact that an array
is degraded would be visible down in the elevator except for having a
slightly different distribution of reads and writes.

One possible way is that if an array is degraded, then some read
requests will go through the stripe cache rather than direct to the
device.  However I would more expect the direct-to-device path to have
problems as it is much newer code.  Going through the cache for reads
is very well tested code - and reads come from the cache for most
writes anyway, so the elevator will still see lots of single-page
reads.  It only ever sees single-page writes.

There might be more pressure on the stripe cache when running
degraded, so we might call the ->unplug_fn a little more often, but I
doubt that would be noticeable.

As you seem to suggest by the patch, it does look like some sort of
unlocked access to the cfq_queue structure.  However apart from the
comment before cfq_exit_single_io_context being in the wrong place
(should be before __cfq_exit_single_io_context) I cannot see anything
obviously wrong with the locking around that structure.

So I'm afraid I'm stumped too.

NeilBrown


Not a cfq failure, but I have been able to reproduce a different oops
at array stop time while i/o's were pending.  I have not dug into it
enough to suggest a patch, but I wonder if it is somehow related to
the cfq failure since it involves congestion and drives going away:

md: md0: recovery done.
Unable to handle kernel NULL pointer dereference at virtual address 00bc
pgd = 40004000
[00bc] *pgd=
Internal error: Oops: 17 [#1]
Modules linked in:
CPU: 0
PC is at raid5_congested+0x14/0x5c
LR is at sync_sb_inodes+0x278/0x2ec
pc : [<402801cc>]lr : [<400a39e8>]Not tainted
sp : 8a3e3ec4  ip : 8a3e3ed4  fp : 8a3e3ed0
r10: 40474878  r9 : 40474870  r8 : 40439710
r7 : 8a3e3f30  r6 : bfa76b78  r5 : 4161dc08  r4 : 40474800
r3 : 402801b8  r2 : 0004  r1 : 0001  r0 : 
Flags: nzCv  IRQs on  FIQs on  Mode SVC_32  Segment kernel
Control: 400397F
Table: 7B7D4018  DAC: 0035
Process pdflush (pid: 1371, stack limit = 0x8a3e2250)
Stack: (0x8a3e3ec4 to 0x8a3e4000)
3ec0:  8a3e3f04 8a3e3ed4 400a39e8 402801c4 8a3e3f24 000129f9 40474800
3ee0: 4047483c 40439a44 8a3e3f30 40439710 40438a48 4045ae68 8a3e3f24 8a3e3f08
3f00: 400a3ca0 400a377c 8a3e3f30 1162 00012bed 40438a48 8a3e3f78 8a3e3f28
3f20: 40069b58 400a3bfc 00011e41 8a3e3f38   8a3e3f28 0400
3f40:      0025 8a3e3f80 8a3e3f8c
3f60: 40439750 8a3e2000 40438a48 8a3e3fc0 8a3e3f7c 4006ab68 40069a8c 0001
3f80: bfae2ac0 40069a80  8a3e3f8c 8a3e3f8c 00012805  8a3e2000
3fa0: 9e7e1f1c 4006aa40 0001  fffc 8a3e3ff4 8a3e3fc4 4005461c
3fc0: 4006aa4c 0001      
3fe0:    8a3e3ff8 40042320 40054520  
Backtrace:
[<402801b8>] (raid5_congested+0x0/0x5c) from [<400a39e8>]
(sync_sb_inodes+0x278/0x2ec)
[<400a3770>] (sync_sb_inodes+0x0/0x2ec) from [<400a3ca0>]
(writeback_inodes+0xb0/0xb8)
[<400a3bf0>] (writeback_inodes+0x0/0xb8) from [<40069b58>]
(wb_kupdate+0xd8/0x160)
r7 = 40438A48  r6 = 00012BED  r5 = 1162  r4 = 8A3E3F30
[<40069a80>] (wb_kupdate+0x0/0x160) from [<4006ab68>] (pdflush+0x128/0x204)
r8 = 40438A48  r7 = 8A3E2000  r6 = 40439750  r5 = 8A3E3F8C
r4 = 8A3E3F80
[<4006aa40>] (pdflush+0x0/0x204) from [<4005461c>] (kthread+0x108/0x134)
[<40054514>] (kthread+0x0/0x134) from [<40042320>] (do_exit+0x0/0x844)
Code: e92dd800 e24cb004 e590 e3a01001 (e59030bc)
md: md0 stopped.
md: unbind
md: export_rdev(sda)
md: unbind
md: export_rdev(sdd)
md: unbind
md: export_rdev(sdc)
md: unbind
md: export_rdev(sdb)

2.6.20-rc3-iop1 on an iop348 platform.  SATA controller is sata_vsc.

--
Dan