The patch titled

     md: support md/linear array with components greater than 2 terabytes.

has been added to the -mm tree.  Its filename is

     md-support-md-linear-array-with-components-greater-than-2-terabytes.patch

Patches currently in -mm which might be from [EMAIL PROTECTED] are

md-make-sure-resync-gets-started-when-array-starts.patch
sunrpc-cache_register-can-use-wrong-module-reference.patch
md-fix-minor-error-in-raid10-read-balancing-calculation.patch
md-fail-io-request-to-md-that-require-a-barrier.patch
md-dont-allow-new-md-bitmap-file-to-be-set-if-one-already-exists.patch
md-improve-handling-of-bitmap-initialisation.patch
md-all-hot-add-and-hot-remove-of-md-intent-logging-bitmaps.patch
md-support-write-mostly-device-in-raid1.patch
md-add-write-behind-support-for-md-raid1.patch
md-support-md-linear-array-with-components-greater-than-2-terabytes.patch
md-raid1_quiesce-is-back-to-front-fix-it.patch
md-make-sure-bitmap_daemon_work-actually-does-work.patch
md-do-not-set-mddev-bitmap-until-bitmap-is-fully-initialised.patch
md-allow-hot-adding-devices-to-arrays-with-non-persistant-superblocks.patch
md-allow-md-to-load-a-superblock-with-feature-bit-1-set.patch
md-fix-bitmap-read_sb_page-so-that-it-handles-errors-properly.patch
drivers-md-fix-up-schedule_timeout-usage.patch



From: NeilBrown <[EMAIL PROTECTED]>

linear currently uses division by the size of the smallest component device
to find which device a request goes to.  If that smallest device is larger
than 2 terabytes, then the division will not work on some systems.

So we introduce a pre-shift, and take care not to make the hash table too
large, much like the code in raid0.

Also get rid of conf->nr_zones, which is not needed.

Signed-off-by: Neil Brown <[EMAIL PROTECTED]>
Signed-off-by: Andrew Morton <[EMAIL PROTECTED]>
---

 drivers/md/linear.c         |   99 ++++++++++++++++++++++++++++++--------------
 include/linux/raid/linear.h |    4 -
 2 files changed, 70 insertions(+), 33 deletions(-)

diff -puN 
drivers/md/linear.c~md-support-md-linear-array-with-components-greater-than-2-terabytes
 drivers/md/linear.c
--- 
devel/drivers/md/linear.c~md-support-md-linear-array-with-components-greater-than-2-terabytes
       2005-08-21 22:35:53.000000000 -0700
+++ devel-akpm/drivers/md/linear.c      2005-08-21 22:35:53.000000000 -0700
@@ -38,7 +38,8 @@ static inline dev_info_t *which_dev(mdde
        /*
         * sector_div(a,b) returns the remainer and sets a to a/b
         */
-       (void)sector_div(block, conf->smallest->size);
+       block >>= conf->preshift;
+       (void)sector_div(block, conf->hash_spacing);
        hash = conf->hash_table[block];
 
        while ((sector>>1) >= (hash->size + hash->offset))
@@ -47,7 +48,7 @@ static inline dev_info_t *which_dev(mdde
 }
 
 /**
- *     linear_mergeable_bvec -- tell bio layer if a two requests can be merged
+ *     linear_mergeable_bvec -- tell bio layer if two requests can be merged
  *     @q: request queue
  *     @bio: the buffer head that's been built up so far
  *     @biovec: the request that could be merged to it.
@@ -116,7 +117,7 @@ static int linear_run (mddev_t *mddev)
        dev_info_t **table;
        mdk_rdev_t *rdev;
        int i, nb_zone, cnt;
-       sector_t start;
+       sector_t min_spacing;
        sector_t curr_offset;
        struct list_head *tmp;
 
@@ -127,11 +128,6 @@ static int linear_run (mddev_t *mddev)
        memset(conf, 0, sizeof(*conf) + mddev->raid_disks*sizeof(dev_info_t));
        mddev->private = conf;
 
-       /*
-        * Find the smallest device.
-        */
-
-       conf->smallest = NULL;
        cnt = 0;
        mddev->array_size = 0;
 
@@ -159,8 +155,6 @@ static int linear_run (mddev_t *mddev)
                disk->size = rdev->size;
                mddev->array_size += rdev->size;
 
-               if (!conf->smallest || (disk->size < conf->smallest->size))
-                       conf->smallest = disk;
                cnt++;
        }
        if (cnt != mddev->raid_disks) {
@@ -168,6 +162,36 @@ static int linear_run (mddev_t *mddev)
                goto out;
        }
 
+       min_spacing = mddev->array_size;
+       sector_div(min_spacing, PAGE_SIZE/sizeof(struct dev_info *));
+
+       /* min_spacing is the minimum spacing that will fit the hash
+        * table in one PAGE.  This may be much smaller than needed.
+        * We find the smallest non-terminal set of consecutive devices
+        * that is larger than min_spacing and use the size of that as
+        * the actual spacing
+        */
+       conf->hash_spacing = mddev->array_size;
+       for (i=0; i < cnt-1 ; i++) {
+               sector_t sz = 0;
+               int j;
+               for (j=i; i<cnt-1 && sz < min_spacing ; j++)
+                       sz += conf->disks[j].size;
+               if (sz >= min_spacing && sz < conf->hash_spacing)
+                       conf->hash_spacing = sz;
+       }
+
+       /* hash_spacing may be too large for sector_div to work with,
+        * so we might need to pre-shift
+        */
+       conf->preshift = 0;
+       if (sizeof(sector_t) > sizeof(u32)) {
+               sector_t space = conf->hash_spacing;
+               while (space > (sector_t)(~(u32)0)) {
+                       space >>= 1;
+                       conf->preshift++;
+               }
+       }
        /*
         * This code was restructured to work around a gcc-2.95.3 internal
         * compiler error.  Alter it with care.
@@ -177,39 +201,52 @@ static int linear_run (mddev_t *mddev)
                unsigned round;
                unsigned long base;
 
-               sz = mddev->array_size;
-               base = conf->smallest->size;
+               sz = mddev->array_size >> conf->preshift;
+               sz += 1; /* force round-up */
+               base = conf->hash_spacing >> conf->preshift;
                round = sector_div(sz, base);
-               nb_zone = conf->nr_zones = sz + (round ? 1 : 0);
+               nb_zone = sz + (round ? 1 : 0);
        }
-                       
-       conf->hash_table = kmalloc (sizeof (dev_info_t*) * nb_zone,
+       BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
+
+       conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
                                        GFP_KERNEL);
        if (!conf->hash_table)
                goto out;
 
        /*
         * Here we generate the linear hash table
+        * First calculate the device offsets.
         */
+       conf->disks[0].offset = 0;
+       for (i=1; i<mddev->raid_disks; i++)
+               conf->disks[i].offset =
+                       conf->disks[i-1].offset +
+                       conf->disks[i-1].size;
+
        table = conf->hash_table;
-       start = 0;
        curr_offset = 0;
-       for (i = 0; i < cnt; i++) {
-               dev_info_t *disk = conf->disks + i;
-
-               disk->offset = curr_offset;
-               curr_offset += disk->size;
-
-               /* 'curr_offset' is the end of this disk
-                * 'start' is the start of table
+       i = 0;
+       for (curr_offset = 0;
+            curr_offset < mddev->array_size;
+            curr_offset += conf->hash_spacing) {
+
+               while (i < mddev->raid_disks-1 &&
+                      curr_offset >= conf->disks[i+1].offset)
+                       i++;
+
+               *table ++ = conf->disks + i;
+       }
+
+       if (conf->preshift) {
+               conf->hash_spacing >>= conf->preshift;
+               /* round hash_spacing up so that when we divide by it,
+                * we err on the side of "too-low", which is safest.
                 */
-               while (start < curr_offset) {
-                       *table++ = disk;
-                       start += conf->smallest->size;
-               }
+               conf->hash_spacing++;
        }
-       if (table-conf->hash_table != nb_zone)
-               BUG();
+
+       BUG_ON(table - conf->hash_table > nb_zone);
 
        blk_queue_merge_bvec(mddev->queue, linear_mergeable_bvec);
        mddev->queue->unplug_fn = linear_unplug;
@@ -299,7 +336,7 @@ static void linear_status (struct seq_fi
        sector_t s = 0;
   
        seq_printf(seq, "      ");
-       for (j = 0; j < conf->nr_zones; j++)
+       for (j = 0; j < mddev->raid_disks; j++)
        {
                char b[BDEVNAME_SIZE];
                s += conf->smallest_size;
diff -puN 
include/linux/raid/linear.h~md-support-md-linear-array-with-components-greater-than-2-terabytes
 include/linux/raid/linear.h
--- 
devel/include/linux/raid/linear.h~md-support-md-linear-array-with-components-greater-than-2-terabytes
       2005-08-21 22:35:53.000000000 -0700
+++ devel-akpm/include/linux/raid/linear.h      2005-08-21 22:35:53.000000000 
-0700
@@ -14,8 +14,8 @@ typedef struct dev_info dev_info_t;
 struct linear_private_data
 {
        dev_info_t              **hash_table;
-       dev_info_t              *smallest;
-       int                     nr_zones;
+       sector_t                hash_spacing;
+       int                     preshift; /* shift before dividing by hash_spacing */
        dev_info_t              disks[0];
 };
 
_
-
To unsubscribe from this list: send the line "unsubscribe mm-commits" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to