Author: sparky                       Date: Wed Jun 21 14:02:01 2006 GMT
Module: SOURCES                       Tag: HEAD
---- Log message:
- filesystem cache, from Con Kolivas patchset:
  
http://www.kernel.org/pub/linux/kernel/people/ck/patches/2.6/2.6.17/2.6.17-ck1/patch-2.6.17-ck1.bz2

---- Files affected:
SOURCES:
   kernel-desktop-fcache.patch (NONE -> 1.1)  (NEW)

---- Diffs:

================================================================
Index: SOURCES/kernel-desktop-fcache.patch
diff -u /dev/null SOURCES/kernel-desktop-fcache.patch:1.1
--- /dev/null   Wed Jun 21 16:02:01 2006
+++ SOURCES/kernel-desktop-fcache.patch Wed Jun 21 16:01:56 2006
@@ -0,0 +1,1783 @@
+diff -Nur linux-2.6.17/block/ll_rw_blk.c linux-2.6.17.fcache/block/ll_rw_blk.c
+--- linux-2.6.17/block/ll_rw_blk.c     2006-06-21 15:52:12.000000000 +0200
++++ linux-2.6.17.fcache/block/ll_rw_blk.c      2006-06-21 15:58:45.000000000 +0200
+@@ -2817,12 +2817,10 @@
+        */
+       if (bio_rw_ahead(bio) || bio_failfast(bio))
+               req->flags |= REQ_FAILFAST;
+-
+-      /*
+-       * REQ_BARRIER implies no merging, but lets make it explicit
+-       */
+       if (unlikely(bio_barrier(bio)))
+-              req->flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
++              req->flags |= REQ_HARDBARRIER;
++      if (!bio_mergeable(bio))
++              req->flags |= REQ_NOMERGE;
+ 
+       req->errors = 0;
+       req->hard_sector = req->sector = bio->bi_sector;
+@@ -2870,7 +2868,7 @@
+ 
+       spin_lock_irq(q->queue_lock);
+ 
+-      if (unlikely(barrier) || elv_queue_empty(q))
++      if (!bio_mergeable(bio) || elv_queue_empty(q))
+               goto get_rq;
+ 
+       el_ret = elv_merge(q, &req, bio);
+@@ -3109,6 +3107,7 @@
+ 
+       BIO_BUG_ON(!bio->bi_size);
+       BIO_BUG_ON(!bio->bi_io_vec);
++      BIO_BUG_ON(bio->bi_next);
+       bio->bi_rw |= rw;
+       if (rw & WRITE)
+               mod_page_state(pgpgout, count);
+diff -Nur linux-2.6.17/drivers/block/fcache.c linux-2.6.17.fcache/drivers/block/fcache.c
+--- linux-2.6.17/drivers/block/fcache.c        1970-01-01 01:00:00.000000000 +0100
++++ linux-2.6.17.fcache/drivers/block/fcache.c 2006-06-21 15:58:45.000000000 +0200
+@@ -0,0 +1,1475 @@
++/*
++ * A frontend cache for a block device. The purpose is to speed up a
++ * fairly random but repeated read workload, like the boot of a system.
++ *
++ * When run in priming mode, fcache allocates and writes data read from
++ * the source drive to our extent cache in the order in which they are
++ * accessed. When later run in non-priming mode, data accessed in the same
++ * order will be linearly available in the cache.
++ *
++ * Performance when priming is slower than non-fcache usage would be. If
++ * the fcache is located on another disk, the hit should be small. If
++ * the fcache is located on the same disk (another partition), it runs
++ * at about half the speed. Non-priming performance should be fairly
++ * similar on same/other disk.
++ *
++ * On-disk format is as follows:
++ *    Block0:         header
++ *    Block1..X       extent maps
++ *    BlockX+1..Y     extent data
++ *
++ * Copyright (C) 2006 Jens Axboe <[EMAIL PROTECTED]>
++ *
++ */
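
To make the layout described above concrete, here is a minimal userspace sketch (not part of the patch) that applies the same formulas fcache_fill_header() uses further down. The 48-byte extent size is an assumption; the real sizeof(struct fcache_extent) depends on sizeof(sector_t) and the prio_tree_node layout.

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;		/* one cache block per page */
	unsigned long cache_blocks = 262144;	/* e.g. a 1 GiB cache partition */
	unsigned long extent_size = 48;		/* assumed sizeof(struct fcache_extent) */

	/* worst case from fcache_fill_header(): one data page per extent */
	unsigned long max_extents = ((cache_blocks - 1) * page_size) /
					(page_size - extent_size);
	/* block 0 is the header, the extent maps follow, data comes after */
	unsigned long extent_offset = 1 + (max_extents * extent_size) / page_size;

	printf("header:      block 0\n");
	printf("extent maps: blocks 1..%lu\n", extent_offset - 1);
	printf("extent data: blocks %lu..%lu\n", extent_offset, cache_blocks - 1);
	return 0;
}

With those assumed numbers, a 1 GiB cache partition reserves roughly 3100 blocks of extent maps between the header block and the data area.
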
++#include <linux/config.h>
++#include <linux/module.h>
++#include <linux/moduleparam.h>
++#include <linux/sched.h>
++#include <linux/blkdev.h>
++#include <linux/prio_tree.h>
++#include <linux/buffer_head.h>
++#include <linux/slab.h>
++
++#define FCACHE_MAGIC  0x61786663
++#define FCACHE_VERSION        0x02
++
++#define FCACHE_HEADER_BLOCK   0
++#define FCACHE_EXTENT_BLOCK   1
++
++#undef FCACHE_PAGES_PROTECTED
++
++struct fcache_dev {
++      struct block_device *bdev;
++      struct block_device *fs_bdev;
++      make_request_fn *mfn;
++      struct prio_tree_root prio_root;
++      unsigned long next_cache_block;
++      unsigned long nr_extents;
++      unsigned long max_extents;
++      unsigned int old_bs;
++      spinlock_t lock;
++
++      sector_t cache_start_sector;
++      unsigned long cache_blocks;
++      sector_t fs_start_sector;
++      sector_t fs_sectors;
++
++      unsigned long flags;
++      int priming;
++      int serial;
++      int chop_ios;
++
++      struct list_head list;
++      struct work_struct work;
++
++      /*
++       * stats
++       */
++      unsigned int ios[2];
++      unsigned int hits;
++      unsigned int misses;
++      unsigned int overwrites;
++};
++
++enum {
++      FDEV_F_DOWN = 0,
++};
++
++static struct fcache_dev fcache_dev;
++
++static int disable;
++module_param(disable, int, 0444);
++
++struct fcache_endio_data {
++      struct fcache_dev *fdev;
++      sector_t fs_sector;
++      unsigned int fs_size;
++      sector_t cache_sector;
++      atomic_t completions;
++      struct bio *bio;
++      int io_error;
++      struct list_head list;
++};
++
++/*
++ * Maps a file system block to the fcache
++ */
++struct fcache_extent {
++      sector_t fs_sector;     /* real device offset */
++      unsigned int fs_size;   /* extent length */
++      sector_t cache_sector;  /* cache device offset */
++
++      struct prio_tree_node prio_node;
++};
++
++/*
++ * Header on fcache device - will take up the first page of data, so
++ * plenty of room to go around.
++ */
++struct fcache_header {
++      u32 magic;              /* fcache magic */
++      u32 version;            /* fcache version */
++      u32 nr_extents;         /* nr of extents in cache */
++      u32 max_extents;        /* max nr of extents available */
++      u32 serial;             /* fs and cache serial */
++      u32 extent_offset;      /* where extents start */
++      u64 fs_start_sector;    /* where fs starts */
++      u64 fs_sectors;         /* how big fs is */
++      char fs_dev[BDEVNAME_SIZE];     /* fs partition */
++      u64 cache_blocks;       /* number of blocks in cache */
++      u64 cache_blocks_used;  /* used blocks in cache */
++      u16 sector_t_size;      /* user space helper */
++      u16 extent_size;        /* user space helper */
++};
++
++#define BLOCK_SHIFT   (PAGE_SHIFT - 9)
++
++static struct kmem_cache *fcache_slab;
++static struct kmem_cache *fcache_fed_slab;
++static mempool_t *fed_pool;
++static struct workqueue_struct *fcache_workqueue;
++
++static int fcache_rw_page_endio(struct bio *bio, unsigned int bytes, int err)
++{
++      if (bio->bi_size)
++              return 1;
++
++      complete(bio->bi_private);
++      return 0;
++}
++
++/*
++ * Writes out a page of data and waits for it to complete.
++ */
++static int fcache_rw_page(struct fcache_dev *fdev, sector_t index,
++                        struct page *page, int rw)
++{
++      DECLARE_COMPLETION(wait);
++      struct bio *bio;
++      int ret = 0;
++
++      bio = bio_alloc(GFP_KERNEL, 1);
++
++      bio->bi_sector = index << BLOCK_SHIFT;
++      bio->bi_bdev = fdev->bdev;
++      bio->bi_rw |= (1 << BIO_RW_SYNC);
++      bio->bi_end_io = fcache_rw_page_endio;
++      bio->bi_private = &wait;
++
++      bio_add_page(bio, page, PAGE_SIZE, 0);
++      submit_bio(rw, bio);
++
++      wait_for_completion(&wait);
++
++      if (!bio_flagged(bio, BIO_UPTODATE))
++              ret = -EIO;
++
++      bio_put(bio);
++      return ret;
++}
++
++static inline void fcache_fill_header(struct fcache_dev *fdev,
++                                    struct fcache_header *header,
++                                    unsigned int nr_extents)
++{
++      /*
++       * See how many pages we need for extent headers, then we know where
++       * to start putting data. Assume worst case of 1 page per extent, and
++       * reserve the first page for the header.
++       */
++
++      header->magic = FCACHE_MAGIC;
++      header->version = FCACHE_VERSION;
++      header->nr_extents = nr_extents;
++      header->max_extents = ((fdev->cache_blocks - 1) * PAGE_SIZE) / (PAGE_SIZE - sizeof(struct fcache_extent));
++      header->serial = fdev->serial;
++
++      header->extent_offset = 1 + (header->max_extents * sizeof(struct fcache_extent) / PAGE_SIZE);
++
++      header->fs_start_sector = fdev->fs_start_sector;
++      header->fs_sectors = fdev->fs_sectors;
++      bdevname(fdev->fs_bdev, header->fs_dev);
++      header->cache_blocks = fdev->cache_blocks;
++      header->cache_blocks_used = fdev->next_cache_block;
++      header->sector_t_size = sizeof(sector_t);
++      header->extent_size = sizeof(struct fcache_extent);
++}
++
++static int fcache_write_new_header(struct fcache_dev *fdev)
++{
++      struct fcache_header *header;
++      struct page *page;
++      int ret;
++
++      page = alloc_page(GFP_HIGHUSER);
++      if (unlikely(!page))
++              return -ENOMEM;
++
++      header = kmap_atomic(page, KM_USER0);
++      clear_page(header);
++      fcache_fill_header(fdev, header, 0);
++      fdev->next_cache_block = header->extent_offset;
++      fdev->max_extents = header->max_extents;
++      kunmap_atomic(header, KM_USER0);
++
++      printk("fcache: new header: first block %lu, max %lu\n",
++                              fdev->next_cache_block, fdev->max_extents);
++      ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
++      __free_page(page);
++      return ret;
++}
++
++static void fcache_free_prio_tree(struct fcache_dev *fdev)
++{
++      struct fcache_extent *fe;
++      struct prio_tree_iter iter;
++      struct prio_tree_node *node;
++
++      /*
++       * Now prune and free tree, wish there was a better way...
++       */
++      do {
++              prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
++
++              node = prio_tree_next(&iter);
++              if (!node)
++                      break;
++
++              fe = prio_tree_entry(node, struct fcache_extent, prio_node);
++              prio_tree_remove(&fdev->prio_root, node);
++              kmem_cache_free(fcache_slab, fe);
++      } while (1);
++}
++
++/*
++ * First clear the header, write extents, then write real header.
++ */
++static int fcache_write_extents(struct fcache_dev *fdev)
++{
++      struct fcache_header *header;
++      sector_t index, sectors;
++      unsigned int nr_extents, this_extents;
++      struct fcache_extent *fe;
++      struct prio_tree_iter iter;
++      struct prio_tree_node *node;
++      struct page *page;
++      void *p;
++      int ret;
++
++      page = alloc_page(GFP_KERNEL);
++      if (unlikely(!page))
++              return -ENOMEM;
++
++      header = page_address(page);
++      clear_page(header);
++      fcache_fill_header(fdev, header, 0);
++      ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
++      if (ret)
++              goto err;
++
++      /*
++       * Now write the extents in page size chunks.
++       */
++      p = page_address(page);
++      clear_page(p);
++      index = FCACHE_EXTENT_BLOCK;
++      sectors = 0;
++      this_extents = nr_extents = 0;
++
++      prio_tree_iter_init(&iter, &fdev->prio_root, 0, ULONG_MAX);
++
++      do {
++              node = prio_tree_next(&iter);
++              if (!node)
++                      break;
++
++              fe = prio_tree_entry(node, struct fcache_extent, prio_node);
++              nr_extents++;
++              this_extents++;
++              sectors += fe->fs_size >> 9;
++              memcpy(p, fe, sizeof(*fe));
++              p += sizeof(*fe);
++              if ((this_extents + 1) * sizeof(*fe) > PAGE_SIZE) {
++                      ret = fcache_rw_page(fdev, index, page, WRITE);
++                      if (ret)
++                              break;
++
++                      this_extents = 0;
++                      index++;
++                      p = page_address(page);
++              }
++      } while (1);
++
++      if (this_extents)
++              ret = fcache_rw_page(fdev, index, page, WRITE);
++
++      fdev->nr_extents = nr_extents;
++      printk("fcache: wrote %d extents, holding %llu sectors of data\n",
++                              nr_extents, (unsigned long long) sectors);
++err:
++      __free_page(page);
++      return ret;
++}
++
++static int fcache_write_header(struct fcache_dev *fdev)
++{
++      struct page *page;
++      int ret;
++
++      page = alloc_page(GFP_KERNEL);
++      if (unlikely(!page))
++              return -ENOMEM;
++
++      ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, READ);
++      if (!ret) {
++              struct fcache_header *header = page_address(page);
++
++              fcache_fill_header(fdev, header, fdev->nr_extents);
++              ret = fcache_rw_page(fdev, FCACHE_HEADER_BLOCK, page, WRITE);
++              printk("fcache: wrote header (extents=%lu,serial=%d)\n",
++                                              fdev->nr_extents, fdev->serial);
++      }
++
++      __free_page(page);
++      return ret;
++}
++
+static void fcache_tree_link(struct fcache_dev *fdev, struct fcache_extent *fe)
++{
++      struct prio_tree_node *node = &fe->prio_node;
++      unsigned long flags;
++
++      INIT_PRIO_TREE_NODE(node);
++      node->start = fe->fs_sector;
++      node->last = fe->fs_sector + (fe->fs_size >> 9) - 1;
++
++      spin_lock_irqsave(&fdev->lock, flags);
++      prio_tree_insert(&fdev->prio_root, node);
++      spin_unlock_irqrestore(&fdev->lock, flags);
++}
++
++#define MAX_FE        16
++
++/*
++ * Look up the range of a given request in the prio tree. Used both for
++ * finding a range covering a read operation to be served from cache,
++ * and for finding potential conflicts between a new write and an existing
++ * extent.
++ */
++static int fcache_lookup_extent(struct fcache_dev *fdev, sector_t offset,
++                              unsigned int bytes, struct fcache_extent **map)
++{
++      sector_t end_sector = offset + (bytes >> 9) - 1;
++      struct prio_tree_node *node;
++      struct prio_tree_iter iter;
++      int i = 0;
++
++      prio_tree_iter_init(&iter, &fdev->prio_root, offset, end_sector);
++
++      /*
++       * We only need to lock, if we are priming. The prio tree does
++       * not change when in normal mode.
++       */
++      if (fdev->priming)
++              spin_lock_irq(&fdev->lock);
++
++      do {
++              node = prio_tree_next(&iter);
++              if (!node)
++                      break;
++
++              map[i] = prio_tree_entry(node, struct fcache_extent, prio_node);
++      } while (++i < MAX_FE);
++
++      if (fdev->priming)
++              spin_unlock_irq(&fdev->lock);
++
++      return i;
++}
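
As a usage illustration (hypothetical helper, not from the patch; the real callers appear later in the diff, beyond the trimmed portion shown here), a caller could use the returned map to decide whether a read is fully covered by a single cached extent:

/* Hypothetical helper, for illustration only. */
static int fcache_read_fully_cached(struct fcache_dev *fdev, sector_t sector,
				    unsigned int bytes)
{
	struct fcache_extent *fe[MAX_FE];
	sector_t last = sector + (bytes >> 9) - 1;
	int i, nr;

	nr = fcache_lookup_extent(fdev, sector, bytes, fe);
	for (i = 0; i < nr; i++) {
		sector_t fe_last = fe[i]->fs_sector + (fe[i]->fs_size >> 9) - 1;

		if (fe[i]->fs_sector <= sector && fe_last >= last)
			return 1;	/* one extent spans the whole request */
	}

	return 0;
}
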
++
++/*
++ * Our data write is done, now insert the fcache extent into the prio tree.
++ */
++static int fcache_instantiate_extent(struct fcache_dev *fdev,
++                                   struct fcache_endio_data *fed)
++{
++      struct fcache_extent *fe;
++
++      fe = kmem_cache_alloc(fcache_slab, GFP_ATOMIC);
++      if (fe) {
++              fe->fs_sector = fed->fs_sector;
++              fe->fs_size = fed->fs_size;
++              fe->cache_sector = fed->cache_sector;
++
++              fcache_tree_link(fdev, fe);
++              return 0;
++      }
++
++      return -ENOMEM;
++}
++
++/*
++ * Hang on to the bio and its pages - ideally we would want to ensure
++ * that the page data doesn't change between calling this function and
++ * fcache_put_bio_pages() as well...
++ */
++static void fcache_get_bio_pages(struct fcache_dev *fdev, struct bio *bio)
++{
++      /*
++       * Currently stubbed out, as we cannot end the bio read before
++       * the write completes without also making sure that the pages
++       * don't get reused for something else in the meantime.
++       */
++#ifdef FCACHE_PAGES_PROTECTED
++      struct bio_vec *bvec;
++      int i;
++
++      bio_get(bio);
++
++      __bio_for_each_segment(bvec, bio, i, 0)
++              get_page(bvec->bv_page);
++#endif
++}
++
++static void fcache_put_bio_pages(struct fcache_dev *fdev, struct bio *bio)
++{
++#ifdef FCACHE_PAGES_PROTECTED
++      struct bio_vec *bvec;
++      int i;
++
++      __bio_for_each_segment(bvec, bio, i, 0)
++              put_page(bvec->bv_page);
++
++      bio_put(bio);
++#endif
++}
++
++static void fcache_chop_write_done(struct fcache_endio_data *fed)
++{
++      /*
++       * Last io completes.
++       */
++      if (atomic_dec_and_test(&fed->completions)) {
++              struct fcache_dev *fdev = fed->fdev;
++              struct bio *bio = fed->bio;
++
++              /*
++               * Release our reference to the original bio and
++               * its pages.
++               */
++              fcache_put_bio_pages(fdev, bio);
++
++              /*
++               * End the read!
++               */
++              bio_endio(bio, bio->bi_size, 0);
++
++              /*
++               * All done, now add extent to our list if io completed ok.
++               */
++              if (!fed->io_error)
++                      fcache_instantiate_extent(fdev, fed);
++
++              mempool_free(fed, fed_pool);
++      }
++}
++
++/*
++ * Our data write to the cache completes, we can free our clone and
++ * instantiate the extent block.
++ */
++static int fcache_extent_write_endio(struct bio *bio, unsigned int bytes,
++                                   int err)
++{
++      struct fcache_endio_data *fed;
++
++      if (bio->bi_size)
++              return 1;
++
++      fed = bio->bi_private;
++
++      if (!bio_flagged(bio, BIO_UPTODATE))
++              fed->io_error = -EIO;
++
++      bio_put(bio);
++      fcache_chop_write_done(fed);
++      return 0;
++}
++
++static void fcache_chop_read_done(struct fcache_endio_data *fed)
++{
++      if (atomic_dec_and_test(&fed->completions)) {
++              struct bio *bio = fed->bio;
++
++              bio_endio(bio, bio->bi_size, fed->io_error);
++              mempool_free(fed, fed_pool);
++      }
++}
++
+static int fcache_chop_read_endio(struct bio *bio, unsigned int bytes, int err)
++{
++      struct fcache_endio_data *fed;
++
++      if (bio->bi_size)
++              return 1;
++
++      fed = bio->bi_private;
++
++      if (!bio_flagged(bio, BIO_UPTODATE))
++              fed->io_error = -EIO;
++
++      bio_put(bio);
++      fcache_chop_read_done(fed);
++      return 0;
++}
++
++typedef void (chopper_done_t) (struct fcache_endio_data *);
++
++/*
++ * This is our io chopper - it hacks a bio into smaller pieces, suitable
++ * for the target device. Caller supplies suitable end_io and done functions.
++ */
++static void fcache_io_chopper(struct fcache_dev *fdev,
++                            struct fcache_endio_data *fed,
++                            bio_end_io_t *endio, chopper_done_t *done, int rw)
++{
++      struct bio *bio = NULL;
++      struct bio_vec *bv;
++      unsigned int total_bytes;
++      sector_t sector;
++      int i, vecs;
++
++      /*
++       * Make sure 'fed' doesn't disappear while we are still issuing
++       * ios, the artificial reference is dropped at the end.
++       */
++      atomic_set(&fed->completions, 1);
++
++      sector = fed->cache_sector;
++      total_bytes = fed->fs_size;
++      vecs = fed->bio->bi_vcnt;
++      bio_for_each_segment(bv, fed->bio, i) {
<<Diff was trimmed, longer than 597 lines>>
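
The body of fcache_io_chopper() falls inside the trimmed portion above. Below is a hedged sketch of the pattern its opening lines and the *_chop_*_done() helpers suggest: a simplified one-bio-per-segment reconstruction, not the trimmed code. The GFP_NOIO flag and the omitted allocation-failure handling are assumptions.

/* Illustrative sketch only - the real loop is in the trimmed part of the diff. */
static void fcache_io_chopper_sketch(struct fcache_dev *fdev,
				     struct fcache_endio_data *fed,
				     bio_end_io_t *endio, chopper_done_t *done,
				     int rw)
{
	struct bio_vec *bv;
	sector_t sector = fed->cache_sector;
	int i;

	/* artificial reference: 'done' cannot fire before every piece is issued */
	atomic_set(&fed->completions, 1);

	bio_for_each_segment(bv, fed->bio, i) {
		struct bio *split = bio_alloc(GFP_NOIO, 1);

		split->bi_sector = sector;
		split->bi_bdev = fdev->bdev;
		split->bi_end_io = endio;
		split->bi_private = fed;
		bio_add_page(split, bv->bv_page, bv->bv_len, bv->bv_offset);

		atomic_inc(&fed->completions);
		sector += bv->bv_len >> 9;
		submit_bio(rw, split);
	}

	/* drop the artificial reference; runs 'done' if all ios already finished */
	done(fed);
}
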