Author: glen                         Date: Wed Nov 14 00:07:39 2007 GMT
Module: SOURCES                       Tag: HEAD
---- Log message:
- DM-Cache: A Generic Block-level Disk Cache - http://www.acis.ufl.edu/~ming/dmcache/index.html
  saved http://www.acis.ufl.edu/~ming/dmcache/patch-2.6.21
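
For reference, a minimal sketch of how a target like this is typically
activated with dmsetup, assuming the module registers the target name
"cache" (suggested by DM_MSG_PREFIX in the patch below). The device names
are placeholders, and the optional constructor arguments (block size, cache
size, associativity, write policy) are left as "..." because cache_ctr(),
which parses them, is in the portion of the diff trimmed below. With the
compiled-in defaults the cache is 65536 blocks of 8 sectors each, i.e.
256 MB of 4 KB blocks, 1024-way set associative, write-through.

  # Hypothetical example: cache I/O to /dev/sdb, using /dev/sdc as the
  # cache device. SRC_SECTORS is the size of /dev/sdb in 512-byte sectors;
  # the table format is "<start> <length> <target> <args>".
  echo "0 $SRC_SECTORS cache /dev/sdb /dev/sdc ..." | dmsetup create dmcache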

---- Files affected:
SOURCES:
   linux-dmcache.patch (NONE -> 1.1)  (NEW)

---- Diffs:

================================================================
Index: SOURCES/linux-dmcache.patch
diff -u /dev/null SOURCES/linux-dmcache.patch:1.1
--- /dev/null   Wed Nov 14 01:07:39 2007
+++ SOURCES/linux-dmcache.patch Wed Nov 14 01:07:34 2007
@@ -0,0 +1,1797 @@
+diff -Naur linux-2.6.21.7-orig/drivers/md/dm-cache.c linux-2.6.21.7-dmcache/drivers/md/dm-cache.c
+--- linux-2.6.21.7-orig/drivers/md/dm-cache.c  1969-12-31 19:00:00.000000000 -0500
++++ linux-2.6.21.7-dmcache/drivers/md/dm-cache.c       2007-08-23 14:10:58.000000000 -0400
+@@ -0,0 +1,1766 @@
++/****************************************************************************
++ *  dm-cache.c
++ *  Device mapper target for block-level disk caching
++ *
++ *  Copyright (C) International Business Machines Corp., 2006
++ *  Author: Ming Zhao ([EMAIL PROTECTED])
++ *
++ *  This program is free software; you can redistribute it and/or modify
++ *  it under the terms of the GNU General Public License as published by
++ *  the Free Software Foundation; under version 2 of the License.
++ *
++ *  This program is distributed in the hope that it will be useful,
++ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
++ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++ *  GNU General Public License for more details.
++ *
++ *  You should have received a copy of the GNU General Public License
++ *  along with this program; if not, write to the Free Software
++ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
++ *
++ ****************************************************************************/
++
++#include <asm/atomic.h>
++#include <asm/checksum.h>
++#include <linux/module.h>
++#include <linux/init.h>
++#include <linux/list.h>
++#include <linux/blkdev.h>
++#include <linux/bio.h>
++#include <linux/slab.h>
++#include <linux/hash.h>
++#include <linux/spinlock.h>
++#include <linux/workqueue.h>
++#include <linux/pagemap.h>
++
++#include "dm.h"
++#include "dm-io.h"
++#include "dm-bio-list.h"
++#include "kcopyd.h"
++
++#define DMC_DEBUG 0
++
++#define DM_MSG_PREFIX "cache"
++#define DMC_PREFIX "dm-cache: "
++
++#if DMC_DEBUG
++#define DPRINTK( s, arg... ) printk(DMC_PREFIX s "\n", ##arg)
++#else
++#define DPRINTK( s, arg... )
++#endif
++
++/* Default cache parameters */
++#define DEFAULT_CACHE_SIZE    65536
++#define DEFAULT_CACHE_ASSOC   1024
++#define DEFAULT_BLOCK_SIZE    8
++#define CONSECUTIVE_BLOCKS    512
++
++/* Write policy */
++#define WRITE_THROUGH 0
++#define WRITE_BACK 1
++#define DEFAULT_WRITE_POLICY WRITE_THROUGH
++
++/* Number of pages for I/O */
++#define DMCACHE_COPY_PAGES 1024
++
++/* States of a cache block */
++#define INVALID               0
++#define VALID         1       /* Valid */
++#define RESERVED      2       /* Allocated but data not in place yet */
++#define DIRTY         4       /* Locally modified */
++#define WRITEBACK     8       /* In the process of write back */
++
++#define is_state(x, y)                (x & y)
++#define set_state(x, y)               (x |= y)
++#define clear_state(x, y)     (x &= ~y)
++
++/*
++ * Cache context
++ */
++struct cache_c {
++      struct dm_dev *src_dev;         /* Source device */
++      struct dm_dev *cache_dev;       /* Cache device */
++      struct kcopyd_client *kcp_client; /* Kcopyd client for writing back data */
++
++      struct cacheblock *cache;       /* Hash table for cache blocks */
++      sector_t size;                  /* Cache size */
++      unsigned int bits;              /* Cache size in bits */
++      unsigned int assoc;             /* Cache associativity */
++      unsigned int block_size;        /* Cache block size */
++      unsigned int block_shift;       /* Cache block size in bits */
++      unsigned int block_mask;        /* Cache block mask */
++      unsigned int consecutive_shift; /* Consecutive blocks size in bits */
++      unsigned long counter;          /* Logical timestamp of last access */
++      unsigned int write_policy;      /* Cache write policy */
++      sector_t dirty_blocks;          /* Number of dirty blocks */
++
++      spinlock_t lock;                /* Lock to protect page allocation/deallocation */
++      struct page_list *pages;        /* Pages for I/O */
++      unsigned int nr_pages;          /* Number of pages */
++      unsigned int nr_free_pages;     /* Number of free pages */
++      wait_queue_head_t destroyq;     /* Wait queue for I/O completion */
++      atomic_t nr_jobs;               /* Number of I/O jobs */
++      /* Stats */
++      unsigned long reads;            /* Number of reads */
++      unsigned long writes;           /* Number of writes */
++      unsigned long cache_hits;       /* Number of cache hits */
++      unsigned long replace;          /* Number of cache replacements */
++      unsigned long writeback;        /* Number of replaced dirty blocks */
++      unsigned long dirty;            /* Number of submitted dirty blocks */
++};
++
++/* Cache block metadata structure */
++struct cacheblock {
++      spinlock_t lock;        /* Lock to protect operations on the bio list */
++      sector_t block;         /* Sector number of the cached block */
++      unsigned short state;   /* State of a block */
++      unsigned long counter;  /* Logical timestamp of the block's last access */
++      struct bio_list bios;   /* List of pending bios */
++};
++
++
++/****************************************************************************
++ *  Functions and data structures for implementing a kcached to handle async
++ *  I/O. Code for page and queue handling is borrowed from kcopyd.c.
++ ****************************************************************************/
++
++/*
++ * Functions for handling pages used by async I/O.
++ * The data requested by a bio may not be aligned with cache blocks, in
++ * which case additional pages are required to pad the request that is
++ * forwarded to the server. A pool of pages is reserved for this purpose.
++ */
++
++static struct page_list *alloc_pl(void)
++{
++      struct page_list *pl;
++
++      pl = kmalloc(sizeof(*pl), GFP_KERNEL);
++      if (!pl)
++              return NULL;
++
++      pl->page = alloc_page(GFP_KERNEL);
++      if (!pl->page) {
++              kfree(pl);
++              return NULL;
++      }
++
++      return pl;
++}
++
++static void free_pl(struct page_list *pl)
++{
++      __free_page(pl->page);
++      kfree(pl);
++}
++
++static void drop_pages(struct page_list *pl)
++{
++      struct page_list *next;
++
++      while (pl) {
++              next = pl->next;
++              free_pl(pl);
++              pl = next;
++      }
++}
++
++static int kcached_get_pages(struct cache_c *dmc, unsigned int nr,
++                               struct page_list **pages)
++{
++      struct page_list *pl;
++
++      spin_lock(&dmc->lock);
++      if (dmc->nr_free_pages < nr) {
++              DPRINTK("kcached_get_pages: No free pages: %u<%u",
++                      dmc->nr_free_pages, nr);
++              spin_unlock(&dmc->lock);
++              return -ENOMEM;
++      }
++
++      dmc->nr_free_pages -= nr;
++      for (*pages = pl = dmc->pages; --nr; pl = pl->next)
++              ;
++
++      dmc->pages = pl->next;
++      pl->next = NULL;
++
++      spin_unlock(&dmc->lock);
++
++      return 0;
++}
++
++static void kcached_put_pages(struct cache_c *dmc, struct page_list *pl)
++{
++      struct page_list *cursor;
++
++      spin_lock(&dmc->lock);
++      for (cursor = pl; cursor->next; cursor = cursor->next)
++              dmc->nr_free_pages++;
++
++      dmc->nr_free_pages++;
++      cursor->next = dmc->pages;
++      dmc->pages = pl;
++
++      spin_unlock(&dmc->lock);
++}
++
++static int alloc_bio_pages(struct cache_c *dmc, unsigned int nr)
++{
++      unsigned int i;
++      struct page_list *pl = NULL, *next;
++
++      for (i = 0; i < nr; i++) {
++              next = alloc_pl();
++              if (!next) {
++                      if (pl)
++                              drop_pages(pl);
++                      return -ENOMEM;
++              }
++              next->next = pl;
++              pl = next;
++      }
++
++      kcached_put_pages(dmc, pl);
++      dmc->nr_pages += nr;
++
++      return 0;
++}
++
++static void free_bio_pages(struct cache_c *dmc)
++{
++      BUG_ON(dmc->nr_free_pages != dmc->nr_pages);
++      drop_pages(dmc->pages);
++      dmc->pages = NULL;
++      dmc->nr_free_pages = dmc->nr_pages = 0;
++}
++
++/* Structure for a kcached job */
++struct kcached_job {
++      struct list_head list;
++      struct cache_c *dmc;
++      struct bio *bio;        /* Original bio */
++      struct io_region src;
++      struct io_region dest;
++      struct cacheblock *cacheblock;
++      int rw;
++      /*
++       * When the original bio is not aligned with cache blocks,
++       * we need extra bvecs and pages for padding.
++       */
++      struct bio_vec *bvec;
++      unsigned int nr_pages;
++      struct page_list *pages;
++};
++
++static struct workqueue_struct *_kcached_wq;
++static struct work_struct _kcached_work;
++
++static inline void wake(void)
++{
++      queue_work(_kcached_wq, &_kcached_work);
++}
++
++#define MIN_JOBS 1024
++
++static struct kmem_cache *_job_cache;
++static mempool_t *_job_pool;
++
++static DEFINE_SPINLOCK(_job_lock);
++
++static LIST_HEAD(_complete_jobs);
++static LIST_HEAD(_io_jobs);
++static LIST_HEAD(_pages_jobs);
++
++static int jobs_init(void)
++{
++      _job_cache = kmem_cache_create("kcached-jobs",
++                                     sizeof(struct kcached_job),
++                                     __alignof__(struct kcached_job),
++                                     0, NULL, NULL);
++      if (!_job_cache)
++              return -ENOMEM;
++
++      _job_pool = mempool_create(MIN_JOBS, mempool_alloc_slab,
++                                 mempool_free_slab, _job_cache);
++      if (!_job_pool) {
++              kmem_cache_destroy(_job_cache);
++              return -ENOMEM;
++      }
++
++      return 0;
++}
++
++static void jobs_exit(void)
++{
++      BUG_ON(!list_empty(&_complete_jobs));
++      BUG_ON(!list_empty(&_io_jobs));
++      BUG_ON(!list_empty(&_pages_jobs));
++
++      mempool_destroy(_job_pool);
++      kmem_cache_destroy(_job_cache);
++      _job_pool = NULL;
++      _job_cache = NULL;
++}
++
++/*
++ * Functions to push and pop a job onto the head of a given job list.
++ */
++static inline struct kcached_job *pop(struct list_head *jobs)
++{
++      struct kcached_job *job = NULL;
++      unsigned long flags;
++
++      spin_lock_irqsave(&_job_lock, flags);
++
++      if (!list_empty(jobs)) {
++              job = list_entry(jobs->next, struct kcached_job, list);
++              list_del(&job->list);
++      }
++      spin_unlock_irqrestore(&_job_lock, flags);
++
++      return job;
++}
++
++static inline void push(struct list_head *jobs, struct kcached_job *job)
++{
++      unsigned long flags;
++
++      spin_lock_irqsave(&_job_lock, flags);
++      list_add_tail(&job->list, jobs);
++      spin_unlock_irqrestore(&_job_lock, flags);
++}
++
++
++/****************************************************************************
++ * Functions for asynchronously fetching data from source device and storing
++ * data in cache device. Because the requested data may not align with the
++ * cache blocks, extra handling is required to pad a block request and extract
++ * the requested data from the results.
++ ****************************************************************************/
++
++static void io_callback(unsigned long error, void *context)
++{
++      struct kcached_job *job = (struct kcached_job *) context;
++
++      if (error) {
++              /* TODO */
++              DMERR("io_callback: io error");
++              return;
++      }
++
++      if (job->rw == READ) {
++              job->rw = WRITE;
++              push(&_io_jobs, job);
++      } else
++              push(&_complete_jobs, job);
++      wake();
++}
++
++/*
++ * Fetch data from the source device asynchronously.
++ * For a READ bio, if a cache block is larger than the requested data, then
++ * additional data are prefetched. Larger cache block size enables more
++ * aggressive read prefetching, which is useful for read-mostly usage.
++ * For a WRITE bio, if a cache block is larger than the requested data, the
++ * entire block needs to be fetched, and a larger block size incurs more overhead.
++ * In scenarios where writes are frequent, 4KB is a good cache block size.
++ */
++static int do_fetch(struct kcached_job *job)
++{
++      int r = 0, i, j;
++      struct bio *bio = job->bio;
++      struct cache_c *dmc = job->dmc;
++      unsigned int offset, head, tail, remaining, nr_vecs, idx = 0;
++      struct bio_vec *bvec;
++      struct page_list *pl;
++
++      offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
++      head = to_bytes(offset);
++      tail = to_bytes(dmc->block_size) - bio->bi_size - head;
++
++      DPRINTK("do_fetch: %llu(%llu->%llu,%llu), head:%u,tail:%u",
++              bio->bi_sector, job->src.sector, job->dest.sector,
++              job->src.count, head, tail);
++
++      if (bio_data_dir(bio) == READ) { /* The original request is a READ */
++              if (0 == job->nr_pages) { /* The request is aligned to cache block */
++                      r = dm_io_async_bvec(1, &job->src, READ,
++                                           bio->bi_io_vec + bio->bi_idx,
++                                           io_callback, job);
++                      return r;
++              }
++
++              nr_vecs = bio->bi_vcnt - bio->bi_idx + job->nr_pages;
++              bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_NOIO);
++              if (!bvec) {
++                      DMERR("do_fetch: No memory");
++                      return 1;
++              }
++
++              pl = job->pages;
++              i = 0;
++              while (head) {
++                      bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
++                      bvec[i].bv_offset = 0;
++                      bvec[i].bv_page = pl->page;
++                      head -= bvec[i].bv_len;
++                      pl = pl->next;
++                      i++;
++              }
++
++              remaining = bio->bi_size;
++              j = bio->bi_idx;
++              while (remaining) {
++                      bvec[i] = bio->bi_io_vec[j];
++                      remaining -= bvec[i].bv_len;
++                      i++; j++;
++              }
++
++              while (tail) {
++                      bvec[i].bv_len = min(tail, (unsigned int)PAGE_SIZE);
++                      bvec[i].bv_offset = 0;
++                      bvec[i].bv_page = pl->page;
++                      tail -= bvec[i].bv_len;
++                      pl = pl->next;
++                      i++;
++              }
++
++              job->bvec = bvec;
++              r = dm_io_async_bvec(1, &job->src, READ, job->bvec, io_callback, job);
++              return r;
++      } else { /* The original request is a WRITE */
++              pl = job->pages;
++
++              if (head && tail) { /* Special case */
++                      bvec = kmalloc(job->nr_pages * sizeof(*bvec), GFP_KERNEL);
++                      if (!bvec) {
++                              DMERR("do_fetch: No memory");
++                              return 1;
++                      }
++                      for (i=0; i<job->nr_pages; i++) {
++                              bvec[i].bv_len = PAGE_SIZE;
++                              bvec[i].bv_offset = 0;
++                              bvec[i].bv_page = pl->page;
++                              pl = pl->next;
++                      }
++                      job->bvec = bvec;
++                      r = dm_io_async_bvec(1, &job->src, READ, job->bvec,
++                                           io_callback, job);
++                      return r;
++              }
++
++              bvec = kmalloc((job->nr_pages + bio->bi_vcnt - bio->bi_idx)
++                              * sizeof(*bvec), GFP_KERNEL);
++              if (!bvec) {
++                      DMERR("do_fetch: No memory");
++                      return 1;
++              }
++
++              i = 0;
++              while (head) {
++                      bvec[i].bv_len = min(head, (unsigned int)PAGE_SIZE);
++                      bvec[i].bv_offset = 0;
++                      bvec[i].bv_page = pl->page;
++                      head -= bvec[i].bv_len;
++                      pl = pl->next;
++                      i++;
++              }
++
++              remaining = bio->bi_size;
++              j = bio->bi_idx;
++              while (remaining) {
++                      bvec[i] = bio->bi_io_vec[j];
++                      remaining -= bvec[i].bv_len;
++                      i++; j++;
++              }
++
++              if (tail) {
++                      idx = i;
++                      bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) &
++                                          (PAGE_SIZE - 1);
++                      bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
++                      bvec[i].bv_page = pl->page;
++                      tail -= bvec[i].bv_len;
++                      pl = pl->next; i++;
++                      while (tail) {
++                              bvec[i].bv_len = PAGE_SIZE;
++                              bvec[i].bv_offset = 0;
++                              bvec[i].bv_page = pl->page;
++                              tail -= bvec[i].bv_len;
++                              pl = pl->next; i++;
++                      }
++              }
++
++              job->bvec = bvec;
++              r = dm_io_async_bvec(1, &job->src, READ, job->bvec + idx,
++                                   io_callback, job);
++
++              return r;
++      }
++}
++
++/*
++ * Store data to the cache source device asynchronously.
++ * For a READ bio request, the data fetched from the source device are returned
++ * to the kernel and stored in cache at the same time.
++ * For a WRITE bio request, the data are written to the cache and source device
++ * at the same time.
++ */
++static int do_store(struct kcached_job *job)
++{
++      int i, j, r = 0;
++      struct bio *bio = job->bio, *clone;
++      struct cache_c *dmc = job->dmc;
++      unsigned int offset, head, tail, remaining, nr_vecs;
++      struct bio_vec *bvec;
++
++      offset = (unsigned int) (bio->bi_sector & dmc->block_mask);
++      head = to_bytes(offset);
++      tail = to_bytes(dmc->block_size) - bio->bi_size - head;
++
++      DPRINTK("do_store: %llu(%llu->%llu,%llu), head:%u,tail:%u",
++              bio->bi_sector, job->src.sector, job->dest.sector,
++              job->src.count, head, tail);
++
++      /* A READ is acknowledged as soon as the requested data is fetched, and
++         does not have to wait for it to be stored in cache. The bio is cloned
++         so that the original one can be ended here. But to avoid copying
++         pages, we reuse the pages allocated for the original bio, and mark
++         each of them to prevent the pages from being freed before the cache
++         insertion is completed.
++       */
++      if (bio_data_dir(bio) == READ) {
++              clone = bio_clone(bio, GFP_NOIO);
++              for (i=bio->bi_idx; i<bio->bi_vcnt; i++) {
++                      get_page(bio->bi_io_vec[i].bv_page);
++              }
++              DPRINTK("bio ended for %llu:%u", bio->bi_sector, bio->bi_size);
++              bio_endio(bio, bio->bi_size, 0);
++              bio = clone;
++              job->bio = clone;
++      }
++
++      if (0 == job->nr_pages) /* Original request is aligned with cache blocks */
++              r = dm_io_async_bvec(1, &job->dest, WRITE, bio->bi_io_vec + bio->bi_idx,
++                                   io_callback, job);
++      else {
++              if (bio_data_dir(bio) == WRITE && head > 0 && tail > 0) {
++                      DPRINTK("Special case: %lu %u %u", bio_data_dir(bio), head, tail);
++                      nr_vecs = job->nr_pages + bio->bi_vcnt - bio->bi_idx;
++                      if (offset && (offset + bio->bi_size < PAGE_SIZE)) nr_vecs++;
++                      DPRINTK("Create %u new vecs", nr_vecs);
++                      bvec = kmalloc(nr_vecs * sizeof(*bvec), GFP_KERNEL);
++                      if (!bvec) {
++                              DMERR("do_store: No memory");
++                              return 1;
++                      }
++
++                      i = 0;
++                      while (head) {
++                              bvec[i].bv_len = min(head, job->bvec[i].bv_len);
++                              bvec[i].bv_offset = 0;
++                              bvec[i].bv_page = job->bvec[i].bv_page;
++                              head -= bvec[i].bv_len;
++                              i++;
++                      }
++                      remaining = bio->bi_size;
++                      j = bio->bi_idx;
++                      while (remaining) {
++                              bvec[i] = bio->bi_io_vec[j];
++                              remaining -= bvec[i].bv_len;
++                              i++; j++;
++                      }
++                      j = (to_bytes(offset) + bio->bi_size) / PAGE_SIZE;
++                      bvec[i].bv_offset = (to_bytes(offset) + bio->bi_size) -
++                                          j * PAGE_SIZE;
++                      bvec[i].bv_len = PAGE_SIZE - bvec[i].bv_offset;
++                      bvec[i].bv_page = job->bvec[j].bv_page;
++                      tail -= bvec[i].bv_len;
++                      i++; j++;
++                      while (tail) {
++                              bvec[i] = job->bvec[j];
++                              tail -= bvec[i].bv_len;
++                              i++; j++;
++                      }
++                      kfree(job->bvec);
++                      job->bvec = bvec;
<<Diff was trimmed, longer than 597 lines>>
