Generic reserve management code. 

It provides methods to reserve and charge. On top of this, generic
alloc/free style reserve pools can be built, which could fully replace
mempool_t functionality.

It should also allow for a Banker's-algorithm style replacement of
__GFP_NOFAIL.
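
For illustration, a minimal sketch of the intended usage; my_res, the
labels and the page allocation itself are made up, only the mem_reserve_*
calls are the API this patch adds:

        /* setup, e.g. at driver init */
        struct mem_reserve my_res;

        mem_reserve_init(&my_res, "my reserve", &mem_reserve_root);
        if (mem_reserve_pages_set(&my_res, 16))
                goto err;       /* could not grow the global reserve */

        /* alloc path: charge first, uncharge again if the alloc fails */
        if (!mem_reserve_pages_charge(&my_res, 1, 0))
                goto no_mem;
        page = alloc_page(GFP_ATOMIC);
        if (!page) {
                mem_reserve_pages_charge(&my_res, -1, 0);
                goto no_mem;
        }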

Signed-off-by: Peter Zijlstra <[EMAIL PROTECTED]>
---
 include/linux/reserve.h |   54 +++++
 mm/Makefile             |    2 
 mm/reserve.c            |  438 ++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 493 insertions(+), 1 deletion(-)

Index: linux-2.6/include/linux/reserve.h
===================================================================
--- /dev/null
+++ linux-2.6/include/linux/reserve.h
@@ -0,0 +1,54 @@
+/*
+ * Memory reserve management.
+ *
+ *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <[EMAIL PROTECTED]>
+ *
+ * This file contains the public data structure and API definitions.
+ */
+
+#ifndef _LINUX_RESERVE_H
+#define _LINUX_RESERVE_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+struct mem_reserve {
+       struct mem_reserve *parent;
+       struct list_head children;
+       struct list_head siblings;
+
+       const char *name;
+
+       long pages;
+       long limit;
+       long usage;
+       spinlock_t lock;        /* protects limit and usage */
+};
+
+extern struct mem_reserve mem_reserve_root;
+
+void mem_reserve_init(struct mem_reserve *res, const char *name,
+                     struct mem_reserve *parent);
+int mem_reserve_connect(struct mem_reserve *new_child,
+                       struct mem_reserve *node);
+int mem_reserve_disconnect(struct mem_reserve *node);
+
+int mem_reserve_pages_set(struct mem_reserve *res, long pages);
+int mem_reserve_pages_add(struct mem_reserve *res, long pages);
+int mem_reserve_pages_charge(struct mem_reserve *res, long pages,
+                            int overcommit);
+
+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes);
+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes,
+                              int overcommit);
+
+struct kmem_cache;
+
+int mem_reserve_kmem_cache_set(struct mem_reserve *res,
+                              struct kmem_cache *s,
+                              int objects);
+int mem_reserve_kmem_cache_charge(struct mem_reserve *res,
+                                 long objs,
+                                 int overcommit);
+
+#endif /* _LINUX_RESERVE_H */
Index: linux-2.6/mm/Makefile
===================================================================
--- linux-2.6.orig/mm/Makefile
+++ linux-2.6/mm/Makefile
@@ -11,7 +11,7 @@ obj-y                 := bootmem.o filemap.o mempool.o
                           page_alloc.o page-writeback.o pdflush.o \
                           readahead.o swap.o truncate.o vmscan.o \
                           prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
-                          page_isolation.o $(mmu-y)
+                          page_isolation.o reserve.o $(mmu-y)
 
 obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o
 obj-$(CONFIG_BOUNCE)   += bounce.o
Index: linux-2.6/mm/reserve.c
===================================================================
--- /dev/null
+++ linux-2.6/mm/reserve.c
@@ -0,0 +1,438 @@
+/*
+ * Memory reserve management.
+ *
+ *  Copyright (C) 2007, Red Hat, Inc., Peter Zijlstra <[EMAIL PROTECTED]>
+ *
+ * Description:
+ *
+ * Manage a set of memory reserves.
+ *
+ * A memory reserve is a reserve for a specified number of objects of a
+ * specified size. Since memory is managed in pages, this reserve demand
+ * is then translated into a page unit.
+ *
+ * So each reserve has a specified object limit, an object usage count and a
+ * number of pages required to back these objects.
+ *
+ * Usage is charged against a reserve; if the charge fails, the resource
+ * must not be allocated/used.
+ *
+ * The reserves are managed in a tree, and the resource demands (pages and
+ * limit) are propagated up the tree. Obviously the object limit becomes
+ * meaningless as soon as units start mixing, but the required page reserve
+ * (being of one unit) is still valid at the root.
+ *
+ * It is the page demand of the root node that is used to set the global
+ * reserve (adjust_memalloc_reserve() which sets zone->pages_emerg).
+ *
+ * As long as a subtree has the same usage unit, an aggregate node can be
+ * used to charge against, instead of the leaf nodes. However, be consistent
+ * about which node is charged; resource usage is not propagated up the tree
+ * (for performance reasons).
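+ *
+ * Illustrative sketch (the reserve names match the /proc/reserve_info
+ * sample below; the variable names are made up):
+ *
+ *   mem_reserve_init(&skb_res, "SKB data reserve", &net_rx_res);
+ *   mem_reserve_init(&frag6_res, "IPv6 fragment cache", &skb_res);
+ *   mem_reserve_init(&frag4_res, "IPv4 fragment cache", &skb_res);
+ *
+ * Both fragment caches use the same unit, so either could charge the
+ * aggregate skb_res instead of its own leaf, provided it then always
+ * charges skb_res.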
+ */
+
+#include <linux/reserve.h>
+#include <linux/mutex.h>
+#include <linux/mmzone.h>
+#include <linux/log2.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+static DEFINE_MUTEX(mem_reserve_mutex);
+
+/**
+ * mem_reserve_root - the global reserve root
+ *
+ * The global reserve is empty and has no limit or unit; it merely
+ * acts as an aggregation point for reserves and an interface to
+ * adjust_memalloc_reserve().
+ */
+struct mem_reserve mem_reserve_root = {
+       .children = LIST_HEAD_INIT(mem_reserve_root.children),
+       .siblings = LIST_HEAD_INIT(mem_reserve_root.siblings),
+       .name = "total reserve",
+       .lock = __SPIN_LOCK_UNLOCKED(mem_reserve_root.lock),
+};
+
+EXPORT_SYMBOL_GPL(mem_reserve_root);
+
+/**
+ * mem_reserve_init - initialize a memory reserve object
+ * @res - the new reserve object
+ * @name - a name for this reserve
+ * @parent - the parent reserve to connect to, or NULL
+ */
+void mem_reserve_init(struct mem_reserve *res, const char *name,
+                     struct mem_reserve *parent)
+{
+       memset(res, 0, sizeof(*res));
+       INIT_LIST_HEAD(&res->children);
+       INIT_LIST_HEAD(&res->siblings);
+       res->name = name;
+       spin_lock_init(&res->lock);
+
+       if (parent)
+               mem_reserve_connect(res, parent);
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_init);
+
+/*
+ * propagate the pages and limit changes up the tree.
+ */
+static void __calc_reserve(struct mem_reserve *res, long pages, long limit)
+{
+       unsigned long flags;
+
+       for ( ; res; res = res->parent) {
+               res->pages += pages;
+
+               if (limit) {
+                       spin_lock_irqsave(&res->lock, flags);
+                       res->limit += limit;
+                       spin_unlock_irqrestore(&res->lock, flags);
+               }
+       }
+}
+
+/**
+ * __mem_reserve_add - primitive to change the size of a reserve
+ * @res - reserve to change
+ * @pages - page delta
+ * @limit - usage limit delta
+ *
+ * Returns -ENOMEM when a size increase is not possible at this time.
+ */
+static int __mem_reserve_add(struct mem_reserve *res, long pages, long limit)
+{
+       int ret = 0;
+       long reserve;
+
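+       /*
+        * Propagate the page delta up to the root first; only the change
+        * in the root total requires adjusting the global reserve, and on
+        * failure the propagation is undone.
+        */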
+       reserve = mem_reserve_root.pages;
+       __calc_reserve(res, pages, 0);
+       reserve = mem_reserve_root.pages - reserve;
+
+       if (reserve) {
+               ret = adjust_memalloc_reserve(reserve);
+               if (ret)
+                       __calc_reserve(res, -pages, 0);
+       }
+
+       if (!ret)
+               __calc_reserve(res, 0, limit);
+
+       return ret;
+}
+
+/**
+ * __mem_reserve_charge - primitive to charge object usage to a reserve
+ * @res - reserve to charge
+ * @charge - size of the charge
+ * @overcommit - allow the charge despite the limit (use with caution!)
+ *
+ * Returns non-zero on success, zero on failure.
+ */
+static
+int __mem_reserve_charge(struct mem_reserve *res, long charge, int overcommit)
+{
+       unsigned long flags;
+       int ret = 0;
+
+       spin_lock_irqsave(&res->lock, flags);
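+       /*
+        * Uncharges (charge < 0) always succeed, with usage clamped at
+        * zero; charges succeed only while usage stays below the limit,
+        * unless overcommit is set.
+        */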
+       if (charge < 0 || res->usage + charge < res->limit || overcommit) {
+               res->usage += charge;
+               if (unlikely(res->usage < 0))
+                       res->usage = 0;
+               ret = 1;
+       }
+       spin_unlock_irqrestore(&res->lock, flags);
+
+       return ret;
+}
+
+/**
+ * mem_reserve_connect - connect a reserve to another in a child-parent relation
+ * @new_child - the reserve node to connect (child)
+ * @node - the reserve node to connect to (parent)
+ *
+ * Returns -ENOMEM when the new connection would increase the reserve (parent
+ * is connected to mem_reserve_root) and there is no memory to do so.
+ *
+ * The child is _NOT_ connected on error.
+ */
+int mem_reserve_connect(struct mem_reserve *new_child,
+                       struct mem_reserve *node)
+{
+       int ret;
+
+       WARN_ON(!new_child->name);
+
+       mutex_lock(&mem_reserve_mutex);
+       new_child->parent = node;
+       list_add(&new_child->siblings, &node->children);
+       ret = __mem_reserve_add(node, new_child->pages, new_child->limit);
+       if (ret) {
+               new_child->parent = NULL;
+               list_del_init(&new_child->siblings);
+       }
+       mutex_unlock(&mem_reserve_mutex);
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_connect);
+
+/**
+ * mem_reserve_disconnect - sever a node's connection to the reserve tree
+ * @node - the node to disconnect
+ *
+ * Could, in theory, return -ENOMEM, but since disconnecting a node _should_
+ * only decrease the reserves, that _should_ not happen.
+ */
+int mem_reserve_disconnect(struct mem_reserve *node)
+{
+       int ret;
+
+       BUG_ON(!node->parent);
+
+       mutex_lock(&mem_reserve_mutex);
+       ret = __mem_reserve_add(node->parent, -node->pages, -node->limit);
+       if (!ret) {
+               node->parent = NULL;
+               list_del_init(&node->siblings);
+       }
+       mutex_unlock(&mem_reserve_mutex);
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_disconnect);
+
+#ifdef CONFIG_PROC_FS
+
+/*
+ * Simple output of the reserve tree in: /proc/reserve_info
+ * Example:
+ *
+ * localhost ~ # cat /proc/reserve_info
+ * total reserve                  8156K (0/544817)
+ *   total network reserve          8156K (0/544817)
+ *     network TX reserve             196K (0/49)
+ *       protocol TX pages              196K (0/49)
+ *     network RX reserve             7960K (0/544768)
+ *       IPv6 route cache               1372K (0/4096)
+ *       IPv4 route cache               5468K (0/16384)
+ *       SKB data reserve               1120K (0/524288)
+ *         IPv6 fragment cache            560K (0/262144)
+ *         IPv4 fragment cache            560K (0/262144)
+ */
+
+static void mem_reserve_show_item(struct seq_file *m, struct mem_reserve *res,
+                                 int nesting)
+{
+       int i;
+       struct mem_reserve *child;
+
+       for (i = 0; i < nesting; i++)
+               seq_puts(m, "  ");
+
+       seq_printf(m, "%-30s %ldK (%ld/%ld)\n",
+                  res->name, res->pages << (PAGE_SHIFT - 10),
+                  res->usage, res->limit);
+
+       list_for_each_entry(child, &res->children, siblings)
+               mem_reserve_show_item(m, child, nesting+1);
+}
+
+static int mem_reserve_show(struct seq_file *m, void *v)
+{
+       mutex_lock(&mem_reserve_mutex);
+       mem_reserve_show_item(m, &mem_reserve_root, 0);
+       mutex_unlock(&mem_reserve_mutex);
+
+       return 0;
+}
+
+static int mem_reserve_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, mem_reserve_show, NULL);
+}
+
+static const struct file_operations mem_reserve_operations = {
+       .open = mem_reserve_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static __init int mem_reserve_proc_init(void)
+{
+       struct proc_dir_entry *entry;
+
+       entry = create_proc_entry("reserve_info", S_IRUSR, NULL);
+       if (entry)
+               entry->proc_fops = &mem_reserve_operations;
+
+       return 0;
+}
+
+__initcall(mem_reserve_proc_init);
+
+#endif
+
+/*
+ * alloc_page helpers
+ */
+
+/**
+ * mem_reserve_pages_set - set a reserve's size in pages
+ * @res - reserve to set
+ * @pages - size in pages to set it to
+ *
+ * Returns -ENOMEM when it fails to set the reserve. On failure the old size
+ * is preserved.
+ */
+int mem_reserve_pages_set(struct mem_reserve *res, long pages)
+{
+       int ret;
+
+       mutex_lock(&mem_reserve_mutex);
+       pages -= res->pages;
+       ret = __mem_reserve_add(res, pages, pages);
+       mutex_unlock(&mem_reserve_mutex);
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_pages_set);
+
+/**
+ * mem_reserve_pages_add - change the size in a relative way
+ * @res - reserve to change
+ * @pages - number of pages to add (or subtract when negative)
+ *
+ * Similar to mem_reserve_pages_set, except that the argument is relative
+ * instead of absolute.
+ *
+ * Returns -ENOMEM when it fails to increase.
+ */
+int mem_reserve_pages_add(struct mem_reserve *res, long pages)
+{
+       int ret;
+
+       mutex_lock(&mem_reserve_mutex);
+       ret = __mem_reserve_add(res, pages, pages);
+       mutex_unlock(&mem_reserve_mutex);
+
+       return ret;
+}
+
+/**
+ * mem_reserve_pages_charge - charge page usage to a reserve
+ * @res - reserve to charge
+ * @pages - size to charge
+ * @overcommit - disregard the usage limit (use with caution!)
+ *
+ * Returns non-zero on success.
+ */
+int mem_reserve_pages_charge(struct mem_reserve *res, long pages,
+                            int overcommit)
+{
+       return __mem_reserve_charge(res, pages, overcommit);
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_pages_charge);
+
+/*
+ * kmalloc helpers
+ */
+
+/**
+ * mem_reserve_kmalloc_set - set reserve to @bytes worth of kmalloc memory
+ * @res - reserve to change
+ * @bytes - size in bytes to reserve
+ *
+ * Returns -ENOMEM on failure.
+ */
+int mem_reserve_kmalloc_set(struct mem_reserve *res, long bytes)
+{
+       int ret;
+       long pages;
+
+       mutex_lock(&mem_reserve_mutex);
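+       /* estimated number of pages needed to back @bytes of kmalloc() */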
+       pages = kestimate(GFP_ATOMIC, bytes);
+       pages -= res->pages;
+       bytes -= res->limit;
+       ret = __mem_reserve_add(res, pages, bytes);
+       mutex_unlock(&mem_reserve_mutex);
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_set);
+
+/**
+ * mem_reserve_kmalloc_charge - charge bytes to a reserve
+ * @res - reserve to charge
+ * @bytes - bytes to charge
+ * @overcommit - disregard the usage limit (use with caution!)
+ *
+ * Returns non-zero on success.
+ */
+int mem_reserve_kmalloc_charge(struct mem_reserve *res, long bytes,
+                              int overcommit)
+{
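+       /*
+        * kmalloc() allocations come from power-of-two sized caches (for
+        * most sizes), so charge the rounded-up size; round uncharges the
+        * same way so charge and uncharge balance.
+        */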
+       if (bytes < 0)
+               bytes = -roundup_pow_of_two(-bytes);
+       else
+               bytes = roundup_pow_of_two(bytes);
+
+       return __mem_reserve_charge(res, bytes, overcommit);
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_kmalloc_charge);
+
+/*
+ * kmem_cache helpers
+ */
+
+/**
+ * mem_reserve_kmem_cache_set - set reserve to @objects worth of kmem_cache_alloc of @s
+ * @res - reserve to set
+ * @s - kmem_cache to reserve from
+ * @objects - number of objects to reserve
+ *
+ * Returns -ENOMEM on failure.
+ */
+int mem_reserve_kmem_cache_set(struct mem_reserve *res, struct kmem_cache *s,
+                              int objects)
+{
+       int ret;
+       long pages;
+
+       mutex_lock(&mem_reserve_mutex);
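+       /* estimated number of pages needed to back @objects from cache @s */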
+       pages = kmem_estimate_pages(s, GFP_ATOMIC, objects);
+       pages -= res->pages;
+       objects -= res->limit;
+       ret = __mem_reserve_add(res, pages, objects);
+       mutex_unlock(&mem_reserve_mutex);
+
+       return ret;
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_set);
+
+/**
+ * mem_reserve_kmem_cache_charge - charge (or uncharge) usage of objs
+ * @res - reserve to charge
+ * @objs - objects to charge for
+ * @overcommit - disregard the usage limit (use with caution!)
+ *
+ * Returns non-zero on success.
+ */
+int mem_reserve_kmem_cache_charge(struct mem_reserve *res, long objs,
+                                 int overcommit)
+{
+       return __mem_reserve_charge(res, objs, overcommit);
+}
+
+EXPORT_SYMBOL_GPL(mem_reserve_kmem_cache_charge);
