[XFRM]: Dynamic xfrm_state hash table sizing.

The grow algorithm is simple, we grow if:

1) we see a hash chain collision at insert, and
2) we haven't hit the hash size limit (currently 1*1024*1024 slots), and
3) the number of xfrm_state objects is > the current hash mask

All of this needs some tweaking.

Remove __initdata from "hashdist" so we can use it safely at run time.

Signed-off-by: David S. Miller <[EMAIL PROTECTED]>
---
 include/linux/bootmem.h |    2 
 mm/page_alloc.c         |    2 
 net/xfrm/xfrm_state.c   |  200 ++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 165 insertions(+), 39 deletions(-)

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 1021f50..e319c64 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -114,7 +114,7 @@ #define HASHDIST_DEFAULT 1
 #else
 #define HASHDIST_DEFAULT 0
 #endif
-extern int __initdata hashdist;                /* Distribute hashes across 
NUMA nodes? */
+extern int hashdist;           /* Distribute hashes across NUMA nodes? */
 
 
 #endif /* _LINUX_BOOTMEM_H */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 54a4f53..3b5358a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2363,7 +2363,7 @@ int percpu_pagelist_fraction_sysctl_hand
        return 0;
 }
 
-__initdata int hashdist = HASHDIST_DEFAULT;
+int hashdist = HASHDIST_DEFAULT;
 
 #ifdef CONFIG_NUMA
 static int __init set_hashdist(char *str)
diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index 724479f..af3f23e 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -18,6 +18,9 @@ #include <net/xfrm.h>
 #include <linux/pfkeyv2.h>
 #include <linux/ipsec.h>
 #include <linux/module.h>
+#include <linux/bootmem.h>
+#include <linux/vmalloc.h>
+#include <linux/cache.h>
 #include <asm/uaccess.h>
 
 struct sock *xfrm_nl;
@@ -38,77 +41,188 @@ EXPORT_SYMBOL(sysctl_xfrm_aevent_rseqth)
 
 static DEFINE_SPINLOCK(xfrm_state_lock);
 
-#define XFRM_DST_HSIZE         1024
-
 /* Hash table to find appropriate SA towards given target (endpoint
  * of tunnel or destination of transport mode) allowed by selector.
  *
  * Main use is finding SA after policy selected tunnel or transport mode.
  * Also, it can be used by ah/esp icmp error handler to find offending SA.
  */
-static struct hlist_head xfrm_state_bydst[XFRM_DST_HSIZE];
-static struct hlist_head xfrm_state_byspi[XFRM_DST_HSIZE];
+static struct hlist_head *xfrm_state_bydst __read_mostly;
+static struct hlist_head *xfrm_state_byspi __read_mostly;
+static unsigned int xfrm_state_hmask __read_mostly;
+static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
+static unsigned int xfrm_state_num;
 
-static __inline__
-unsigned __xfrm4_dst_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm4_dst_hash(xfrm_address_t *addr, unsigned int 
hmask)
 {
-       unsigned h;
+       unsigned int h;
        h = ntohl(addr->a4);
-       h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
+       h = (h ^ (h>>16)) & xfrm_state_hmask;
        return h;
 }
 
-static __inline__
-unsigned __xfrm6_dst_hash(xfrm_address_t *addr)
+static inline unsigned int __xfrm6_dst_hash(xfrm_address_t *addr, unsigned int 
hmask)
 {
-       unsigned h;
+       unsigned int h;
        h = ntohl(addr->a6[2]^addr->a6[3]);
-       h = (h ^ (h>>16)) % XFRM_DST_HSIZE;
+       h = (h ^ (h>>16)) & hmask;
        return h;
 }
 
-static __inline__
-unsigned xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
+static inline unsigned int __xfrm_dst_hash(xfrm_address_t *addr, unsigned 
short family,
+                                          unsigned int hmask)
 {
        switch (family) {
        case AF_INET:
-               return __xfrm4_dst_hash(addr);
+               return __xfrm4_dst_hash(addr, hmask);
        case AF_INET6:
-               return __xfrm6_dst_hash(addr);
+               return __xfrm6_dst_hash(addr, hmask);
        }
        return 0;
 }
 
-static __inline__
-unsigned __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
+static inline unsigned int xfrm_dst_hash(xfrm_address_t *addr, unsigned short 
family)
+{
+       return __xfrm_dst_hash(addr, family, xfrm_state_hmask);
+}
+
+static inline unsigned int __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 
proto,
+                                           unsigned int hmask)
 {
-       unsigned h;
+       unsigned int h;
        h = ntohl(addr->a4^spi^proto);
-       h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
+       h = (h ^ (h>>10) ^ (h>>20)) & hmask;
        return h;
 }
 
-static __inline__
-unsigned __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto)
+static inline unsigned int __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 
proto,
+                                           unsigned int hmask)
 {
-       unsigned h;
+       unsigned int h;
        h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto);
-       h = (h ^ (h>>10) ^ (h>>20)) % XFRM_DST_HSIZE;
+       h = (h ^ (h>>10) ^ (h>>20)) & hmask;
        return h;
 }
 
-static __inline__
-unsigned xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short 
family)
+static inline unsigned int
+__xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family,
+               unsigned int hmask)
 {
        switch (family) {
        case AF_INET:
-               return __xfrm4_spi_hash(addr, spi, proto);
+               return __xfrm4_spi_hash(addr, spi, proto, hmask);
        case AF_INET6:
-               return __xfrm6_spi_hash(addr, spi, proto);
+               return __xfrm6_spi_hash(addr, spi, proto, hmask);
        }
        return 0;       /*XXX*/
 }
 
+static inline unsigned int
+xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
+{
+       return __xfrm_spi_hash(addr, spi, proto, family, xfrm_state_hmask);
+}
+
+static struct hlist_head *xfrm_state_hash_alloc(unsigned int sz)
+{
+       struct hlist_head *n;
+
+       if (sz <= PAGE_SIZE)
+               n = kmalloc(sz, GFP_KERNEL);
+       else if (hashdist)
+               n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
+       else
+               n = (struct hlist_head *)
+                       __get_free_pages(GFP_KERNEL, get_order(sz));
+
+       if (n)
+               memset(n, 0, sz);
+
+       return n;
+}
+
+static void xfrm_state_hash_free(struct hlist_head *n, unsigned int sz)
+{
+       if (sz <= PAGE_SIZE)
+               kfree(n);
+       else if (hashdist)
+               vfree(n);
+       else
+               free_pages((unsigned long)n, get_order(sz));
+}
+
+static void xfrm_hash_transfer(struct hlist_head *list,
+                              struct hlist_head *ndsttable,
+                              struct hlist_head *nspitable,
+                              unsigned int nhashmask)
+{
+       struct hlist_node *entry, *tmp;
+       struct xfrm_state *x;
+
+       hlist_for_each_entry_safe(x, entry, tmp, list, bydst) {
+               unsigned int h;
+
+               h = __xfrm_dst_hash(&x->id.daddr, x->props.family, nhashmask);
+               hlist_add_head(&x->bydst, ndsttable+h);
+
+               h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto,
+                                   x->props.family, nhashmask);
+               hlist_add_head(&x->byspi, nspitable+h);
+       }
+}
+
+static unsigned long xfrm_hash_new_size(void)
+{
+       return ((xfrm_state_hmask + 1) << 1) *
+               sizeof(struct hlist_head);
+}
+
+static DEFINE_MUTEX(hash_resize_mutex);
+
+static void xfrm_hash_resize(void *__unused)
+{
+       struct hlist_head *ndst, *nspi, *odst, *ospi;
+       unsigned long nsize;
+       unsigned int nhashmask, ohashmask;
+       int i;
+
+       mutex_lock(&hash_resize_mutex);
+
+       nsize = xfrm_hash_new_size();
+       ndst = xfrm_state_hash_alloc(nsize);
+       if (!ndst)
+               goto out_unlock;
+       nspi = xfrm_state_hash_alloc(nsize);
+       if (!nspi) {
+               xfrm_state_hash_free(ndst, nsize);
+               goto out_unlock;
+       }
+
+       spin_lock_bh(&xfrm_state_lock);
+
+       nhashmask = (nsize / sizeof(struct hlist_head)) - 1U;
+       for (i = xfrm_state_hmask; i >= 0; i--)
+               xfrm_hash_transfer(xfrm_state_bydst+i, ndst, nspi, nhashmask);
+
+       odst = xfrm_state_bydst;
+       ospi = xfrm_state_byspi;
+       ohashmask = xfrm_state_hmask;
+
+       xfrm_state_bydst = ndst;
+       xfrm_state_byspi = nspi;
+       xfrm_state_hmask = nhashmask;
+
+       spin_unlock_bh(&xfrm_state_lock);
+
+       xfrm_state_hash_free(odst, (ohashmask + 1) * sizeof(struct hlist_head));
+       xfrm_state_hash_free(ospi, (ohashmask + 1) * sizeof(struct hlist_head));
+
+out_unlock:
+       mutex_unlock(&hash_resize_mutex);
+}
+
+static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize, NULL);
+
 DECLARE_WAIT_QUEUE_HEAD(km_waitq);
 EXPORT_SYMBOL(km_waitq);
 
@@ -306,6 +420,7 @@ int __xfrm_state_delete(struct xfrm_stat
                        hlist_del(&x->byspi);
                        __xfrm_state_put(x);
                }
+               xfrm_state_num--;
                spin_unlock(&xfrm_state_lock);
                if (del_timer(&x->timer))
                        __xfrm_state_put(x);
@@ -351,7 +466,7 @@ void xfrm_state_flush(u8 proto)
        int i;
 
        spin_lock_bh(&xfrm_state_lock);
-       for (i = 0; i < XFRM_DST_HSIZE; i++) {
+       for (i = 0; i <= xfrm_state_hmask; i++) {
                struct hlist_node *entry;
                struct xfrm_state *x;
 restart:
@@ -541,6 +656,13 @@ static void __xfrm_state_insert(struct x
                xfrm_state_hold(x);
 
        wake_up(&km_waitq);
+
+       xfrm_state_num++;
+
+       if (x->bydst.next != NULL &&
+           (xfrm_state_hmask + 1) < xfrm_state_hashmax &&
+           xfrm_state_num > xfrm_state_hmask)
+               schedule_work(&xfrm_hash_work);
 }
 
 void xfrm_state_insert(struct xfrm_state *x)
@@ -828,7 +950,7 @@ static struct xfrm_state *__xfrm_find_ac
 {
        int i;
 
-       for (i = 0; i < XFRM_DST_HSIZE; i++) {
+       for (i = 0; i <= xfrm_state_hmask; i++) {
                struct hlist_node *entry;
                struct xfrm_state *x;
 
@@ -917,7 +1039,7 @@ int xfrm_state_walk(u8 proto, int (*func
        int err = 0;
 
        spin_lock_bh(&xfrm_state_lock);
-       for (i = 0; i < XFRM_DST_HSIZE; i++) {
+       for (i = 0; i <= xfrm_state_hmask; i++) {
                hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
                        if (proto == IPSEC_PROTO_ANY || x->id.proto == proto)
                                count++;
@@ -928,7 +1050,7 @@ int xfrm_state_walk(u8 proto, int (*func
                goto out;
        }
 
-       for (i = 0; i < XFRM_DST_HSIZE; i++) {
+       for (i = 0; i <= xfrm_state_hmask; i++) {
                hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
                        if (proto != IPSEC_PROTO_ANY && x->id.proto != proto)
                                continue;
@@ -1355,12 +1477,16 @@ EXPORT_SYMBOL(xfrm_init_state);
  
 void __init xfrm_state_init(void)
 {
-       int i;
+       unsigned int sz;
+
+       sz = sizeof(struct hlist_head) * 8;
+
+       xfrm_state_bydst = xfrm_state_hash_alloc(sz);
+       xfrm_state_byspi = xfrm_state_hash_alloc(sz);
+       if (!xfrm_state_bydst || !xfrm_state_byspi)
+               panic("XFRM: Cannot allocate bydst/byspi hashes.");
+       xfrm_state_hmask = ((sz / sizeof(struct hlist_head)) - 1);
 
-       for (i=0; i<XFRM_DST_HSIZE; i++) {
-               INIT_HLIST_HEAD(&xfrm_state_bydst[i]);
-               INIT_HLIST_HEAD(&xfrm_state_byspi[i]);
-       }
        INIT_WORK(&xfrm_state_gc_work, xfrm_state_gc_task, NULL);
 }
 
-- 
1.4.2.rc2.g3e042

-
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to