> On Sat, 2007-06-23 at 08:25 -0400, Steven Rostedt wrote:
> 
>> --- linux-2.6.21.orig/net/ipv4/tcp_ipv4.c       2007-06-17 
>> 17:19:02.000000000 +0200
>> +++ linux-2.6.21/net/ipv4/tcp_ipv4.c    2007-06-17 17:20:27.000000000 +0200
>> @@ -2033,7 +2033,12 @@ static void *established_get_first(struc
>>         struct tcp_iter_state* st = seq->private;
>>         void *rc = NULL;
>> 
>> -       for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; 
>> ++st->bucket) {
>> 
>> The above is a linear search through out a very large array, where most
>> of the items are NULL.  I believe it was Lee that noticed this creating
>> a large latency. This was back in 2.6.14. I'll check to see if this
>> still is a source of latency with the latest kernels.
> 
> It looks fairly generic, is it a latency that could be fix the same way
> up stream?
> 
> Daniel
>
Hi Steven,

I believe that we should push the following patch upstream (to netdev).
(This is a patch which extracts part of the hash code from the real-time patch.)
What do you think?

Masayuki Nakagawa

Index: linux-2.6/include/net/inet_hashtables.h
===================================================================
--- linux-2.6.orig/include/net/inet_hashtables.h
+++ linux-2.6/include/net/inet_hashtables.h
@@ -101,6 +101,7 @@ struct inet_hashinfo {
         * TIME_WAIT sockets use a separate chain (twchain).
         */
        struct inet_ehash_bucket        *ehash;
+       unsigned long                   *ebitmask;

        /* Ok, let's try this, I give up, we do need a local binding
         * TCP hash as well as the others for fast bind/connect.
@@ -135,6 +136,12 @@ static inline struct inet_ehash_bucket *
        return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
 }

+static inline unsigned int inet_ehash_index(
+       struct inet_hashinfo *hashinfo, unsigned int hash)
+{
+       return hash & (hashinfo->ehash_size - 1);
+}
+
 extern struct inet_bind_bucket *
                    inet_bind_bucket_create(struct kmem_cache *cachep,
                                            struct inet_bind_hashbucket *head,
@@ -207,11 +214,27 @@ static inline void inet_listen_unlock(st
                wake_up(&hashinfo->lhash_wait);
 }

+static inline void __inet_hash_setbit(unsigned long *bitmask,
+       unsigned int index)
+{
+       if (bitmask)
+               set_bit(index, bitmask);
+}
+
+static inline void __inet_hash_clearbit(unsigned long *bitmask,
+       unsigned int index)
+{
+       if (bitmask)
+               clear_bit(index, bitmask);
+}
+
 static inline void __inet_hash(struct inet_hashinfo *hashinfo,
                               struct sock *sk, const int listen_possible)
 {
        struct hlist_head *list;
        rwlock_t *lock;
+       unsigned long *bitmask = NULL;
+       unsigned int index = 0;

        BUG_TRAP(sk_unhashed(sk));
        if (listen_possible && sk->sk_state == TCP_LISTEN) {
@@ -221,12 +244,15 @@ static inline void __inet_hash(struct in
        } else {
                struct inet_ehash_bucket *head;
                sk->sk_hash = inet_sk_ehashfn(sk);
+               index = inet_ehash_index(hashinfo, sk->sk_hash);
                head = inet_ehash_bucket(hashinfo, sk->sk_hash);
                list = &head->chain;
                lock = &head->lock;
+               bitmask = hashinfo->ebitmask;
                write_lock(lock);
        }
        __sk_add_node(sk, list);
+       __inet_hash_setbit(bitmask, index);
        sock_prot_inc_use(sk->sk_prot);
        write_unlock(lock);
        if (listen_possible && sk->sk_state == TCP_LISTEN)
@@ -245,6 +271,8 @@ static inline void inet_hash(struct inet
 static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
 {
        rwlock_t *lock;
+       unsigned long *bitmask = NULL;
+       unsigned int index = 0;

        if (sk_unhashed(sk))
                goto out;
@@ -254,12 +282,16 @@ static inline void inet_unhash(struct in
                inet_listen_wlock(hashinfo);
                lock = &hashinfo->lhash_lock;
        } else {
+               index = inet_ehash_index(hashinfo, sk->sk_hash);
                lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+               bitmask = hashinfo->ebitmask;
                write_lock_bh(lock);
        }

-       if (__sk_del_node_init(sk))
+       if (__sk_del_node_init(sk)) {
+               __inet_hash_clearbit(bitmask, index);
                sock_prot_dec_use(sk->sk_prot);
+       }
        write_unlock_bh(lock);
 out:
        if (sk->sk_state == TCP_LISTEN)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -3919,6 +3919,33 @@ void *__init alloc_large_system_hash(con
        return table;
 }

+void *__init alloc_large_system_bitmask(char *bitmaskname,
+                                       unsigned long bits, int flags)
+{
+       unsigned long words = bits / (sizeof(unsigned long)*8);
+       unsigned long size = words * sizeof(unsigned long);
+       unsigned long *bitmask = NULL;
+
+       if (flags & HASH_EARLY)
+               bitmask = alloc_bootmem(size);
+       else if (hashdist)
+               bitmask = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+       else {
+               bitmask = kmalloc(size, GFP_ATOMIC);
+               if (!bitmask) {
+                       unsigned long order;
+               for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
+                       ;
+                       bitmask = (void*) __get_free_pages(GFP_ATOMIC, order);
+               }
+       }
+
+       if (!bitmask)
+               panic("Failed to allocate %s bitmask\n", bitmaskname);
+
+       return bitmask;
+}
+
 #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
 struct page *pfn_to_page(unsigned long pfn)
 {
Index: linux-2.6/net/ipv4/tcp.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp.c
+++ linux-2.6/net/ipv4/tcp.c
@@ -2418,6 +2418,9 @@ static int __init set_thash_entries(char
 }
 __setup("thash_entries=", set_thash_entries);

+void *__init alloc_large_system_bitmask(char *bitmaskname,
+                                       unsigned long bits, int flags);
+
 void __init tcp_init(void)
 {
        struct sk_buff *skb = NULL;
@@ -2449,6 +2452,10 @@ void __init tcp_init(void)
                                        NULL,
                                        0);
        tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
+       tcp_hashinfo.ebitmask =
+               alloc_large_system_bitmask("TCP established",
+                                       tcp_hashinfo.ehash_size,
+                                       0);
        for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
                rwlock_init(&tcp_hashinfo.ehash[i].lock);
                INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Index: linux-2.6/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp_ipv4.c
+++ linux-2.6/net/ipv4/tcp_ipv4.c
@@ -2040,7 +2040,12 @@ static void *established_get_first(struc
        struct tcp_iter_state* st = seq->private;
        void *rc = NULL;

-       for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
+       for (st->bucket = find_first_bit(tcp_hashinfo.ebitmask,
+                                       tcp_hashinfo.ehash_size);
+               st->bucket < tcp_hashinfo.ehash_size;
+               st->bucket = find_next_bit(tcp_hashinfo.ebitmask,
+                                       tcp_hashinfo.ehash_size,
+                                       st->bucket+1)) {
                struct sock *sk;
                struct hlist_node *node;
                struct inet_timewait_sock *tw;
Index: linux-2.6/net/ipv6/inet6_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv6/inet6_hashtables.c
+++ linux-2.6/net/ipv6/inet6_hashtables.c
@@ -27,6 +27,8 @@ void __inet6_hash(struct inet_hashinfo *
 {
        struct hlist_head *list;
        rwlock_t *lock;
+       unsigned long *bitmask = NULL;
+       unsigned int index = 0;

        BUG_TRAP(sk_unhashed(sk));

@@ -35,15 +37,16 @@ void __inet6_hash(struct inet_hashinfo *
                lock = &hashinfo->lhash_lock;
                inet_listen_wlock(hashinfo);
        } else {
-               unsigned int hash;
-               sk->sk_hash = hash = inet6_sk_ehashfn(sk);
-               hash &= (hashinfo->ehash_size - 1);
-               list = &hashinfo->ehash[hash].chain;
-               lock = &hashinfo->ehash[hash].lock;
+               sk->sk_hash = inet6_sk_ehashfn(sk);
+               index = inet_ehash_index(hashinfo, sk->sk_hash);
+               list = &hashinfo->ehash[index].chain;
+               lock = &hashinfo->ehash[index].lock;
+               bitmask = hashinfo->ebitmask;
                write_lock(lock);
        }

        __sk_add_node(sk, list);
+       __inet_hash_setbit(bitmask, index);
        sock_prot_inc_use(sk->sk_prot);
        write_unlock(lock);
 }
-
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to