> On Sat, 2007-06-23 at 08:25 -0400, Steven Rostedt wrote:
>
>> --- linux-2.6.21.orig/net/ipv4/tcp_ipv4.c 2007-06-17
>> 17:19:02.000000000 +0200
>> +++ linux-2.6.21/net/ipv4/tcp_ipv4.c 2007-06-17 17:20:27.000000000 +0200
>> @@ -2033,7 +2033,12 @@ static void *established_get_first(struc
>> struct tcp_iter_state* st = seq->private;
>> void *rc = NULL;
>>
>> - for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size;
>> ++st->bucket) {
>>
>> The above is a linear search through out a very large array, where most
>> of the items are NULL. I believe it was Lee that noticed this creating
>> a large latency. This was back in 2.6.14. I'll check to see if this
>> still is a source of latency with the latest kernels.
>
> It looks fairly generic, is it a latency that could be fix the same way
> up stream?
>
> Daniel
>
Hi Steven,
I believe that we should push the following patch to upstream(netdev).
(This is a patch which extracted a part of the hash code from real-time patch.)
How do you think?
Masayuki Nakagawa
Index: linux-2.6/include/net/inet_hashtables.h
===================================================================
--- linux-2.6.orig/include/net/inet_hashtables.h
+++ linux-2.6/include/net/inet_hashtables.h
@@ -101,6 +101,7 @@ struct inet_hashinfo {
* TIME_WAIT sockets use a separate chain (twchain).
*/
struct inet_ehash_bucket *ehash;
+ unsigned long *ebitmask;
/* Ok, let's try this, I give up, we do need a local binding
* TCP hash as well as the others for fast bind/connect.
@@ -135,6 +136,12 @@ static inline struct inet_ehash_bucket *
return &hashinfo->ehash[hash & (hashinfo->ehash_size - 1)];
}
+static inline unsigned int inet_ehash_index(
+ struct inet_hashinfo *hashinfo, unsigned int hash)
+{
+ return hash & (hashinfo->ehash_size - 1);
+}
+
extern struct inet_bind_bucket *
inet_bind_bucket_create(struct kmem_cache *cachep,
struct inet_bind_hashbucket *head,
@@ -207,11 +214,27 @@ static inline void inet_listen_unlock(st
wake_up(&hashinfo->lhash_wait);
}
+static inline void __inet_hash_setbit(unsigned long *bitmask,
+ unsigned int index)
+{
+ if (bitmask)
+ set_bit(index, bitmask);
+}
+
+static inline void __inet_hash_clearbit(unsigned long *bitmask,
+ unsigned int index)
+{
+ if (bitmask)
+ clear_bit(index, bitmask);
+}
+
static inline void __inet_hash(struct inet_hashinfo *hashinfo,
struct sock *sk, const int listen_possible)
{
struct hlist_head *list;
rwlock_t *lock;
+ unsigned long *bitmask = NULL;
+ unsigned int index = 0;
BUG_TRAP(sk_unhashed(sk));
if (listen_possible && sk->sk_state == TCP_LISTEN) {
@@ -221,12 +244,15 @@ static inline void __inet_hash(struct in
} else {
struct inet_ehash_bucket *head;
sk->sk_hash = inet_sk_ehashfn(sk);
+ index = inet_ehash_index(hashinfo, sk->sk_hash);
head = inet_ehash_bucket(hashinfo, sk->sk_hash);
list = &head->chain;
lock = &head->lock;
+ bitmask = hashinfo->ebitmask;
write_lock(lock);
}
__sk_add_node(sk, list);
+ __inet_hash_setbit(bitmask, index);
sock_prot_inc_use(sk->sk_prot);
write_unlock(lock);
if (listen_possible && sk->sk_state == TCP_LISTEN)
@@ -245,6 +271,8 @@ static inline void inet_hash(struct inet
static inline void inet_unhash(struct inet_hashinfo *hashinfo, struct sock *sk)
{
rwlock_t *lock;
+ unsigned long *bitmask = NULL;
+ unsigned int index = 0;
if (sk_unhashed(sk))
goto out;
@@ -254,12 +282,16 @@ static inline void inet_unhash(struct in
inet_listen_wlock(hashinfo);
lock = &hashinfo->lhash_lock;
} else {
+ index = inet_ehash_index(hashinfo, sk->sk_hash);
lock = &inet_ehash_bucket(hashinfo, sk->sk_hash)->lock;
+ bitmask = hashinfo->ebitmask;
write_lock_bh(lock);
}
- if (__sk_del_node_init(sk))
+ if (__sk_del_node_init(sk)) {
+ __inet_hash_clearbit(bitmask, index);
sock_prot_dec_use(sk->sk_prot);
+ }
write_unlock_bh(lock);
out:
if (sk->sk_state == TCP_LISTEN)
Index: linux-2.6/mm/page_alloc.c
===================================================================
--- linux-2.6.orig/mm/page_alloc.c
+++ linux-2.6/mm/page_alloc.c
@@ -3919,6 +3919,33 @@ void *__init alloc_large_system_hash(con
return table;
}
+void *__init alloc_large_system_bitmask(char *bitmaskname,
+ unsigned long bits, int flags)
+{
+ unsigned long words = bits / (sizeof(unsigned long)*8);
+ unsigned long size = words * sizeof(unsigned long);
+ unsigned long *bitmask = NULL;
+
+ if (flags & HASH_EARLY)
+ bitmask = alloc_bootmem(size);
+ else if (hashdist)
+ bitmask = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
+ else {
+ bitmask = kmalloc(size, GFP_ATOMIC);
+ if (!bitmask) {
+ unsigned long order;
+ for (order = 0; ((1UL << order) << PAGE_SHIFT) < size;
order++)
+ ;
+ bitmask = (void*) __get_free_pages(GFP_ATOMIC, order);
+ }
+ }
+
+ if (!bitmask)
+ panic("Failed to allocate %s bitmask\n", bitmaskname);
+
+ return bitmask;
+}
+
#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
struct page *pfn_to_page(unsigned long pfn)
{
Index: linux-2.6/net/ipv4/tcp.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp.c
+++ linux-2.6/net/ipv4/tcp.c
@@ -2418,6 +2418,9 @@ static int __init set_thash_entries(char
}
__setup("thash_entries=", set_thash_entries);
+void *__init alloc_large_system_bitmask(char *bitmaskname,
+ unsigned long bits, int flags);
+
void __init tcp_init(void)
{
struct sk_buff *skb = NULL;
@@ -2449,6 +2452,10 @@ void __init tcp_init(void)
NULL,
0);
tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size;
+ tcp_hashinfo.ebitmask =
+ alloc_large_system_bitmask("TCP established",
+ tcp_hashinfo.ehash_size,
+ 0);
for (i = 0; i < tcp_hashinfo.ehash_size; i++) {
rwlock_init(&tcp_hashinfo.ehash[i].lock);
INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain);
Index: linux-2.6/net/ipv4/tcp_ipv4.c
===================================================================
--- linux-2.6.orig/net/ipv4/tcp_ipv4.c
+++ linux-2.6/net/ipv4/tcp_ipv4.c
@@ -2040,7 +2040,12 @@ static void *established_get_first(struc
struct tcp_iter_state* st = seq->private;
void *rc = NULL;
- for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size;
++st->bucket) {
+ for (st->bucket = find_first_bit(tcp_hashinfo.ebitmask,
+ tcp_hashinfo.ehash_size);
+ st->bucket < tcp_hashinfo.ehash_size;
+ st->bucket = find_next_bit(tcp_hashinfo.ebitmask,
+ tcp_hashinfo.ehash_size,
+ st->bucket+1)) {
struct sock *sk;
struct hlist_node *node;
struct inet_timewait_sock *tw;
Index: linux-2.6/net/ipv6/inet6_hashtables.c
===================================================================
--- linux-2.6.orig/net/ipv6/inet6_hashtables.c
+++ linux-2.6/net/ipv6/inet6_hashtables.c
@@ -27,6 +27,8 @@ void __inet6_hash(struct inet_hashinfo *
{
struct hlist_head *list;
rwlock_t *lock;
+ unsigned long *bitmask = NULL;
+ unsigned int index = 0;
BUG_TRAP(sk_unhashed(sk));
@@ -35,15 +37,16 @@ void __inet6_hash(struct inet_hashinfo *
lock = &hashinfo->lhash_lock;
inet_listen_wlock(hashinfo);
} else {
- unsigned int hash;
- sk->sk_hash = hash = inet6_sk_ehashfn(sk);
- hash &= (hashinfo->ehash_size - 1);
- list = &hashinfo->ehash[hash].chain;
- lock = &hashinfo->ehash[hash].lock;
+ sk->sk_hash = inet6_sk_ehashfn(sk);
+ index = inet_ehash_index(hashinfo, sk->sk_hash);
+ list = &hashinfo->ehash[index].chain;
+ lock = &hashinfo->ehash[index].lock;
+ bitmask = hashinfo->ebitmask;
write_lock(lock);
}
__sk_add_node(sk, list);
+ __inet_hash_setbit(bitmask, index);
sock_prot_inc_use(sk->sk_prot);
write_unlock(lock);
}
-
To unsubscribe from this list: send the line "unsubscribe linux-rt-users" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html