On 01.06.2015 15:46, Kirill Tkhai wrote:
> On Mon, 01/06/2015 at 15:16 +0300, Vasily Averin wrote:
>> On 01.06.2015 14:31, Kirill Tkhai wrote:
>>> On Fri, 29/05/2015 at 17:20 +0300, Andrew Vagin wrote:
>>>> Acked-by: Andrew Vagin <[email protected]>
>>>>
>>>> I agree that we need to remove this function, but I don't know how it
>>>> fixes the bug.
>>>
>>> tcp_v4_kill_ve_sockets() is called when all processes have been killed
>>> and their sockets have already been replaced with time wait sockets.
>>>
>>> So, when we iterate through the chain list:
>>>
>>> sk_nulls_for_each(sk, node, &head[i].chain)
>>>
>>> we can't find TCP sockets there. Every sk is of type inet_timewait_sock.
>>> Yes, both types are linked into the same table.
>>>
>>> struct sock (tcp_sock) and inet_timewait_sock are not a child and a parent:
>>>
>>> struct sock {                              struct inet_timewait_sock {
>>>         struct sock_common __sk_common;            struct sock_common __tw_common;
>>>         socket_lock_t      sk_lock;                int                tw_timeout;
>>>
>>> They are different.
>>>
>>> So, when we do bh_lock_sock(sk) in tcp_v4_kill_ve_sockets(), i.e.
>>>
>>> #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock))
>>>
>>> we are actually manipulating tw_timeout, which is not zero, so we can't
>>> acquire the spinlock.
>>
>> It's a nice theory, but I still do not understand how it is related to
>> the observed crash.
>>
>> I did not find such a process in the crash dump of PSBM-33755.
>> The kernel crashed in another function, not on access to the spinlock,
>> and it happened 50 seconds after the stop of CT 106.
>> How can you explain that?
>
> You dereference a time wait socket in tcp_v4_kill_ve_sockets() and use it
> as a tcp socket. Starting from bh_lock_sock() and further in
> tcp_kill_ve_onesk() you modify the memory of a tw socket as if it were a
> tcp_sk, and this leads to memory corruption.
>
> Another problem is that you can't install a time wait socket for a time
> wait socket. When you do that, you replace the old tw sock with a new one
> in the hashtable, but the old one remains linked into the death row via
> tw_death_node.
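To make the overlap concrete, here is a minimal userspace sketch (simplified
stand-ins, not the real kernel definitions) showing that the bytes at the
offset of sk_lock in struct sock are occupied by tw_timeout in
inet_timewait_sock, which is why spinning on the "lock" of a misidentified
tw socket spins on a non-zero timeout value:

	#include <stdio.h>
	#include <stddef.h>

	struct sock_common { int skc_state; };		/* shared header, simplified */

	struct sock {					/* stand-in for the tcp-side layout */
		struct sock_common __sk_common;
		int sk_lock_slock;			/* bh_lock_sock() spins here */
	};

	struct inet_timewait_sock {			/* stand-in for the tw-side layout */
		struct sock_common __tw_common;
		int tw_timeout;				/* non-zero for a live tw sock */
	};

	int main(void)
	{
		/* The two members share an offset: "locking" a tw socket
		 * that was cast to struct sock really touches tw_timeout. */
		printf("sk_lock at %zu, tw_timeout at %zu\n",
		       offsetof(struct sock, sk_lock_slock),
		       offsetof(struct inet_timewait_sock, tw_timeout));
		return 0;
	}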
Why do you think that tcp_v4_kill_ve_sockets() found a time wait socket?
As far as I understand, time wait sockets should be included into
inet_ehash_bucket.twchain, but tcp_v4_kill_ve_sockets() processes
inet_ehash_bucket.chain. Could you please explain how time wait sockets
were added to inet_ehash_bucket.chain?

>>>> On Fri, May 29, 2015 at 04:53:39PM +0300, Kirill Tkhai wrote:
>>>>> This is a leftover from earlier versions of PCS,
>>>>> and we do not need that functionality in 3.10.
>>>>>
>>>>> It was the cause of the kernel panic in 2.6.32:
>>>>> https://jira.sw.ru/browse/PSBM-33755, in the test
>>>>> of forced CT destroying.
>>>>>
>>>>> Also, tcp_v4_kill_ve_sockets() looks like the only
>>>>> user of synchronize_net(), so we don't need that
>>>>> call either (the only net hook in the chain,
>>>>> fini_venet_acct_ip6_stat(), doesn't need it). Besides
>>>>> this, there will be one more synchronize_rcu() later
>>>>> in ve_drop_context().
>>>>>
>>>>> Signed-off-by: Kirill Tkhai <[email protected]>
>>>>> ---
>>>>>  include/linux/ve_proto.h |    5 +--
>>>>>  kernel/ve/ve.c           |    5 +--
>>>>>  net/ipv4/tcp_ipv4.c      |   88 ----------------------------------------------
>>>>>  3 files changed, 2 insertions(+), 96 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/ve_proto.h b/include/linux/ve_proto.h
>>>>> index d491ab0..e77d6f9 100644
>>>>> --- a/include/linux/ve_proto.h
>>>>> +++ b/include/linux/ve_proto.h
>>>>> @@ -50,12 +50,9 @@ typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *);
>>>>>  void vzmon_register_veaddr_print_cb(ve_seq_print_t);
>>>>>  void vzmon_unregister_veaddr_print_cb(ve_seq_print_t);
>>>>>  
>>>>> -#ifdef CONFIG_INET
>>>>> -void tcp_v4_kill_ve_sockets(struct ve_struct *envid);
>>>>> -#ifdef CONFIG_VE_NETDEV
>>>>> +#if defined(CONFIG_INET) && defined(CONFIG_VE_NETDEV)
>>>>>  int venet_init(void);
>>>>>  #endif
>>>>> -#endif
>>>>>  
>>>>>  extern struct list_head ve_list_head;
>>>>>  #define for_each_ve(ve) list_for_each_entry((ve), &ve_list_head, ve_list)
>>>>> diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
>>>>> index 55b7d86..3128fdb 100644
>>>>> --- a/kernel/ve/ve.c
>>>>> +++ b/kernel/ve/ve.c
>>>>> @@ -647,10 +647,7 @@ void ve_exit_ns(struct pid_namespace *pid_ns)
>>>>>  
>>>>>  	down_write(&ve->op_sem);
>>>>>  	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
>>>>> -#ifdef CONFIG_INET
>>>>> -	tcp_v4_kill_ve_sockets(ve);
>>>>> -	synchronize_net();
>>>>> -#endif
>>>>> +
>>>>>  	ve_list_del(ve);
>>>>>  	ve_drop_context(ve);
>>>>>  	up_write(&ve->op_sem);
>>>>> diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
>>>>> index ad4520c..a8ef57a 100644
>>>>> --- a/net/ipv4/tcp_ipv4.c
>>>>> +++ b/net/ipv4/tcp_ipv4.c
>>>>> @@ -2812,91 +2812,3 @@ void __init tcp_v4_init(void)
>>>>>  	if (register_pernet_subsys(&tcp_sk_ops))
>>>>>  		panic("Failed to create the TCP control socket.\n");
>>>>>  }
>>>>> -
>>>>> -#ifdef CONFIG_VE
>>>>> -static void tcp_kill_ve_onesk(struct sock *sk)
>>>>> -{
>>>>> -	struct tcp_sock *tp = tcp_sk(sk);
>>>>> -
>>>>> -	/* Check the assumed state of the socket. */
>>>>> -	if (!sock_flag(sk, SOCK_DEAD)) {
>>>>> -		printk(KERN_WARNING "Killing sk: dead %d, state %d, "
>>>>> -		       "wrseq %u unseq %u, wrqu %d.\n",
>>>>> -		       sock_flag(sk, SOCK_DEAD), sk->sk_state,
>>>>> -		       tp->write_seq, tp->snd_una,
>>>>> -		       !skb_queue_empty(&sk->sk_write_queue));
>>>>> -		sk->sk_err = ECONNRESET;
>>>>> -		sk->sk_error_report(sk);
>>>>> -	}
>>>>> -
>>>>> -	tcp_send_active_reset(sk, GFP_ATOMIC);
>>>>> -	switch (sk->sk_state) {
>>>>> -	case TCP_FIN_WAIT1:
>>>>> -	case TCP_CLOSING:
>>>>> -		/* In these 2 states the peer may want us to retransmit
>>>>> -		 * some data and/or FIN. Entering "resetting mode"
>>>>> -		 * instead.
>>>>> -		 */
>>>>> -		tcp_time_wait(sk, TCP_CLOSE, 0);
>>>>> -		break;
>>>>> -	case TCP_FIN_WAIT2:
>>>>> -		/* By some reason the socket may stay in this state
>>>>> -		 * without turning into a TW bucket. Fix it.
>>>>> -		 */
>>>>> -		tcp_time_wait(sk, TCP_FIN_WAIT2, 0);
>>>>> -		break;
>>>>> -	default:
>>>>> -		/* Just jump into CLOSED state. */
>>>>> -		tcp_done(sk);
>>>>> -		break;
>>>>> -	}
>>>>> -}
>>>>> -
>>>>> -void tcp_v4_kill_ve_sockets(struct ve_struct *envid)
>>>>> -{
>>>>> -	struct inet_ehash_bucket *head;
>>>>> -	int i, retry;
>>>>> -
>>>>> -	/* alive */
>>>>> -again:
>>>>> -	retry = 0;
>>>>> -	local_bh_disable();
>>>>> -	head = tcp_hashinfo.ehash;
>>>>> -	for (i = 0; i < tcp_hashinfo.ehash_mask; i++) {
>>>>> -		struct sock *sk;
>>>>> -		struct hlist_nulls_node *node;
>>>>> -		spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, i);
>>>>> -more_work:
>>>>> -		spin_lock(lock);
>>>>> -		sk_nulls_for_each(sk, node, &head[i].chain) {
>>>>> -			if (sock_net(sk)->owner_ve == envid) {
>>>>> -				sock_hold(sk);
>>>>> -				spin_unlock(lock);
>>>>> -
>>>>> -				bh_lock_sock(sk);
>>>>> -				if (sock_owned_by_user(sk)) {
>>>>> -					retry = 1;
>>>>> -					bh_unlock_sock(sk);
>>>>> -					sock_put(sk);
>>>>> -					goto enable_bh;
>>>>> -				}
>>>>> -				/* sk might have disappeared from the hash before
>>>>> -				 * we got the lock */
>>>>> -				if (sk->sk_state != TCP_CLOSE)
>>>>> -					tcp_kill_ve_onesk(sk);
>>>>> -				bh_unlock_sock(sk);
>>>>> -				sock_put(sk);
>>>>> -				goto more_work;
>>>>> -			}
>>>>> -		}
>>>>> -		spin_unlock(lock);
>>>>> -	}
>>>>> -enable_bh:
>>>>> -	local_bh_enable();
>>>>> -	if (retry) {
>>>>> -		schedule_timeout_interruptible(HZ);
>>>>> -		goto again;
>>>>> -	}
>>>>> -}
>>>>> -EXPORT_SYMBOL(tcp_v4_kill_ve_sockets);
>>>>> -#endif
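If Kirill's reading is right and time wait sockets do show up in the chain
being walked, the removed loop would have needed to check an entry's type
before treating it as a full socket. A minimal userspace sketch of that
defensive pattern (hypothetical, reusing the same simplified stand-in types
as above; only the TCP state values match the kernel's):

	#include <stdio.h>

	enum { TCP_ESTABLISHED = 1, TCP_TIME_WAIT = 6 };	/* kernel values */

	struct sock_common { int skc_state; };	/* shared header, simplified */

	struct sock { struct sock_common c; int sk_lock_slock; };
	struct inet_timewait_sock { struct sock_common c; int tw_timeout; };

	/* Walk entries via the shared header and only downcast when the
	 * state says the entry really is a full socket. */
	static void visit(struct sock_common *e)
	{
		if (e->skc_state == TCP_TIME_WAIT) {
			printf("tw socket: skipped, would be corrupted as a tcp_sk\n");
			return;
		}
		printf("full socket: safe to lock (slock = %d)\n",
		       ((struct sock *)e)->sk_lock_slock);
	}

	int main(void)
	{
		struct sock sk = { { TCP_ESTABLISHED }, 0 };
		struct inet_timewait_sock tw = { { TCP_TIME_WAIT }, 60 };

		visit(&sk.c);
		visit(&tw.c);
		return 0;
	}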
_______________________________________________
Devel mailing list
[email protected]
https://lists.openvz.org/mailman/listinfo/devel