> -----Original Message-----
> From: Ying Xue [mailto:[email protected]]
> Sent: Monday, December 04, 2017 09:11
> To: [email protected]; Jon Maloy <[email protected]>
> Cc: [email protected]
> Subject: [net-next PATCH 2/2] tipc: fix race condition at topology server
> receive
>
> In commit 9dc3abdd1f7e ("tipc: fix nametbl_lock soft lockup at module exit"),
> tipc_conn_release() was moved from tipc_conn_kref_release() to
> tipc_close_conn() in order to solve nametbl_lock soft lockup at module exit.
>
> But it accidentally introduced a serious side effect: the lifecycles of the
> connection instance and its tipc_subscriber are no longer the same.
> In particular, tipc_close_conn() releases the corresponding struct
> tipc_subscriber instance without considering that there may still be work
> items in the receive work queue. When those are scheduled in the function
> tipc_receive_from_work(), they use the subscriber pointer stored in
> struct tipc_conn without first checking whether it is still valid. This will
> sometimes lead to crashes, as the next call of
> tipc_conn_recvmsg() will access the now deleted wokr item.
s/wokr/work/
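
To make the race easier to see, here is a minimal sketch of the pre-patch
vs. post-patch ordering. It is illustrative only, not the actual
net/tipc/server.c code: use_subscriber() and release_subscriber() are
hypothetical stand-ins for the subscriber receive path and the
s->tipc_conn_release() callback.

/* Illustrative sketch only -- not the real net/tipc/server.c code. */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

/* Hypothetical helpers standing in for the subscriber receive path and
 * the server's tipc_conn_release() callback, respectively.
 */
void use_subscriber(void *sub);
void release_subscriber(void *sub);

struct conn_sketch {
	struct kref kref;
	void *usr_data;           /* the struct tipc_subscriber in TIPC  */
	struct work_struct swork; /* send work                           */
	struct work_struct rwork; /* receive work, may still be queued   */
};

/* Receive work: runs later, on the workqueue. */
static void conn_rwork_fn(struct work_struct *work)
{
	struct conn_sketch *con = container_of(work, struct conn_sketch, rwork);

	/* Pre-patch: nothing guarantees usr_data is still allocated here,
	 * because tipc_close_conn() may already have released it.
	 */
	use_subscriber(con->usr_data);		/* potential use-after-free */
}

/* Pre-patch close path: releases the subscriber while rwork may still be
 * pending or running, so the lifecycles of con and usr_data diverge.
 */
static void close_conn_prepatch(struct conn_sketch *con)
{
	release_subscriber(con->usr_data);
}

/* Post-patch: the subscriber is released only from the kref release
 * function, i.e. after every queued work item has dropped its reference
 * (conn_put() -> kref_put(&con->kref, conn_kref_release_fn)), so rwork
 * can never observe a stale usr_data.
 */
static void conn_kref_release_fn(struct kref *kref)
{
	struct conn_sketch *con = container_of(kref, struct conn_sketch, kref);

	release_subscriber(con->usr_data);
	kfree(con);
}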
>
> If we move tipc_conn_release() back from tipc_close_conn() to
> tipc_conn_kref_release(), the crash is resolved because the lifecycle of the
> connection instance and its user data is always the same. But the soft
> lockup solved by commit 9dc3abdd1f would occur again. However, if we flush
> the pending rx/tx work items in tipc_topsrv_stop() context before the
> connection is closed by tipc_close_conn(), the connection instance is
> finally destroyed in the tipc_exit_net() context rather than in other
> contexts. As a result, the soft lockup can never happen again.
This one looks ok.
Acked-by: jon
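
For completeness, the ordering the stop path now relies on can be summarised
as below. Again a compressed sketch under the same hypothetical names as
above (conn_put_sketch() stands in for conn_put() dropping the last
reference in tipc_exit_net() context), not the actual patch code.

/* Hypothetical stand-in for conn_put(): drops the last reference so that
 * conn_kref_release_fn() above runs in the caller's (tipc_exit_net())
 * context.
 */
void conn_put_sketch(struct conn_sketch *con);

static void close_conn_sketch(struct conn_sketch *con, bool flush)
{
	if (flush) {
		/* flush == true only when called from tipc_server_stop(),
		 * i.e. we are guaranteed not to be running on the worker
		 * thread ourselves, so waiting here cannot deadlock.
		 */
		flush_work(&con->swork);
		flush_work(&con->rwork);
	}
	/* With all rx/tx work drained, the final reference is dropped in
	 * this context, which is what avoids re-introducing the
	 * nametbl_lock soft lockup fixed by 9dc3abdd1f7e.
	 */
	conn_put_sketch(con);
}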
>
> Fixes: 9dc3abdd1f7e ("tipc: fix nametbl_lock soft lockup at module exit")
> Signed-off-by: Ying Xue <[email protected]>
> ---
> net/tipc/server.c | 40 +++++++++++++++++++++-------------------
> 1 file changed, 21 insertions(+), 19 deletions(-)
>
> diff --git a/net/tipc/server.c b/net/tipc/server.c
> index 571e137..4baf1f7 100644
> --- a/net/tipc/server.c
> +++ b/net/tipc/server.c
> @@ -105,6 +105,8 @@ static void tipc_conn_kref_release(struct kref *kref)
> }
> saddr->scope = -TIPC_NODE_SCOPE;
> kernel_bind(sock, (struct sockaddr *)saddr, sizeof(*saddr));
> + if (con->conid)
> + s->tipc_conn_release(con->conid, con->usr_data);
> sock_release(sock);
> con->sock = NULL;
> }
> @@ -192,24 +194,24 @@ static void tipc_unregister_callbacks(struct tipc_conn *con)
> write_unlock_bh(&sk->sk_callback_lock);
> }
>
> -static void tipc_close_conn(struct tipc_conn *con)
> +static void tipc_close_conn(struct tipc_conn *con, bool flush)
> {
> - struct tipc_server *s = con->server;
> -
> if (test_and_clear_bit(CF_CONNECTED, &con->flags)) {
> - if (con->sock)
> + if (con->sock) {
> tipc_unregister_callbacks(con);
>
> - if (con->conid)
> - s->tipc_conn_release(con->conid, con->usr_data);
> -
> - /* We shouldn't flush pending works as we may be in the
> - * thread. In fact the races with pending rx/tx work structs
> - * are harmless for us here as we have already deleted this
> - * connection from server connection list.
> - */
> - if (con->sock)
> + /* We shouldn't flush pending works as we may
> + * be in the worker thread. But if flush is set,
> + * it means that we are not on worker thread
> + * context and it's safe for us to flush the
> + * pending rx/tx works.
> + */
> + if (flush) {
> + flush_work(&con->swork);
> + flush_work(&con->rwork);
> + }
> kernel_sock_shutdown(con->sock, SHUT_RDWR);
> + }
> conn_put(con);
> }
> }
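A side note on the new comment block above, with a tiny hypothetical example
of why flushing from the worker thread itself is forbidden (same sketch names
as earlier in the thread):

/* Hypothetical: if something like this ran inside conn_rwork_fn() itself,
 * flush_work() would wait for the work item that is currently executing
 * this very code, i.e. it would never return (self-deadlock). That is why
 * flush may only be requested from tipc_server_stop() context.
 */
static void bad_close_from_worker(struct conn_sketch *con)
{
	flush_work(&con->rwork);
}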
> @@ -280,7 +282,7 @@ static int tipc_receive_from_sock(struct tipc_conn *con)
>
> out_close:
> if (ret != -EWOULDBLOCK)
> - tipc_close_conn(con);
> + tipc_close_conn(con, false);
> else if (ret == 0)
> /* Don't return success if we really got EOF */
> ret = -EAGAIN;
> @@ -488,7 +490,7 @@ void tipc_conn_terminate(struct tipc_server *s, int conid)
>
> con = tipc_conn_lookup(s, conid);
> if (con) {
> - tipc_close_conn(con);
> + tipc_close_conn(con, false);
> conn_put(con);
> }
> }
> @@ -516,7 +518,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type,
> s = con->server;
> scbr = s->tipc_conn_new(*conid);
> if (!scbr) {
> - tipc_close_conn(con);
> + tipc_close_conn(con, false);
> return false;
> }
>
> @@ -533,7 +535,7 @@ void tipc_topsrv_kern_unsubscr(struct net *net, int conid)
> con = tipc_conn_lookup(tipc_topsrv(net), conid);
> if (!con)
> return;
> - tipc_close_conn(con);
> + tipc_close_conn(con, false);
> conn_put(con);
> }
>
> @@ -606,7 +608,7 @@ static void tipc_send_to_sock(struct tipc_conn *con)
> return;
>
> send_err:
> - tipc_close_conn(con);
> + tipc_close_conn(con, false);
> }
>
> static void tipc_recv_work(struct work_struct *work)
> @@ -698,7 +700,7 @@ void tipc_server_stop(struct tipc_server *s)
> con = idr_find(&s->conn_idr, id);
> if (con) {
> spin_unlock_bh(&s->idr_lock);
> - tipc_close_conn(con);
> + tipc_close_conn(con, true);
> spin_lock_bh(&s->idr_lock);
> }
> }
> --
> 2.7.4