Following nbd and iscsi, commit 89baaa570ab0 ("libceph: use memalloc
flags for net IO") set SOCK_MEMALLOC and PF_MEMALLOC flags for rbd and
cephfs. However, it turned out not to play nice with the loopback scenario,
leading to lockups with a full socket send-q and an empty recv-q.
While we have always advised against colocating the kernel client and ceph
servers on the same box, a few people are doing it and it's also useful
for light development testing, so rather than reverting, make sure
not to set those flags in the loopback case.
Cc: Mike Christie <[email protected]>
Cc: Mel Gorman <[email protected]>
Cc: Sage Weil <[email protected]>
Cc: [email protected] # 3.18+, needs backporting
Signed-off-by: Ilya Dryomov <[email protected]>
---
net/ceph/messenger.c | 40 +++++++++++++++++++++++++++++++++++++---
1 file changed, 37 insertions(+), 3 deletions(-)
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 6b3f54ed65ba..9fa2cce71164 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -101,6 +101,7 @@
#define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */
#define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */
#define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */
+#define CON_FLAG_LOCAL 5 /* using loopback interface */
static bool con_flag_valid(unsigned long con_flag)
{
@@ -110,6 +111,7 @@ static bool con_flag_valid(unsigned long con_flag)
case CON_FLAG_WRITE_PENDING:
case CON_FLAG_SOCK_CLOSED:
case CON_FLAG_BACKOFF:
+ case CON_FLAG_LOCAL:
return true;
default:
return false;
@@ -470,6 +472,18 @@ static void set_sock_callbacks(struct socket *sock,
* socket helpers
*/
+static bool sk_is_loopback(struct sock *sk)
+{
+ struct dst_entry *dst = sk_dst_get(sk);
+ bool ret = false;
+
+ if (dst) {
+ ret = dst->dev && (dst->dev->flags & IFF_LOOPBACK);
+ dst_release(dst);
+ }
+ return ret;
+}
+
/*
* initiate connection to a remote socket.
*/
@@ -484,7 +498,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
IPPROTO_TCP, &sock);
if (ret)
return ret;
- sock->sk->sk_allocation = GFP_NOFS | __GFP_MEMALLOC;
+ sock->sk->sk_allocation = GFP_NOFS;
#ifdef CONFIG_LOCKDEP
lockdep_set_class(&sock->sk->sk_lock, &socket_class);
@@ -510,6 +524,11 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret;
}
+ if (sk_is_loopback(sock->sk))
+ con_flag_set(con, CON_FLAG_LOCAL);
+ else
+ con_flag_clear(con, CON_FLAG_LOCAL);
+
if (con->msgr->tcp_nodelay) {
int optval = 1;
@@ -520,7 +539,18 @@ static int ceph_tcp_connect(struct ceph_connection *con)
ret);
}
- sk_set_memalloc(sock->sk);
+ /*
+ * Tagging with SOCK_MEMALLOC / setting PF_MEMALLOC may lead to
+ * lockups if our peer is on the same host (communicating via
+ * loopback) due to sk_filter() mercilessly dropping pfmemalloc
+ * skbs on the receiving side - receiving loopback socket is
+ * not going to be tagged with SOCK_MEMALLOC. See:
+ *
+ * - http://article.gmane.org/gmane.linux.kernel/1418791
+ * - http://article.gmane.org/gmane.linux.kernel.stable/46128
+ */
+ if (!con_flag_test(con, CON_FLAG_LOCAL))
+ sk_set_memalloc(sock->sk);
con->sock = sock;
return 0;
@@ -2811,7 +2841,11 @@ static void con_work(struct work_struct *work)
unsigned long pflags = current->flags;
bool fault;
- current->flags |= PF_MEMALLOC;
+ /*
+ * See SOCK_MEMALLOC comment in ceph_tcp_connect().
+ */
+ if (!con_flag_test(con, CON_FLAG_LOCAL))
+ current->flags |= PF_MEMALLOC;
mutex_lock(&con->mutex);
while (true) {
--
1.9.3
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to [email protected]
More majordomo info at http://vger.kernel.org/majordomo-info.html