[PATCH 0/3] net/rds: SOL_RDS socket option to explicitly select transport

2015-05-29 Thread Sowmini Varadhan
Today the underlying transport (TCP or IB) for a PF_RDS socket is
implicitly selected based on the local address used to bind(2) the
PF_RDS socket. This results in some non-deterministic behavior when
there are un-numbered and IPoIB interfaces sharing the same IP address.
It also places the constraint that the IB interface must have an IP
address (and thus, IPoIB) configured on it.

The non-determinism may be avoided by providing the user-space application
a socket option that allows it to explicitly select the transport
prior to bind(2).

Patch 1 of this series provides the constant definitions needed by
the application via linux/rds.h.

Patch 2 provides the setsockopt support, and Patch 3 provides the
getsockopt support.

Sowmini Varadhan (3):
  Declare SO_RDS_TRANSPORT and RDS_TRANS_* constants in
uapi/linux/rds.h
  Add setsockopt support for SO_RDS_TRANSPORT
  Add setsockopt support for SO_RDS_TRANSPORT

 include/uapi/linux/rds.h |   10 ++
 net/rds/af_rds.c |   41 +
 net/rds/bind.c   |4 
 net/rds/rds.h|6 +-
 net/rds/transport.c  |   21 +
 5 files changed, 77 insertions(+), 5 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] net/rds Add getsockopt support for SO_RDS_TRANSPORT

2015-05-29 Thread Sowmini Varadhan
The currently attached transport for a PF_RDS socket may be obtained
from user space by invoking getsockopt(2) using the SO_RDS_TRANSPORT
option at the SOL_RDS level. The integer optval returned will be one
of the RDS_TRANS_* constants defined in linux/rds.h.


Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 net/rds/af_rds.c |   14 ++
 1 files changed, 14 insertions(+), 0 deletions(-)

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 0487744..2ad9032 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -339,6 +339,7 @@ static int rds_getsockopt(struct socket *sock, int level, 
int optname,
 {
struct rds_sock *rs = rds_sk_to_rs(sock-sk);
int ret = -ENOPROTOOPT, len;
+   int trans;
 
if (level != SOL_RDS)
goto out;
@@ -364,6 +365,19 @@ static int rds_getsockopt(struct socket *sock, int level, 
int optname,
else
ret = 0;
break;
+   case SO_RDS_TRANSPORT:
+   if (len  sizeof(int)) {
+   ret = -EINVAL;
+   break;
+   }
+   trans = (rs-rs_transport ? rs-rs_transport-t_type :
+RDS_TRANS_NONE); /* unbound */
+   if (put_user(trans, (int __user *)optval) ||
+   put_user(sizeof(int), optlen))
+   ret = -EFAULT;
+   else
+   ret = 0;
+   break;
default:
break;
}
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] net/rds: Declare SO_RDS_TRANSPORT and RDS_TRANS_* constants in uapi/linux/rds.h

2015-05-29 Thread Sowmini Varadhan
User space applications that desire to explicitly select the
underlying transport for a PF_RDS socket may do so by using the
SO_RDS_TRANSPORT socket option at the SOL_RDS level before bind().
The integer argument provided to the socket option would be one
of the RDS_TRANS_* values, e.g., RDS_TRANS_TCP. This commit exports
the constant values need by such applications via linux/rds.h

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 include/uapi/linux/rds.h |   10 ++
 net/rds/rds.h|5 -
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/include/uapi/linux/rds.h b/include/uapi/linux/rds.h
index 9195095..0f9265c 100644
--- a/include/uapi/linux/rds.h
+++ b/include/uapi/linux/rds.h
@@ -38,6 +38,8 @@
 
 #define RDS_IB_ABI_VERSION 0x301
 
+#defineSOL_RDS 276
+
 /*
  * setsockopt/getsockopt for SOL_RDS
  */
@@ -48,6 +50,14 @@
 #define RDS_RECVERR5
 #define RDS_CONG_MONITOR   6
 #define RDS_GET_MR_FOR_DEST7
+#define SO_RDS_TRANSPORT   8
+
+/* supported values for SO_RDS_TRANSPORT */
+#defineRDS_TRANS_IB0
+#defineRDS_TRANS_IWARP 1
+#defineRDS_TRANS_TCP   2
+#define RDS_TRANS_COUNT3
+#defineRDS_TRANS_NONE  (~0)
 
 /*
  * Control message types for SOL_RDS.
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0d41155..76db508 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -408,11 +408,6 @@ struct rds_notifier {
  * should try hard not to block.
  */
 
-#define RDS_TRANS_IB   0
-#define RDS_TRANS_IWARP1
-#define RDS_TRANS_TCP  2
-#define RDS_TRANS_COUNT3
-
 struct rds_transport {
chart_name[TRANSNAMSIZ];
struct list_headt_item;
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] net/rds: Add setsockopt support for SO_RDS_TRANSPORT

2015-05-29 Thread Sowmini Varadhan
An application may deterministically attach the underlying transport for
a PF_RDS socket by invoking setsockopt(2) with the SO_RDS_TRANSPORT
option at the SOL_RDS level. The integer argument to setsockopt must be
one of the RDS_TRANS_* transport types, e.g., RDS_TRANS_TCP. The option
must be specified before invoking bind(2) on the socket, and may only
be used once on the socket. An attempt to set the option on a bound
socket, or to invoke the option after a successful SO_RDS_TRANSPORT
attachment, will return EOPNOTSUPP.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 net/rds/af_rds.c|   27 +++
 net/rds/bind.c  |4 
 net/rds/rds.h   |1 +
 net/rds/transport.c |   21 +
 4 files changed, 53 insertions(+), 0 deletions(-)

diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index 3d83641..0487744 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -270,6 +270,28 @@ static int rds_cong_monitor(struct rds_sock *rs, char 
__user *optval,
return ret;
 }
 
+static int rds_set_transport(struct rds_sock *rs, char __user *optval,
+int optlen)
+{
+   int t_type;
+
+   if (rs-rs_transport)
+   return -EOPNOTSUPP; /* previously attached to transport */
+
+   if (optlen != sizeof(int))
+   return -EINVAL;
+
+   if (copy_from_user(t_type, (int __user *)optval, sizeof(t_type)))
+   return -EFAULT;
+
+   if (t_type  0 || t_type = RDS_TRANS_COUNT)
+   return -EINVAL;
+
+   rs-rs_transport = rds_trans_get(t_type);
+
+   return rs-rs_transport ? 0 : -ENOPROTOOPT;
+}
+
 static int rds_setsockopt(struct socket *sock, int level, int optname,
  char __user *optval, unsigned int optlen)
 {
@@ -300,6 +322,11 @@ static int rds_setsockopt(struct socket *sock, int level, 
int optname,
case RDS_CONG_MONITOR:
ret = rds_cong_monitor(rs, optval, optlen);
break;
+   case SO_RDS_TRANSPORT:
+   lock_sock(sock-sk);
+   ret = rds_set_transport(rs, optval, optlen);
+   release_sock(sock-sk);
+   break;
default:
ret = -ENOPROTOOPT;
}
diff --git a/net/rds/bind.c b/net/rds/bind.c
index a2e6562..4ebd29c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -181,6 +181,10 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
if (ret)
goto out;
 
+   if (rs-rs_transport) { /* previously bound */
+   ret = 0;
+   goto out;
+   }
trans = rds_trans_get_preferred(sin-sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 76db508..a33fb4a 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -798,6 +798,7 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr);
 void rds_trans_put(struct rds_transport *trans);
 unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
   unsigned int avail);
+struct rds_transport *rds_trans_get(int t_type);
 int rds_trans_init(void);
 void rds_trans_exit(void);
 
diff --git a/net/rds/transport.c b/net/rds/transport.c
index 7f2ac4f..8b4a6cd 100644
--- a/net/rds/transport.c
+++ b/net/rds/transport.c
@@ -101,6 +101,27 @@ struct rds_transport *rds_trans_get_preferred(__be32 addr)
return ret;
 }
 
+struct rds_transport *rds_trans_get(int t_type)
+{
+   struct rds_transport *ret = NULL;
+   struct rds_transport *trans;
+   unsigned int i;
+
+   down_read(rds_trans_sem);
+   for (i = 0; i  RDS_TRANS_COUNT; i++) {
+   trans = transports[i];
+
+   if (trans  trans-t_type == t_type 
+   (!trans-t_owner || try_module_get(trans-t_owner))) {
+   ret = trans;
+   break;
+   }
+   }
+   up_read(rds_trans_sem);
+
+   return ret;
+}
+
 /*
  * This returns the number of stats entries in the snapshot and only
  * copies them using the iter if there is enough space for them.  The
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net] net: sk_clone_lock() should only do get_net() if the parent is not a kernel socket

2015-07-30 Thread Sowmini Varadhan


The newsk returned by sk_clone_lock should hold a get_net()
reference if, and only if, the parent is not a kernel socket
(making this similar to sk_alloc()).

E.g,. for the SYN_RECV path, tcp_v4_syn_recv_sock-..inet_csk_clone_lock
sets up the syn_recv newsk from sk_clone_lock. When the parent (listen)
socket is a kernel socket (defined in sk_alloc() as having
sk_net_refcnt == 0), then the newsk should also have a 0 sk_net_refcnt
and should not hold a get_net() reference.

Fixes: 26abe14379f8 (net: Modify sk_alloc to not reference count the
  netns of kernel sockets.)
Acked-by: Eric Dumazet eduma...@google.com
Cc: Eric W. Biederman ebied...@xmission.com
Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: pulled patch #3 out of the RFC patch-set for RDS-TCP netns fixes; 
Added Fixes, Acked-by, Cc fields based on mailing list feedback
from Eric Dumazet.

 net/core/sock.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db..371d1b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
sock_copy(newsk, sk);
 
/* SANITY */
-   get_net(sock_net(newsk));
+   if (likely(newsk-sk_net_refcnt))
+   get_net(sock_net(newsk));
sk_node_init(newsk-sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 0/2] RDS-TCP: Network namespace support

2015-08-03 Thread Sowmini Varadhan
This patch series contains the set of changes to correctly set up 
the infra for PF_RDS sockets that use TCP as the transport in multiple
network namespaces.

Patch 1 in the series is the minimal set of changes to allow
a single instance of RDS-TCP to run in any (i.e init_net or other) net
namespace.  The changes in this patch set ensure that the execution of 
'modprobe [-r] rds_tcp' sets up the kernel TCP sockets 
relative to the current netns, so that RDS applications can send/recv
packets from that netns, and the netns can later be deleted cleanly.

Patch 2 of the series further allows multiple RDS-TCP instances,
one per network namespace. The changes in this patch allows dynamic
creation/tear-down of RDS-TCP client and server sockets  across all
current and future namespaces. 

v2 changes from RFC sent out earlier:
David Ahern comments in patch 1, net_device notifier in patch 2, 
patch 3 broken off and submitted separately.

Sowmini Varadhan (2):
  Make RDS-TCP work correctly when it is set up in a netns other than
init_net
  Support multiple RDS-TCP listen endpoints, one per netns.

 net/rds/bind.c|3 +-
 net/rds/connection.c  |   16 +++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +-
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +-
 net/rds/rds.h |   23 ++-
 net/rds/send.c|3 +-
 net/rds/tcp.c |  167 +++-
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |9 ++-
 net/rds/tcp_listen.c  |   40 
 net/rds/transport.c   |4 +-
 13 files changed, 216 insertions(+), 70 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 2/2] RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.

2015-08-03 Thread Sowmini Varadhan
Register pernet subsys init/stop functions that will set up
and tear down per-net RDS-TCP listen endpoints. Unregister
pernet subusys functions on 'modprobe -r' to clean up these
end points.

Enable keepalive on both accept and connect socket endpoints.
The keepalive timer expiration will ensure that client socket
endpoints will be removed as appropriate from the netns when
an interface is removed from a namespace.

Register a device notifier callback that will clean up all
sockets (and thus avoid the need to wait for keepalive timeout)
when the loopback device is unregistered from the netns indicating
that the netns is getting deleted.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: net_device notifier for synchronous cleanup of sockets.

 net/rds/tcp.c |  163 -
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |6 +-
 net/rds/tcp_listen.c  |   38 +++-
 4 files changed, 164 insertions(+), 50 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 98f5de3..339392b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -35,6 +35,9 @@
 #include linux/in.h
 #include linux/module.h
 #include net/tcp.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/tcp.h
 
 #include rds.h
 #include tcp.h
@@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void)
}
 }
 
-static void rds_tcp_exit(void)
-{
-   rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-   rds_tcp_listen_stop();
-   rds_tcp_destroy_conns();
-   rds_trans_unregister(rds_tcp_transport);
-   rds_tcp_recv_exit();
-   kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
+static void rds_tcp_exit(void);
 
 struct rds_transport rds_tcp_transport = {
.laddr_check= rds_tcp_laddr_check,
@@ -281,6 +275,138 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback  = 1,
 };
 
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+   struct socket *rds_tcp_listen_sock;
+   struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+   struct rds_tcp_net *rtn = container_of(work,
+  struct rds_tcp_net,
+  rds_tcp_accept_w);
+
+   while (rds_tcp_accept_one(rtn-rds_tcp_listen_sock) == 0)
+   cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+   struct net *net = sock_net(sk);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   queue_work(rds_wq, rtn-rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rtn-rds_tcp_listen_sock = rds_tcp_listen_init(net);
+   if (!rtn-rds_tcp_listen_sock) {
+   pr_warn(could not set up listen sock\n);
+   return -EAFNOSUPPORT;
+   }
+   INIT_WORK(rtn-rds_tcp_accept_w, rds_tcp_accept_worker);
+   return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   /* If rds_tcp_exit_net() is called as a result of netns deletion,
+* the rds_tcp_kill_sock() device notifier would already have cleaned
+* up the listen socket, thus there is no work to do in this function.
+*
+* If rds_tcp_exit_net() is called as a result of module unload,
+* i.e., due to rds_tcp_exit() - unregister_pernet_subsys(), then
+* we do need to clean up the listen socket here.
+*/
+   if (rtn-rds_tcp_listen_sock) {
+   rds_tcp_listen_stop(rtn-rds_tcp_listen_sock);
+   rtn-rds_tcp_listen_sock = NULL;
+   flush_work(rtn-rds_tcp_accept_w);
+   }
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+   .init = rds_tcp_init_net,
+   .exit = rds_tcp_exit_net,
+   .id = rds_tcp_netid,
+   .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_kill_sock(struct net *net)
+{
+   struct rds_tcp_connection *tc, *_tc;
+   struct sock *sk;
+   struct list_head tmp_list;
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rds_tcp_listen_stop(rtn-rds_tcp_listen_sock);
+   rtn-rds_tcp_listen_sock = NULL;
+   flush_work(rtn-rds_tcp_accept_w);
+   INIT_LIST_HEAD(tmp_list);
+   spin_lock_irq(rds_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, rds_tcp_conn_list, t_tcp_node) {
+   struct net *c_net = read_pnet(tc-conn-c_net);
+
+   if (net != c_net || !tc-t_sock)
+   continue;
+   list_del(tc-t_tcp_node);
+   list_add_tail(tc-t_tcp_node, tmp_list);
+   }
+   spin_unlock_irq(rds_tcp_conn_lock

[PATCH v2 net-next 1/2] RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net

2015-08-03 Thread Sowmini Varadhan
Open the sockets calling sock_create_kern() with the correct struct net
pointer, and use that struct net pointer when verifying the
address passed to rds_bind().

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: David Ahern comments.

 net/rds/bind.c|3 ++-
 net/rds/connection.c  |   16 ++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +++--
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +++--
 net/rds/rds.h |   23 +++
 net/rds/send.c|3 ++-
 net/rds/tcp.c |4 ++--
 net/rds/tcp_connect.c |3 ++-
 net/rds/tcp_listen.c  |   16 
 net/rds/transport.c   |4 ++--
 12 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin-sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock-sk),
+   sin-sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..d4fecb2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,6 +158,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
conn-c_faddr = faddr;
spin_lock_init(conn-c_lock);
conn-c_next_tx_seq = 1;
+   rds_conn_net_set(conn, net);
 
init_waitqueue_head(conn-c_waitq);
INIT_LIST_HEAD(conn-c_send_queue);
@@ -174,7 +176,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn-c_loopback = 1;
@@ -260,17 +262,19 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy routing.
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..f40d8f5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = rds_conn_create(dp-dp_daddr, dp-dp_saddr, rds_ib_transport,
-  GFP_KERNEL);
+   /* RDS/IB is not currently netns aware, thus init_net */
+   conn = rds_conn_create(init_net, dp-dp_daddr, dp-dp_saddr,
+  rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn

[PATCH v3 net-next 2/2] RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.

2015-08-04 Thread Sowmini Varadhan
Register pernet subsys init/stop functions that will set up
and tear down per-net RDS-TCP listen endpoints. Unregister
pernet subusys functions on 'modprobe -r' to clean up these
end points.

Enable keepalive on both accept and connect socket endpoints.
The keepalive timer expiration will ensure that client socket
endpoints will be removed as appropriate from the netns when
an interface is removed from a namespace.

Register a device notifier callback that will clean up all
sockets (and thus avoid the need to wait for keepalive timeout)
when the loopback device is unregistered from the netns indicating
that the netns is getting deleted.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: net_device notifier for synchronous cleanup of sockets.
v3: Cong Wang comments

 net/rds/tcp.c |  161 -
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |6 +-
 net/rds/tcp_listen.c  |   38 +++-
 4 files changed, 162 insertions(+), 50 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 98f5de3..c42b60b 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -35,6 +35,9 @@
 #include linux/in.h
 #include linux/module.h
 #include net/tcp.h
+#include net/net_namespace.h
+#include net/netns/generic.h
+#include net/tcp.h
 
 #include rds.h
 #include tcp.h
@@ -250,16 +253,7 @@ static void rds_tcp_destroy_conns(void)
}
 }
 
-static void rds_tcp_exit(void)
-{
-   rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-   rds_tcp_listen_stop();
-   rds_tcp_destroy_conns();
-   rds_trans_unregister(rds_tcp_transport);
-   rds_tcp_recv_exit();
-   kmem_cache_destroy(rds_tcp_conn_slab);
-}
-module_exit(rds_tcp_exit);
+static void rds_tcp_exit(void);
 
 struct rds_transport rds_tcp_transport = {
.laddr_check= rds_tcp_laddr_check,
@@ -281,6 +275,136 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback  = 1,
 };
 
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+   struct socket *rds_tcp_listen_sock;
+   struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+   struct rds_tcp_net *rtn = container_of(work,
+  struct rds_tcp_net,
+  rds_tcp_accept_w);
+
+   while (rds_tcp_accept_one(rtn-rds_tcp_listen_sock) == 0)
+   cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+   struct net *net = sock_net(sk);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   queue_work(rds_wq, rtn-rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rtn-rds_tcp_listen_sock = rds_tcp_listen_init(net);
+   if (!rtn-rds_tcp_listen_sock) {
+   pr_warn(could not set up listen sock\n);
+   return -EAFNOSUPPORT;
+   }
+   INIT_WORK(rtn-rds_tcp_accept_w, rds_tcp_accept_worker);
+   return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   /* If rds_tcp_exit_net() is called as a result of netns deletion,
+* the rds_tcp_kill_sock() device notifier would already have cleaned
+* up the listen socket, thus there is no work to do in this function.
+*
+* If rds_tcp_exit_net() is called as a result of module unload,
+* i.e., due to rds_tcp_exit() - unregister_pernet_subsys(), then
+* we do need to clean up the listen socket here.
+*/
+   if (rtn-rds_tcp_listen_sock) {
+   rds_tcp_listen_stop(rtn-rds_tcp_listen_sock);
+   rtn-rds_tcp_listen_sock = NULL;
+   flush_work(rtn-rds_tcp_accept_w);
+   }
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+   .init = rds_tcp_init_net,
+   .exit = rds_tcp_exit_net,
+   .id = rds_tcp_netid,
+   .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_kill_sock(struct net *net)
+{
+   struct rds_tcp_connection *tc, *_tc;
+   struct sock *sk;
+   LIST_HEAD(tmp_list);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rds_tcp_listen_stop(rtn-rds_tcp_listen_sock);
+   rtn-rds_tcp_listen_sock = NULL;
+   flush_work(rtn-rds_tcp_accept_w);
+   spin_lock_irq(rds_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, rds_tcp_conn_list, t_tcp_node) {
+   struct net *c_net = read_pnet(tc-conn-c_net);
+
+   if (net != c_net || !tc-t_sock)
+   continue;
+   list_move_tail(tc-t_tcp_node, tmp_list);
+   }
+   spin_unlock_irq(rds_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, tmp_list, t_tcp_node

[PATCH v3 net-next 0/2] RDS-TCP: Network namespace support

2015-08-04 Thread Sowmini Varadhan
This patch series contains the set of changes to correctly set up 
the infra for PF_RDS sockets that use TCP as the transport in multiple
network namespaces.

Patch 1 in the series is the minimal set of changes to allow
a single instance of RDS-TCP to run in any (i.e init_net or other) net
namespace.  The changes in this patch set ensure that the execution of 
'modprobe [-r] rds_tcp' sets up the kernel TCP sockets 
relative to the current netns, so that RDS applications can send/recv
packets from that netns, and the netns can later be deleted cleanly.

Patch 2 of the series further allows multiple RDS-TCP instances,
one per network namespace. The changes in this patch allows dynamic
creation/tear-down of RDS-TCP client and server sockets  across all
current and future namespaces. 

v2 changes from RFC sent out earlier:
David Ahern comments in patch 1, net_device notifier in patch 2, 
patch 3 broken off and submitted separately.
v3: Cong Wang review comments.

Sowmini Varadhan (2):
  Make RDS-TCP work correctly when it is set up in a netns other than
init_net
  Support multiple RDS-TCP listen endpoints, one per netns.

 net/rds/bind.c|3 +-
 net/rds/connection.c  |   16 +++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +-
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +-
 net/rds/rds.h |   23 ++-
 net/rds/send.c|3 +-
 net/rds/tcp.c |  165 +++-
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |9 ++-
 net/rds/tcp_listen.c  |   40 
 net/rds/transport.c   |4 +-
 13 files changed, 214 insertions(+), 70 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 net-next 1/2] RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net

2015-08-04 Thread Sowmini Varadhan
Open the sockets calling sock_create_kern() with the correct struct net
pointer, and use that struct net pointer when verifying the
address passed to rds_bind().

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: David Ahern comments.

 net/rds/bind.c|3 ++-
 net/rds/connection.c  |   16 ++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |5 +++--
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |5 +++--
 net/rds/rds.h |   23 +++
 net/rds/send.c|3 ++-
 net/rds/tcp.c |4 ++--
 net/rds/tcp_connect.c |3 ++-
 net/rds/tcp_listen.c  |   16 
 net/rds/transport.c   |4 ++--
 12 files changed, 59 insertions(+), 27 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin-sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock-sk),
+   sin-sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..d4fecb2 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,6 +158,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
conn-c_faddr = faddr;
spin_lock_init(conn-c_lock);
conn-c_next_tx_seq = 1;
+   rds_conn_net_set(conn, net);
 
init_waitqueue_head(conn-c_waitq);
INIT_LIST_HEAD(conn-c_send_queue);
@@ -174,7 +176,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn-c_loopback = 1;
@@ -260,17 +262,19 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy routing.
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..f40d8f5 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = rds_conn_create(dp-dp_daddr, dp-dp_saddr, rds_ib_transport,
-  GFP_KERNEL);
+   /* RDS/IB is not currently netns aware, thus init_net */
+   conn = rds_conn_create(init_net, dp-dp_daddr, dp-dp_saddr,
+  rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn

Re: [PATCH RFC net-next 1/3] RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net

2015-07-30 Thread Sowmini Varadhan
On (07/30/15 11:03), David Ahern wrote:
 +write_pnet(conn-c_net, net);
 
 these are typically in wrappers like sock_net and sock_net_set
  :

 +conn = rds_conn_create(init_net, dp-dp_daddr, dp-dp_saddr,
 +   rds_ib_transport, GFP_KERNEL);
 
 I forget what connection this is -- control channel? 

this is IB. It should/will eventually also use any net than init_net,
but for the moment, I'd like to figure out the bigger  issue
of pernet vs per_subsys, which is harder to fix than the above
(and which I'll happily fix later).

I suspect that the right solution may be to have some notifier
callbacks in rds_tcp that listen for ifdown and tear down the sockets,
rather than wait for keepalive timeout.

 Ditto here.

yes, me too.

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC net-next 2/3] RDS-TCP: Support multiple RDS-TCP listen endpoints, one per netns.

2015-07-30 Thread Sowmini Varadhan
Register pernet subsys init/stop functions that will set up
and tear down per-net RDS-TCP listen endpoints. Unregister
pernet subusys functions on 'modprobe -r' to clean up these
end points.

Enable keepalive on both accept and connect socket endpoints.
The keepalive timer expiration will ensure that cleanup_net()
will eventually complete, allowing the pernet -exit to be invoked.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 net/rds/tcp.c |  112 ++--
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |6 ++-
 net/rds/tcp_listen.c  |   38 -
 4 files changed, 115 insertions(+), 48 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 98f5de3..fadf1a1 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -35,6 +35,8 @@
 #include linux/in.h
 #include linux/module.h
 #include net/tcp.h
+#include net/net_namespace.h
+#include net/netns/generic.h
 
 #include rds.h
 #include tcp.h
@@ -250,16 +252,32 @@ static void rds_tcp_destroy_conns(void)
}
 }
 
-static void rds_tcp_exit(void)
+static void rds_tcp_destroy_conns_for_net(struct net *net)
 {
-   rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
-   rds_tcp_listen_stop();
-   rds_tcp_destroy_conns();
-   rds_trans_unregister(rds_tcp_transport);
-   rds_tcp_recv_exit();
-   kmem_cache_destroy(rds_tcp_conn_slab);
+   struct rds_tcp_connection *tc, *_tc;
+   struct list_head tmp_list;
+
+   BUG_ON(!net);
+   INIT_LIST_HEAD(tmp_list);
+   /* avoid calling conn_destroy with irqs off */
+   spin_lock_irq(rds_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, rds_tcp_conn_list, t_tcp_node) {
+   struct net *c_net = read_pnet(tc-conn-c_net);
+
+   if (net == c_net) {
+   list_del(tc-t_tcp_node);
+   list_add_tail(tc-t_tcp_node, tmp_list);
+   }
+   }
+   spin_unlock_irq(rds_tcp_conn_lock);
+   list_for_each_entry_safe(tc, _tc, tmp_list, t_tcp_node) {
+   if (tc-conn-c_passive)
+   rds_conn_destroy(tc-conn-c_passive);
+   rds_conn_destroy(tc-conn);
+   }
 }
-module_exit(rds_tcp_exit);
+
+static void rds_tcp_exit(void);
 
 struct rds_transport rds_tcp_transport = {
.laddr_check= rds_tcp_laddr_check,
@@ -281,6 +299,73 @@ struct rds_transport rds_tcp_transport = {
.t_prefer_loopback  = 1,
 };
 
+static int rds_tcp_netid;
+
+/* per-network namespace private data for this module */
+struct rds_tcp_net {
+   struct socket *rds_tcp_listen_sock;
+   struct work_struct rds_tcp_accept_w;
+};
+
+static void rds_tcp_accept_worker(struct work_struct *work)
+{
+   struct rds_tcp_net *rtn = container_of(work,
+  struct rds_tcp_net,
+  rds_tcp_accept_w);
+
+   while (rds_tcp_accept_one(rtn-rds_tcp_listen_sock) == 0)
+   cond_resched();
+}
+
+void rds_tcp_accept_work(struct sock *sk)
+{
+   struct net *net = sock_net(sk);
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   queue_work(rds_wq, rtn-rds_tcp_accept_w);
+}
+
+static __net_init int rds_tcp_init_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rtn-rds_tcp_listen_sock = rds_tcp_listen_init(net);
+   if (!rtn-rds_tcp_listen_sock) {
+   pr_warn(could not set up listen sock\n);
+   return -EAFNOSUPPORT;
+   }
+   INIT_WORK(rtn-rds_tcp_accept_w, rds_tcp_accept_worker);
+   return 0;
+}
+
+static void __net_exit rds_tcp_exit_net(struct net *net)
+{
+   struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+   rds_tcp_listen_stop(rtn-rds_tcp_listen_sock);
+   rtn-rds_tcp_listen_sock = NULL;
+   flush_work(rtn-rds_tcp_accept_w);
+   rds_tcp_destroy_conns_for_net(net);
+}
+
+static struct pernet_operations rds_tcp_net_ops = {
+   .init = rds_tcp_init_net,
+   .exit = rds_tcp_exit_net,
+   .id = rds_tcp_netid,
+   .size = sizeof(struct rds_tcp_net),
+};
+
+static void rds_tcp_exit(void)
+{
+   rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
+   unregister_pernet_subsys(rds_tcp_net_ops);
+   rds_tcp_destroy_conns();
+   rds_trans_unregister(rds_tcp_transport);
+   rds_tcp_recv_exit();
+   kmem_cache_destroy(rds_tcp_conn_slab);
+}
+module_exit(rds_tcp_exit);
+
 static int rds_tcp_init(void)
 {
int ret;
@@ -293,6 +378,10 @@ static int rds_tcp_init(void)
goto out;
}
 
+   ret = register_pernet_subsys(rds_tcp_net_ops);
+   if (ret)
+   goto out_slab;
+
ret = rds_tcp_recv_init();
if (ret)
goto out_slab;
@@ -301,19 +390,14 @@ static int rds_tcp_init(void)
if (ret)
goto out_recv

[PATCH RFC net-next 1/3] RDS-TCP: Make RDS-TCP work correctly when it is set up in a netns other than init_net

2015-07-30 Thread Sowmini Varadhan
Open the sockets calling sock_create_kern() with the correct struct net
pointer, and use the correct struct net pointer when verifying the
address passed to rds_bind().

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 net/rds/bind.c|3 ++-
 net/rds/connection.c  |   16 ++--
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |4 ++--
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |4 ++--
 net/rds/rds.h |   11 +++
 net/rds/send.c|3 ++-
 net/rds/tcp.c |4 ++--
 net/rds/tcp_connect.c |3 ++-
 net/rds/tcp_listen.c  |   16 
 net/rds/transport.c   |4 ++--
 12 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin-sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock-sk),
+   sin-sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..3bea7b9 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,6 +158,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
conn-c_faddr = faddr;
spin_lock_init(conn-c_lock);
conn-c_next_tx_seq = 1;
+   write_pnet(conn-c_net, net);
 
init_waitqueue_head(conn-c_waitq);
INIT_LIST_HEAD(conn-c_send_queue);
@@ -174,7 +176,7 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn-c_loopback = 1;
@@ -260,17 +262,19 @@ static struct rds_connection *__rds_conn_create(__be32 
laddr, __be32 faddr,
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+   __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy routing.
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..c38d8a0 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = rds_conn_create(dp-dp_daddr, dp-dp_saddr, rds_ib_transport,
-  GFP_KERNEL);
+   conn = rds_conn_create(init_net, dp-dp_daddr, dp-dp_saddr,
+  rds_ib_transport, GFP_KERNEL);
if (IS_ERR(conn)) {
rdsdebug(rds_conn_create failed (%ld)\n, PTR_ERR(conn));
conn = NULL;
diff

[PATCH RFC net-next 0/3] RDS-TCP: Network namespace support

2015-07-30 Thread Sowmini Varadhan

This patch series contains the set of changes to correctly set up 
the infra for PF_RDS sockets that use TCP as the transport in multiple
network namespaces.

Patch 1 in the series is the minimal set of changes to allow
a single instance of RDS-TCP to run in any (i.e init_net or other) 
namespace. The changes in this patch set ensure that the
execution of 'modprobe [-r] rds_tcp' correctly sets up the kernel
TCP sockets relative to the current netns. 

Patch 2 of the series further allows multiple RDS-TCP instances,
one per network namespace. The changes in this patch allows dynamic
creation/tear-down of RDS-TCP client and server sockets  across all
current and future namespaces. 

Comments are specifically invited about the following:

   There is some question in my mind as to whether Patch 2 should
   use register_pernet_subsys() or register_pernet_device(): due
   to the nature of the architecture, RDS/TCP is not a network device,
   but more accurately a subsystem that encapsulates an RDS packet into
   a TCP/IP header at the ksocket layer. However, the listen socket
   is created as part of the -init in the pernet_operations, and the 
   connect/accept sockets get created in the kernel dynamically, with the
   intention that all of these sockets should be cleaned as part of -exit.

   Based on the comments in net_namespace.h, sockets would need
   to be cleaned up as part of a pernet operation, else they would
   hold up lo cleanup.  In the current version of patch2,  that cleanup is
   achieved after the ethernet devices, by the socket keepalive timeout,
   after which the -exit will get called. I'm not sure there is a clean
   way to avoid this.  As thing stand, doing ip netns delete name
   would result in syslogd messages about unregister_netdevice: waiting
   for lo to become free. Usage count .. being seen in the interval between
   ethernet device migration to init_net and the keepalive timeout
 
Patch 3 in this set is independant of the above two changes, and is 
a bugfix/follow up to eeb1bd5c encountered while testing the above.

Sowmini Varadhan (3):
  Make RDS-TCP work correctly when it is set up in a netns other than
init_net
  Support multiple RDS-TCP listen endpoints, one per netns.
  sk_clone_lock() should only do get_net() if the parent is not a
kernel socket

 net/core/sock.c   |3 +-
 net/rds/bind.c|3 +-
 net/rds/connection.c  |   16 ---
 net/rds/ib.c  |2 +-
 net/rds/ib_cm.c   |4 +-
 net/rds/iw.c  |2 +-
 net/rds/iw_cm.c   |4 +-
 net/rds/rds.h |   11 +++--
 net/rds/send.c|3 +-
 net/rds/tcp.c |  116 ++---
 net/rds/tcp.h |7 ++-
 net/rds/tcp_connect.c |9 +++-
 net/rds/tcp_listen.c  |   40 ++---
 net/rds/transport.c   |4 +-
 14 files changed, 155 insertions(+), 69 deletions(-)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC net-next 3/3] net/core/sock.c: sk_clone_lock() should only do get_net() if the parent is not a kernel socket

2015-07-30 Thread Sowmini Varadhan
The newsk returned by sk_clone_lock should hold a get_net()
reference if, and only if, the parent is not a kernel socket
(making this similar to sk_alloc()).

E.g,. for the SYN_RECV call path, tcp_v4_syn_recv_sock-..inet_csk_clone_lock
sets up the syn_recv newsk from sk_clone_lock. When the parent (listen)
socket is a kernel socket (defined in sk_alloc() as having
sk_net_refcnt == 0), then the newsk should also have a 0 sk_net_refcnt
and should not hold a get_net() reference.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 net/core/sock.c |3 ++-
 1 files changed, 2 insertions(+), 1 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db..371d1b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
sock_copy(newsk, sk);
 
/* SANITY */
-   get_net(sock_net(newsk));
+   if (likely(newsk-sk_net_refcnt))
+   get_net(sock_net(newsk));
sk_node_init(newsk-sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan

I'm running into a netns refcnt issue, and I suspect that 
eeb1bd5c has something to do with it (perhaps we need an 
additional change in sk_clone_lock() after eeb1bd5c). 
Here's the problem:

When we create an syn_recv sock based on a kernel listen sock, we
take a get_net() ref  with a stack similar to the one shown below.
Note that the parent (kernel, listen) sock itself has not taken
a get_net() ref, because it explicitly calls sock_create_kern().

  get_net /* for the newsk */
  sk_clone_lock
  inet_csk_clone_lock
  tcp_create_openreq_child
  tcp_v4_syn_recv_sock
  tcp_check_req
  tcp_v4_do_rcv
  tcp_v4_rcv 
   :

But it's not clear to me where this refcnt will be released: 
in my case, I expect to create/cleanup kernel sockets as part 
of -init/-exit for my module, but because the accept socket 
has a netns refcnt, it blocks cleanup_net(), thus my -exit 
pernet_subsys op cannot run and clean this up, and we have a leak.

I think that sk_clone_lock() should only do a get_net() if the parent
is not a kernel socket (making this similar to sk_alloc()), i.e.,

diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db..371d1b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gf
sock_copy(newsk, sk);

/* SANITY */
-   get_net(sock_net(newsk));
+   if (likely(newsk-sk_net_refcnt))
+   get_net(sock_net(newsk));
sk_node_init(newsk-sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);

Does this sound right?

--Sowmini

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan
On (07/27/15 12:40), ebied...@xmission.com wrote:
 sock_create_kern and friends are specialied interfaces for special
 purposes.  At a quick read through I don't think we have a single in
 tree user doing with them what you are trying to do.

That doesnt change the fact that the architecture is questionable.
and my description should make it quite clear why this is so.

 
 Without seeing code using the interfaces in the way are trying to use
 them I do not have enough information to comment intelligently.

Ok, here you go.

I'm still testing it, but there's enough there for you to see the bug
quite clearly.

Enjoy. I think my other mail had better information to comment intelligently
but ymmv.

--Sowmini

diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db..371d1b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
sock_copy(newsk, sk);
 
/* SANITY */
-   get_net(sock_net(newsk));
+   if (likely(newsk-sk_net_refcnt))
+   get_net(sock_net(newsk));
sk_node_init(newsk-sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin-sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock-sk),
+   sin-sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..273fa6c 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,7 +158,7 @@ new_conn:
conn-c_faddr = faddr;
spin_lock_init(conn-c_lock);
conn-c_next_tx_seq = 1;
-
+   write_pnet(conn-c_net, net);
init_waitqueue_head(conn-c_waitq);
INIT_LIST_HEAD(conn-c_send_queue);
INIT_LIST_HEAD(conn-c_retrans);
@@ -174,7 +175,7 @@ new_conn:
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn-c_loopback = 1;
@@ -260,17 +261,19 @@ out:
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy routing.
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..c38d8a0 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = 

Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan
On (07/27/15 11:13), Cong Wang wrote:
 
 That refcnt should be released in sock destructor too, when the tcp
 connection is terminated.

yes, but in my case, the listen socket is opened as part of
the -init indirection in pernet_operations (thus it is a kernel socket)
and the expectation is that this listen socket, and any accept sockets
derived from it, will be closed in -exit.

But if the accept socket is treated as a uspace socket (thus holds a get_net())
then it will block cleanup_net() and the associated -exit cleanup operations.

This is probably not a problem for other systems like vxlan/gue/geneve etc
because they all use udp sockets, thus dont have the accept equivalent.

But fundamentally, its wrong for a kspace listen socket to result in a
uspace accept socket.

 Given the fact that sk_destruct() checks for sk_net_refcnt, your
 patch makes sense to me. But I am not sure how a TCP kernel
 socket is supposed to use.

Thanks for the confirmation - I think RDS is a bit of a maverick here in
that it uses tcp sockets unlike vxlan etc.
For those curious about RDS-TCP, I've actually updated the documentation at
https://oss.oracle.com/projects/rds/dist/documentation/rds-3.1-spec.html
recently. I hope that helps.

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan
On (07/27/15 11:37), Cong Wang wrote:
 
 dlm uses a kernel TCP socket too, but it allocates a new socket and calls
 -accept() by itself. ;)

sure, and rds does this in rds_tcp_accept_one() too.

But the newsk being created in sk_clone_lock  is the one on an 
incoming syn, i.e., the one that is saved up as part of listen backlog, 
to be returned later on the accept.

I dont know the details of dlm- can you have one dlm instance per
network namespace? That's where I'm running into this issue- when we
try to have one rds listen socket per netns, and want to be able to
do both
- dynamically build/tear down new network namepsaces, without 
  unloading rds_tcp globally
- unload rds_tcp globally withouth tearing down individual netns.

But perhaps we digress.

Fundamental issue remains: newsk is the syn_recv version of the
listen socket. If the listen socket is a kernel socket (kern == 1
for sk_alloc, and the listen socket thus has no sk_net_refcnt),
the syn_recv socket must also have that behavior, so that it is
cleaned up in the same way.

--Sowmini

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ARP response with link local IP, why not broadcast

2015-07-21 Thread Sowmini Varadhan
On Tue, Jul 21, 2015 at 4:38 PM, Sebastian Fett db_ext...@gmx.de wrote:
 Hello!

 According to RFC3927 every ARP packet (reply and request) should be sent as
 link layer broadcast as long as the sender IP is a link local address. (see
 chapter 2.5).

Because broadcast replies are noisy and should be avoided.
if possible- it creates a broadcast flood that would wake up all receivers,
and is especially undesirable in today's world, where bcast would wake
up sleepy devices, or require other inefficient processes in a cloud env.
See also https://www.ietf.org/id/draft-nordmark-6man-dad-approaches-01.txt

 That functionality would help me a lot with a use case I have with our
 application.

what is your use case?


 But it is not implemented in the kernel that way.
 Does anyone know why?

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ARP response with link local IP, why not broadcast

2015-07-22 Thread Sowmini Varadhan
On Wed, Jul 22, 2015 at 9:49 AM, Sebastian Fett db_ext...@gmx.de wrote:

 what is your use case?


 My problem ist a local network of audio devices. It is a valid possibility
 that two halfs of the setup are set up individually (Stage left and stage
 right). Both local networks will auto configure themselves via link local
 and will be stable. But there always can be two devices with the same IP in
 both networks.
 At one point those two networks will be connected. With the current
 behaviour the conflicting devices will never know of each other and the
 address conflict.

Ah yes, this is a valid problem (Partition-Join tolerance) and one that is
being discussed in the Ipv6 context on 6man:
http://www.ietf.org/mail-archive/web/ipv6/current/msg22712.html

FWIW, when Solaris implemented ACD (rfc 5227) the compromise
that was made between bcasting *every* ARP response whle solving
the type or issue that you describe was to use a periodic ARP announce,
advertising the IP address (a Grat ARP) with exponential backoff.
If a duplicate address is triggered (as would happen in the scenario
that you describe) the system would fall into the aggressive defend mode.

ARP announcemnts were bcast, but the noise is mitigated by tunable
exponential backoff.

Of course, all of this only helps to *detect* the duplicate- eventually
some other entity has to jump in and arbitrate on which one should
own the address.

 The devices are controlled by a central PC using avahi/bonjour. It will know
 of all conflicting devices, but will only be able to talk to the one that
 happens to be in it's ARP cache. And renewing that cache will not change
 anything, because it will happen with unicast messages.

 I looked at a Dante Controller (an audio data streaming device). And here
 all ARP messages are answered with broadcasts.

 I think that behaviour is acceptable because it only happens in local
 networks. Waking up sleeping devices will not be a concern there.

I dont know if a short term solution that makes sense here is to have
a tunable for this.

But even the always bcast arp response will fail if you have a silent
rejoin of the partitioned network- there is a reliance on the owner
of an address bcasting their ARP resp at some point right?

(there's also a DoS vector here- I can create a lot of bcast traffic
by arping for an address..)

 That brings me to another question. When I react to an ARP packet in a
 userspace program, can I keep that packet from reaching the kernel as well?
 I would like to avoid to completely handle ARP in userspace.

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


rfc: making rds-tcp netns aware

2015-07-15 Thread Sowmini Varadhan

I am working on making rds-tcp to be netns-aware, and in addition
to a few bug fixes that I'm lining up, there's a basic issue with 
the way rds-tcp sets up the listen socket that is causing problems

The RDS tcp listen endpoint is created as part of module init.
(rds_tcp_init - rds_tcp_listen_init()). So this means that
if I create a blue netns, and 'modprobe rds_tcp' within that netns,
I get a kernel socket attached to the blue netns (which is good),
but then I cannot use the same technique to set up a socket
for a different netns ('modprobe rds_tcp' in that netns 
will return silently, as it should). 

And there's another downside to this design: the socket wont
get released till the module is unloaed, so it ends up holding
the reference on the net.

So perhaps it was not a good idea to set up the listen socket
as part of module init, but I'm trying to figure out a clean
design for setting up the listen socket.

Some uspace daemon that listens for changes to namespaces and
reacts appropriately? A separate sysctl that sets up the listen
endpoint in each namespace? 

Are there other subsystems that have to handle a similar case?  
I suspect that RDS-TCP is somewhat unusual here- I think most other
similar encaps protocols like vxlan etc are associated with a network 
driver, so the listen endpoint is created as part of the -ndo_open

Suggestions for other modules that have to deal with a similar
situation that I can refer to are invited..

--Sowmini

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC net-next] net/vxlan: Fix kernel unaligned access in __vxlan_find_mac

2015-07-17 Thread Sowmini Varadhan

__vxlan_find_mac invokes ether_addr_equal on the eth_addr field,
which triggers unaligned access messages, so rearrange vxlan_fdb
to avoid this as non-intrusively as possible. 

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
 drivers/net/vxlan.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 34c519e..c9790a2 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -107,8 +107,8 @@ struct vxlan_fdb {
unsigned long used;
struct list_head  remotes;
u16   state;/* see ndm_state */
-   u8flags;/* see ndm_flags */
u8eth_addr[ETH_ALEN];
+   u8flags;/* see ndm_flags */
 };
 
 /* Pseudo network device */
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2 RFC net-next] net/vxlan: Fix kernel unaligned access in __vxlan_find_mac

2015-07-17 Thread Sowmini Varadhan
On (07/17/15 16:07), Joe Perches wrote:
 On Fri, 2015-07-17 at 22:00 +0200, Sowmini Varadhan wrote:
  __vxlan_find_mac invokes ether_addr_equal on the eth_addr field,
  which triggers unaligned access messages, so rearrange vxlan_fdb
  to avoid this in the most non-intrusive way.
 
 What arch does this?

sparc.

BTW, I was also getting a lot of alignment errors from 
vxlan_xmit_skb (vxh comes out unaligned) for the IPv6 path.
I did not have time to investigate/fix this correctly- not sure
if this is specific to v6.

--Sowmini

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 RFC net-next] net/vxlan: Fix kernel unaligned access in __vxlan_find_mac

2015-07-17 Thread Sowmini Varadhan

__vxlan_find_mac invokes ether_addr_equal on the eth_addr field,
which triggers unaligned access messages, so rearrange vxlan_fdb
to avoid this in the most non-intrusive way.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: Alexander Duyck comments: make eth_addr[] 64b aligned.

 drivers/net/vxlan.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 34c519e..ec86a11 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -106,9 +106,9 @@ struct vxlan_fdb {
unsigned long updated;  /* jiffies */
unsigned long used;
struct list_head  remotes;
+   u8eth_addr[ETH_ALEN];
u16   state;/* see ndm_state */
u8flags;/* see ndm_flags */
-   u8eth_addr[ETH_ALEN];
 };
 
 /* Pseudo network device */
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv2 RFC net-next] net/vxlan: Fix kernel unaligned access in __vxlan_find_mac

2015-07-19 Thread Sowmini Varadhan

On 07/18/2015 08:06 PM, Joe Perches wrote:


It seems that this code has had unaligned accesses
on this field even before compare_ether_addr was
converted to ether_addr_equal.

Is sparc64 the only one that emits / ratelimits that
unaligned access message?  I looked a little, but I
didn't find a fixup message when MIPS does unaligned
accesses.  Are all the other arches silent when
fixing up unaligned accesses?  Maye adding a generic
debug only ratelimited message might help remove
more of these.  As it's not fatal, naybe the sparc64
message should be KERN_DEBUG/pr_debug.


I'm confused, are we suggesting that we fix the unaligned
access by snuffing out the message that complains loudly and correctly
about it?

See also: large block comment above __pksb_trim
about correctly using skb_reserve(). Evidently not being
correctly done for the IPv6-vxlan code path (and possibly
for other encaps too?)

--Sowmini



--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next] net/vxlan: Fix kernel unaligned access in __vxlan_find_mac

2015-07-20 Thread Sowmini Varadhan

__vxlan_find_mac invokes ether_addr_equal on the eth_addr field,
which triggers unaligned access messages, so rearrange vxlan_fdb
to avoid this in the most non-intrusive way.

Signed-off-by: Sowmini Varadhan sowmini.varad...@oracle.com
---
v2: Alexander Duyck comments: place eth_addr[] to be 64b aligned

 drivers/net/vxlan.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 34c519e..ec86a11 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -106,9 +106,9 @@ struct vxlan_fdb {
unsigned long updated;  /* jiffies */
unsigned long used;
struct list_head  remotes;
+   u8eth_addr[ETH_ALEN];
u16   state;/* see ndm_state */
u8flags;/* see ndm_flags */
-   u8eth_addr[ETH_ALEN];
 };
 
 /* Pseudo network device */
-- 
1.7.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next] xfrm: Fix unaligned access to stats in copy_to_user_state()

2015-10-21 Thread Sowmini Varadhan

On sparc, deleting established SAs (e.g., by restarting ipsec)
results in unaligned access messages via xfrm_del_sa -> 
km_state_notify -> xfrm_send_state_notify().

Even though struct xfrm_usersa_info is aligned on 8-byte boundaries,
netlink attributes are fundamentally only 4 byte aligned, and this
cannot be changed for nla_data() that is passed up to userspace.
As a result, the put_unaligned() macro needs to be used to
set up potentially unaligned fields such as the xfrm_stats in
copy_to_user_state()

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: review comment from thread: cannot use PTR_ALIGN as this would break
userspace assumptions about 4 byte alignment. Use *_unaligned() macros
as needed, instead.

 net/xfrm/xfrm_user.c |5 -
 1 files changed, 4 insertions(+), 1 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a8de9e3..639e0d5 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -31,6 +31,7 @@
 #if IS_ENABLED(CONFIG_IPV6)
 #include 
 #endif
+#include 
 
 static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type)
 {
@@ -728,7 +729,9 @@ static void copy_to_user_state(struct xfrm_state *x, struct 
xfrm_usersa_info *p)
memcpy(>sel, >sel, sizeof(p->sel));
memcpy(>lft, >lft, sizeof(p->lft));
memcpy(>curlft, >curlft, sizeof(p->curlft));
-   memcpy(>stats, >stats, sizeof(p->stats));
+   put_unaligned(x->stats.replay_window, >stats.replay_window);
+   put_unaligned(x->stats.replay, >stats.replay);
+   put_unaligned(x->stats.integrity_failed, >stats.integrity_failed);
memcpy(>saddr, >props.saddr, sizeof(p->saddr));
p->mode = x->props.mode;
p->replay_window = x->props.replay_window;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-29 Thread Sowmini Varadhan


This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that uspport it, and use IDPROM on SPARC if no OF address
is found.

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 drivers/net/ethernet/intel/i40e/i40e_common.c |   36 +
 1 files changed, 36 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 2d74c6e..53f804a 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -24,11 +24,23 @@
  *
  
**/
 
+#include 
+#ifdef CONFIG_OF
+#include 
+#endif
+#include 
+#include "i40e.h"
+
 #include "i40e_type.h"
 #include "i40e_adminq.h"
 #include "i40e_prototype.h"
 #include "i40e_virtchnl.h"
 
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /**
  * i40e_set_mac_type - Sets MAC type
  * @hw: pointer to the HW structure
@@ -1008,6 +1020,27 @@ i40e_status i40e_aq_mac_address_write(struct i40e_hw *hw,
return status;
 }
 
+static int i40e_get_platform_mac_addr(struct i40e_hw *hw, u8 *mac_addr)
+{
+#ifdef CONFIG_OF
+   struct i40e_pf *pf = hw->back;
+   struct device_node *dp = pci_device_to_OF_node(pf->pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#endif /* CONFIG_OF */
+
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#endif /* CONFIG_SPARC */
+   return 1;
+}
+
 /**
  * i40e_get_mac_addr - get MAC address
  * @hw: pointer to the HW structure
@@ -1021,6 +1054,9 @@ i40e_status i40e_get_mac_addr(struct i40e_hw *hw, u8 
*mac_addr)
i40e_status status;
u16 flags = 0;
 
+   if (!i40e_get_platform_mac_addr(hw, mac_addr))
+   return I40E_SUCCESS;
+
status = i40e_aq_mac_address_read(hw, , , NULL);
 
if (flags & I40E_AQC_LAN_ADDR_VALID)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net] RDS-TCP: Recover correctly from pskb_pull()/pksb_trim() failure in rds_tcp_data_recv

2015-10-26 Thread Sowmini Varadhan

Either of pskb_pull() or pskb_trim() may fail under low memory conditions.
If rds_tcp_data_recv() ignores such failures, the application will
receive corrupted data because the skb has not been correctly
carved to the RDS datagram size.

Avoid this by handling pskb_pull/pskb_trim failure in the same
manner as the skb_clone failure: bail out of rds_tcp_data_recv(), and
retry via the deferred call to rds_send_worker() that gets set up on
ENOMEM from rds_tcp_read_sock()

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/tcp_recv.c |   11 +--
 1 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c
index fbc5ef8..27a9921 100644
--- a/net/rds/tcp_recv.c
+++ b/net/rds/tcp_recv.c
@@ -214,8 +214,15 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, 
struct sk_buff *skb,
}
 
to_copy = min(tc->t_tinc_data_rem, left);
-   pskb_pull(clone, offset);
-   pskb_trim(clone, to_copy);
+   if (!pskb_pull(clone, offset) ||
+   pskb_trim(clone, to_copy)) {
+   pr_warn("rds_tcp_data_recv: pull/trim failed "
+   "left %zu data_rem %zu skb_len %d\n",
+   left, tc->t_tinc_data_rem, skb->len);
+   kfree_skb(clone);
+   desc->error = -ENOMEM;
+   goto out;
+   }
skb_queue_tail(>ti_skb_list, clone);
 
rdsdebug("skb %p data %p len %d off %u to_copy %zu -> "
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 2/2] xfrm: Fix unaligned access in xfrm_notify_sa() for DELSA

2015-10-21 Thread Sowmini Varadhan
On (10/21/15 06:22), David Miller wrote:
> memcpy() _never_ works for avoiding unaligned accessed.
> 
> I repeat, no matter what you do, no matter what kinds of casts or
> fancy typing you use, memcpy() _never_ works for this purpose.
  :
> There is one and only one portable way to access unaligned data,
> and that is with the get_unaligned() and put_unaligned() helpers.

ok. I'll fix it up to use the *_unaligned functions and resend this 
out later today.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Routing loops & TTL tracking with tunnel devices

2015-11-16 Thread Sowmini Varadhan
On (11/16/15 21:14), Jason A. Donenfeld wrote:
> 
> But what about in devices for which self-routing might actually be
> useful? For example, let's say that if an incoming skb is headed for
> dst X, it gets encapsulated and sent to dst A, and for dst Y it gets
> encapsulated and sent to dst B, and for dst Z it gets encapsulated and
> sent to dst C. I can imagine situations in which setting A==Y and B==Z
> might be useful to do multiple levels of encapsulation on one device,
> so that skbs headed for dst X get sent to dst C, but with intermediate
> transformations of dst A and dst B.

I believe that what you are talking about is basically nested encapsulation-
see https://tools.ietf.org/html/rfc2473. 

The tunnelling endpoint could track the number of encapsulations and keep
a limit on that? (conceptually this may be the same thing as your ttl
proposal, except that "ttl" has other meanings in other contexts, so
a bit non-intuitive) 

--Sowmini

(fwiw, RFC 2473 proposes an ipv6 option to track nested encapsulation,
and that never took off, because, among other reasons, its hard to
offload such options to hardware. Anyway, you are not trying to carry
this around in the packet).

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Routing loops & TTL tracking with tunnel devices

2015-11-16 Thread Sowmini Varadhan
> Neat. Though, in my case, I'm not actually just prepending a header.
> I'm doing some more substantial transformations of a packet. And this
> needs to work with v4 too. So I'm not sure implementing a v6 spec will

Understood, that spec was just referenced to indicate that there
are more issues (mtu reduction etc) with nested encapsulation,
and this is actually applicable even without the recursion issue
(i.e even if you dont have a tunnelling loop, and even if it
is not ipv6, there are some non-trivial problems here. Luckily,
nested encaps is somewhat uncommon).

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-02 Thread Sowmini Varadhan
On (11/02/15 17:26), Nelson, Shannon wrote:
> > I assume you mean .1q 
> 
> Yes, this is what I had in mind.

I dont think we're quite there yet, even without vlans.

If I turn on/off tcpdump, there's something about the way that the link
is bounced that leaves the device down while tcpdump is running. Then
after I exit tcpdump, it bounces things a few more times again, packets
flow for a brief interval, and then there's silence.

Seems like there's is a workq that results in
i40e_service_task->i40e_sync_vsi_filters that periodically resets things.

Doing 'ip link set eth0 promisc on' keeps things nice and steady.

How is this all supposed to work if I change the macaddr from /sbin/ip 
using i40e_set_mac() and then jiggle the promisc (either just the flag,
or with tcpdump)? (I cant tell because I dont have an x86 machine with 
i40e handy)

To frame the question differently, where all should I be invoking
the new i40e_macaddr_init() function from? 

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan
On (10/30/15 22:03), Nelson, Shannon wrote:
> The more common idiom in our driver would be
> 
>   err = i40e_get_platform_mac_addr(..);
>   if (err) {

Ok.

> Have you tested this beyond a compile?
> Do you have a DT model to try this against?

yes.

> In looking at a couple other drivers, I see the difference being that
> they typically are writing the primary mac filter on probe (and any
> other reset), whereas the i40e "knows" that the default mac address is
> already set up as the filter and doesn't bother with a redundant write.
> If you want to add this Open Filter code, you'll need to arrange for
> this write to happen.  You can't call i40e_set_mac() to do it, but you
> can see the i40e_aq_mac_address_write() code there that is involved in
> updating the mac address as an example.  You probably will want to look
> at section 4.2.1.5.3 of the XL710 data sheet in order to know how to
> use i40e_aq_mac_address_write() for your situation.

ok. I'll look into it (and also why this did not show up in my testing).

fwiw, the ixgbe patch is quite clearly missing in i40e, and hopefully
we wont be opportunistically fixing this one driver at a time in 
the future.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan
On (10/30/15 02:14), Andy Shevchenko wrote:
> 
> Does the following has no stubs?
> 
> > +   struct i40e_pf *pf = hw->back;
> > +   struct device_node *dp = pci_device_to_OF_node(pf->pdev);
> > +   const unsigned char *addr;
> > +
> > +   addr = of_get_mac_address(dp);
> 
> ^^^

I was not able to find any.
I'm fixing up the rest and respinning V2 as a separate thread.

Thanks
--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan


This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: review comments from Andy Shevchenko

 drivers/net/ethernet/intel/i40e/i40e_common.c |   32 +
 1 files changed, 32 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 2d74c6e..5b3e377 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -24,6 +24,14 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+#include "i40e.h"
 #include "i40e_type.h"
 #include "i40e_adminq.h"
 #include "i40e_prototype.h"
@@ -1008,6 +1016,27 @@ i40e_status i40e_aq_mac_address_write(struct i40e_hw *hw,
return status;
 }
 
+static int i40e_get_platform_mac_addr(struct i40e_hw *hw, u8 *mac_addr)
+{
+#ifdef CONFIG_OF
+   struct i40e_pf *pf = hw->back;
+   struct device_node *dp = pci_device_to_OF_node(pf->pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#endif /* CONFIG_OF */
+
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#endif /* CONFIG_SPARC */
+   return 1;
+}
+
 /**
  * i40e_get_mac_addr - get MAC address
  * @hw: pointer to the HW structure
@@ -1021,6 +1050,9 @@ i40e_status i40e_get_mac_addr(struct i40e_hw *hw, u8 
*mac_addr)
i40e_status status;
u16 flags = 0;
 
+   if (!i40e_get_platform_mac_addr(hw, mac_addr))
+   return I40E_SUCCESS;
+
status = i40e_aq_mac_address_read(hw, , , NULL);
 
if (flags & I40E_AQC_LAN_ADDR_VALID)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v4 RFC net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-01 Thread Sowmini Varadhan
This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

In the case of the i40e there is an assumption that the default mac
address has already been set up as the primary mac filter on probe,
so if this filter is obtained from the Open Firmware or IDPROM, an
explicit write is needed via i40e_aq_mac_address_write() and
i40e_aq_add_macvlan() invocation.

Reviewers: please check if invoking i40e_macaddr_init() on 
platforms that use the default mac address (i.e., when it is not from
OF or idprom) will cause harm, and if it is necessary/possible to
move this invocation to an earlier point in i40e_probe().

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2, v3: Andy Shevchenko comments
v4: 

 drivers/net/ethernet/intel/i40e/i40e_main.c |   53 ++-
 1 files changed, 52 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b825f97..3c81c0c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,15 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
@@ -9212,6 +9221,25 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
return NULL;
 }
 
+static int i40e_macaddr_init( struct i40e_vsi *vsi, u8 *macaddr)
+{
+   int ret; 
+   struct i40e_aqc_add_macvlan_element_data element;
+
+   ret = i40e_aq_mac_address_write(>back->hw,
+   I40E_AQC_WRITE_TYPE_LAA_WOL,
+   macaddr, NULL);
+   if (ret)
+   return -EADDRNOTAVAIL;
+
+   memset(, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
+   i40e_aq_add_macvlan(>back->hw, vsi->seid, , 1, NULL);
+
+   return ret;
+}
+
 /**
  * i40e_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
@@ -9341,6 +9369,9 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
type,
ret = i40e_config_netdev(vsi);
if (ret)
goto err_netdev;
+   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr); 
+   if (ret)
+   goto err_netdev;
ret = register_netdev(vsi->netdev);
if (ret)
goto err_netdev;
@@ -10162,6 +10193,24 @@ static void i40e_print_features(struct i40e_pf *pf)
kfree(string);
 }
 
+static int i40e_get_platform_mac_addr(struct pci_dev *pdev, u8 *mac_addr)
+{
+   struct device_node *dp = pci_device_to_OF_node(pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#else
+   return -EINVAL;
+#endif /* CONFIG_SPARC */
+}
+
 /**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
@@ -10360,7 +10409,9 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
i40e_aq_stop_lldp(hw, true, NULL);
}
 
-   i40e_get_mac_addr(hw, hw->mac.addr);
+   err = i40e_get_platform_mac_addr(pdev, hw->mac.addr);
+   if (err)
+   i40e_get_mac_addr(hw, hw->mac.addr);
if (!is_valid_ether_addr(hw->mac.addr)) {
dev_info(>dev, "invalid MAC address %pM\n", hw->mac.addr);
err = -EIO;
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-01 Thread Sowmini Varadhan
On (10/30/15 19:13), Sowmini Varadhan wrote:
> > In looking at a couple other drivers, I see the difference being that
> > they typically are writing the primary mac filter on probe (and any
> > other reset), whereas the i40e "knows" that the default mac address is
> > already set up as the filter and doesn't bother with a redundant write.
> > If you want to add this Open Filter code, you'll need to arrange for
> > this write to happen.  You can't call i40e_set_mac() to do it, but you
> > can see the i40e_aq_mac_address_write() code there that is involved in
> > updating the mac address as an example.  You probably will want to look
> > at section 4.2.1.5.3 of the XL710 data sheet in order to know how to
> > use i40e_aq_mac_address_write() for your situation.
>
> ok. I'll look into it (and also why this did not show up in my testing).

So I figured out why it all "seemed to work" - my test env had another
obscure init process that was marking the link promiscuous.  I guess
that was having the side-effect of somehow setting the filters above.

But looks like there's more to getting this right than just calling
i40e_aq_mac_address_write() - I think it also needs a i40e_aq_add_macvlan().  

I was able to get this to work by calling a the core part of
i40e_set_mac just before register_netdev. In my patch (RFC patch 
in a separate thread - please review) I now have this sequence in
i40e_probe

err = i40e_get_platform_mac_addr(pdev, hw->mac.addr);
if (err)
i40e_get_mac_addr(hw, hw->mac.addr);
 : 
i40e_setup_pf_switch(..);

And the resulting i40e_vsi_setup() from i40e_setup_pf_switch()
will end up doing the right thing by invoking the guts of 
i40e_set_mac(), which is basically the  sequence:
i40e_aq_mac_address_write()
i40e_aq_add_macvlan()   

I dont know if it is necessary/possible/important to set up the
filters sooner in the sequence- the add_macvlan needs an "seid",
and I could not tell when (in the ":" code above) the right seid 
can be found.  

Please review the RFC patch I'll be sending shortly.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-01 Thread Sowmini Varadhan
On (11/01/15 21:03), Nelson, Shannon wrote:
> .. In the meantime, be sure to test what happens over a reset, such as what
> happens when the MTU is changed.  This will make sure that the replay
> of mac and vlan filters happens correctly.  You'll want to test this
> with and without vlans.

I assume you mean .1q (aka linux macvlan) as opposed to access/trunk vlans? 
I will test that tomorrow but I did a quick sanity check on mtu, as well
as turning tso on/off which also restarts the driver (I believe), and
it was "fine", i.e., able to ping offlink hosts. 

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan

This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: andy shevchenko comments
v3: more andy shevchenko comments
 drivers/net/ethernet/intel/i40e/i40e_common.c |   30 +
 1 files changed, 30 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c 
b/drivers/net/ethernet/intel/i40e/i40e_common.c
index 2d74c6e..3edfe19 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_common.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_common.c
@@ -24,6 +24,14 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+#include "i40e.h"
 #include "i40e_type.h"
 #include "i40e_adminq.h"
 #include "i40e_prototype.h"
@@ -1008,6 +1016,25 @@ i40e_status i40e_aq_mac_address_write(struct i40e_hw *hw,
return status;
 }
 
+static int i40e_get_platform_mac_addr(struct i40e_hw *hw, u8 *mac_addr)
+{
+   struct i40e_pf *pf = hw->back;
+   struct device_node *dp = pci_device_to_OF_node(pf->pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#endif /* CONFIG_SPARC */
+   return 1;
+}
+
 /**
  * i40e_get_mac_addr - get MAC address
  * @hw: pointer to the HW structure
@@ -1021,6 +1048,9 @@ i40e_status i40e_get_mac_addr(struct i40e_hw *hw, u8 
*mac_addr)
i40e_status status;
u16 flags = 0;
 
+   if (!i40e_get_platform_mac_addr(hw, mac_addr))
+   return I40E_SUCCESS;
+
status = i40e_aq_mac_address_read(hw, , , NULL);
 
if (flags & I40E_AQC_LAN_ADDR_VALID)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v6] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan

This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

In the case of the i40e there is an assumption that the default mac
address has already been set up as the primary mac filter on probe,
so if this filter is obtained from the Open Firmware or IDPROM, an
explicit write is needed via i40e_aq_mac_address_write() and
i40e_aq_add_macvlan() invocation.

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2, v3: Andy Shevchenko comments
v4: Shannon Nelson review: explicitly set up mac filters before register_netdev
v5: Shannon Nelson code style comments
v6: Shannon Nelson code style comments

 drivers/net/ethernet/intel/i40e/i40e_main.c |   83 ++-
 1 files changed, 82 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b825f97..e355873 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,15 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
@@ -9213,6 +9222,44 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
 }
 
 /**
+ * i40e_macaddr_init - explicitly write the mac address filters.
+ *
+ * @vsi: pointer to the vsi.
+ * @macaddr: the MAC address
+ *
+ * This is needed when the macaddr has been obtained by other
+ * means than the default, e.g., from Open Firmware or IDPROM.
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_macaddr_init(struct i40e_vsi *vsi, u8 *macaddr)
+{
+   int ret, aq_err;
+   struct i40e_aqc_add_macvlan_element_data element;
+
+   ret = i40e_aq_mac_address_write(>back->hw,
+   I40E_AQC_WRITE_TYPE_LAA_WOL,
+   macaddr, NULL);
+   if (ret) {
+   dev_info(>back->pdev->dev,
+"Addr change for VSI failed: %d\n", ret);
+   return -EADDRNOTAVAIL;
+   }
+
+   memset(, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
+   ret = i40e_aq_add_macvlan(>back->hw, vsi->seid, , 1, NULL);
+   aq_err = vsi->back->hw.aq.asq_last_status;
+   if (aq_err != I40E_AQ_RC_OK) {
+   dev_info(>back->pdev->dev,
+"add filter failed err %s aq_err %s\n",
+i40e_stat_str(>back->hw, ret),
+i40e_aq_str(>back->hw, aq_err));
+   }
+   return ret;
+}
+
+/**
  * i40e_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @type: VSI type
@@ -9341,6 +9388,9 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
type,
ret = i40e_config_netdev(vsi);
if (ret)
goto err_netdev;
+   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr);
+   if (ret)
+   goto err_netdev;
ret = register_netdev(vsi->netdev);
if (ret)
goto err_netdev;
@@ -10163,6 +10213,35 @@ static void i40e_print_features(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_get_platform_mac_addr - get platform-specific MAC address
+ *
+ * @pdev: PCI device information struct
+ * @mac_addr: the MAC address to be returned
+ *
+ * Look up the MAC address in Open Firmware  on systems that support it,
+ * and use IDPROM on SPARC if no OF address is found.
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_get_platform_mac_addr(struct pci_dev *pdev, u8 *mac_addr)
+{
+   struct device_node *dp = pci_device_to_OF_node(pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#else
+   return -EINVAL;
+#endif /* CONFIG_SPARC */
+}
+
+/**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in i40e_pci_tbl
@@ -10360,7 +10439,9 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
i40e_aq_stop_lldp(hw, true, NULL);
}
 
-   i40e_get_mac_addr(hw, hw->mac.addr);
+   err = i40e_get_platform_mac_addr(pdev, hw-&g

Re: [PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan
On (11/04/15 21:59), Andy Shevchenko wrote:
> 
> Usually the structure of kernel doc is something like following
> 
> /**
>  * func - summary
>  * @paramx: desc
>  *
>  * Description:
>  * Long description in many lines and / or paragraphs
>  *
>  * Returns:
>  * 0 on success or errno otherwise.
>  */
> 
> 
> > + **/
> 
> No need two stars.

I was actually following the exact comment style of 
the function just before i40e_macaddr_init, namely:;

/**
 * i40e_vsi_setup - Set up a VSI by a given type
 * @pf: board private structure
 * @type: VSI type
 * @uplink_seid: the switch element to link to
 * @param1: usage depends upon VSI type. For VF types, indicates VF id
 *
 * This allocates the sw VSI structure and its queue resources, then add a VSI
 * to the identified VEB.
 *
 * Returns pointer to the successfully allocated and configure VSI sw struct on
 * success, otherwise returns NULL on failure.
 **/
struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 type,
u16 uplink_seid, u32 param1)

So I'm not sure we need to really bike-shed this one?
> > +   macaddr, NULL);
> > +   if (ret) {
> > +   dev_info(>back->pdev->dev,
> > +"Addr change for VSI failed: %d\n", ret);
> 
> dev_err() or dev_warn() I would say.

again, this was a cut/paste of code from i40e_set_mac()
which does netdev_info.

> > +   ret = i40e_aq_add_macvlan(>back->hw, vsi->seid, , 1, 
> > NULL);
> > +   aq_err = vsi->back->hw.aq.asq_last_status;
> 
> Do you really need a separate variable (aq_err)?

That seems to be the convention used elsewhere, where ret is
distinguished from aq_err, see i40e_sync_vsi_filters()

> > +   if (aq_err != I40E_AQ_RC_OK) {
> > +   dev_info(>back->pdev->dev,
> > +"add filter failed err %s aq_err %s\n",
> > +i40e_stat_str(>back->hw, ret),
> > +i40e_aq_str(>back->hw, aq_err));
> > +   }
> > +   return ret;

> Same about kernel doc.
See earlier response.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v5] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan

This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

In the case of the i40e there is an assumption that the default mac
address has already been set up as the primary mac filter on probe,
so if this filter is obtained from the Open Firmware or IDPROM, an
explicit write is needed via i40e_aq_mac_address_write() and
i40e_aq_add_macvlan() invocation.

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2, v3: Andy Shevchenko comments
v4: Shannon Nelson review: explicitly set up mac filters before register_netdev
v5: Shannon Nelson code style comments

 drivers/net/ethernet/intel/i40e/i40e_main.c |   84 ++-
 1 files changed, 83 insertions(+), 1 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index b825f97..a3883cf 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,15 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
@@ -9213,6 +9222,44 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
 }
 
 /**
+ * i40e_macaddr_init - explicitly write the mac address filters. This
+ * is needed when the macaddr has been obtained by other means than
+ * the default, e.g., from Open Firmware or IDPROM.
+ *
+ * @vsi: pointer to the vsi.
+ * @macaddr: the MAC address
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_macaddr_init(struct i40e_vsi *vsi, u8 *macaddr)
+{
+   int ret, aq_err;
+   struct i40e_aqc_add_macvlan_element_data element;
+
+   ret = i40e_aq_mac_address_write(>back->hw,
+   I40E_AQC_WRITE_TYPE_LAA_WOL,
+   macaddr, NULL);
+   if (ret) {
+   dev_info(>back->pdev->dev,
+"Addr change for VSI failed: %d\n", ret);
+   return -EADDRNOTAVAIL;
+   }
+
+   memset(, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
+   ret = i40e_aq_add_macvlan(>back->hw, vsi->seid, , 1, NULL);
+   aq_err = vsi->back->hw.aq.asq_last_status;
+   if (aq_err != I40E_AQ_RC_OK) {
+   dev_info(>back->pdev->dev,
+"add filter failed err %s aq_err %s\n",
+i40e_stat_str(>back->hw, ret),
+i40e_aq_str(>back->hw, aq_err));
+   }
+   return ret;
+}
+
+/**
  * i40e_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @type: VSI type
@@ -9341,6 +9388,9 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
type,
ret = i40e_config_netdev(vsi);
if (ret)
goto err_netdev;
+   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr);
+   if (ret)
+   goto err_netdev;
ret = register_netdev(vsi->netdev);
if (ret)
goto err_netdev;
@@ -10163,6 +10213,36 @@ static void i40e_print_features(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_get_platform_mac_addr - get mac address from Open Firmware
+ * or IDPROM if supported by the platform
+ *
+ * @pdev: PCI device information struct
+ * @mac_addr: the MAC address to be returned
+ *
+ * Look up the MAC address in Open Firmware  on systems that support it,
+ * and use IDPROM on SPARC if no OF address is found.
+ *
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_get_platform_mac_addr(struct pci_dev *pdev, u8 *mac_addr)
+{
+   struct device_node *dp = pci_device_to_OF_node(pdev);
+   const unsigned char *addr;
+
+   addr = of_get_mac_address(dp);
+   if (addr) {
+   ether_addr_copy(mac_addr, addr);
+   return 0;
+   }
+#ifdef CONFIG_SPARC
+   ether_addr_copy(mac_addr, idprom->id_ethaddr);
+   return 0;
+#else
+   return -EINVAL;
+#endif /* CONFIG_SPARC */
+}
+
+/**
  * i40e_probe - Device initialization routine
  * @pdev: PCI device information struct
  * @ent: entry in i40e_pci_tbl
@@ -10360,7 +10440,9 @@ static int i40e_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
i40e_aq_stop_lldp(hw, true, NULL);
}
 
-   i40e_get_mac_addr(hw, hw->mac.addr);
+   err = i40e_get_platform_mac_addr(pdev, hw-&g

Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-04 Thread Sowmini Varadhan
On (11/02/15 14:57), Sowmini Varadhan wrote:
> On (11/02/15 17:26), Nelson, Shannon wrote:
> > > I assume you mean .1q 
> > 
> > Yes, this is what I had in mind.
> 
> I dont think we're quite there yet, even without vlans.
> 

Ok finally got all the .1q stuff verified (took a bit longer
than it should, because of some lab/administrative hassles in
getting the vlans adjusted at intermediate switches). But
the fix as I'd sent has been tested, and it is good for .1q.

The issues I was seeing with promisc were orthogonal to my fix,
or even to my platform - I understand those are being addressed
separately.

I'm probably too late for the merge window, and I dont know if
this fix qualifies as "critical" for net, so I'll re-target v5 
for net-next, to be conservative.

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-05 Thread Sowmini Varadhan
On (11/05/15 11:29), David Miller wrote:
> 
> The intention is to let your patch go in as-is, then try and update
> ixgbe/i40e later in net-next or similar.

Sounds good, I can take care of ixgbe/i40e after that happens.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v6] i40e: Look up MAC address in Open Firmware or IDPROM

2015-11-05 Thread Sowmini Varadhan
On (11/05/15 11:05), David Miller wrote:
> From: David Miller 
> Date: Thu, 05 Nov 2015 10:31:26 -0500 (EST)
> 
> > I'll see if I can cook something up.
> 
> How does this look?

Looks good to me,

Do you want me to respin patch v7 with this? Or update ixgbe/i40e to use
this later, after this goes in?

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan
On (10/30/15 18:57), Nelson, Shannon wrote:
> > >
> > > Going along with this being the equivalent of the ixgbe patch, I'd
> > > prefer the new code to be in i40e_main.c, rather than in i40e_common.c.
> > > In the design of our drivers, the common file is essentially a device
> > > specific layer, and the OS and platform related stuff should stay in
> > > i40e_main.c.  That would also take care of one of the include file nits
> > > that Andy mentioned.
   :
> I'm not sure about any deeper wisdom, but the HW/FW model that this
> device uses is rather different from our previous devices, so our SW
> abstractions are a little different from ixgbe and igb.

Ok, in that case I can make i40e_main.c to do something like

static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
   :
   if (i40e_get_platform_mac_addr(..) != I40E_SUCCESS)
i40e_get_mac_addr(..);
   :
}

and i agree that doing this from i40e_main will simplify a lot of the
other stuff around getting the pdev etc.

[Discussion about ndo_set_mac_address..]

> In the cases in which I'm aware, the platform manufacturer normally will
> burn a different mac-address into the device's eeprom at manufacturing
> time if they want something other than what came from Intel.  I suppose a
> Device Tree oriented platform could have a different address in the DT,
> but somehow that needs to get communicated to the device driver so that
> it can tell the HW.
> 
> My question to you remains: does this ndo_set_mac_address happen from
> the stack above the driver or not?

I'm probably even less clued in to the DT arch than you in this case,
so input from the intel experts would be useful here.

But both in this case, and for the ixgbe template on which I tried
to model this, the OF/idprom probing happens from the ->probe when the
driver comes up, and ndo_set_mac_address is not involved.

I dont know if it is easier (or even feasible to do this from ->probe) 
to just call the i40e_set_mac to make sure all the state-bits in
the driver are correctly set.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan
On (10/30/15 20:06), Andy Shevchenko wrote:
> 
> > +#include "i40e.h"
> 
> Why do you need this one exactly?

I needed it to find pf->pdev below.

> > +   struct device_node *dp = pci_device_to_OF_node(pf->pdev);

Without it, you will get:
  :
 CC [M]  drivers/net/ethernet/intel/i40e/i40e_common.o
   drivers/net/ethernet/intel/i40e/i40e_common.c: In function 
?i40e_get_platform_mac_addr?:
   drivers/net/ethernet/intel/i40e/i40e_common.c:1021: error: dereferencing 
pointer to incomplete type


Unless you feel passionately about the \n nits, I'm going to pass on those. 
Thanks for reviewing.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v3 net] i40e: Look up MAC address in Open Firmware or IDPROM

2015-10-30 Thread Sowmini Varadhan
On (10/30/15 18:28), Nelson, Shannon wrote:
> 
> Going along with this being the equivalent of the ixgbe patch, I'd
> prefer the new code to be in i40e_main.c, rather than in i40e_common.c.
> In the design of our drivers, the common file is essentially a device
> specific layer, and the OS and platform related stuff should stay in
> i40e_main.c.  That would also take care of one of the include file nits
> that Andy mentioned.

ok, and that was my initial instinct as well, except that I noticed
that the existing i40e code tries interesting variations from the 
typical intel driver model by calling i40e_get_mac_addr() which lives in
(for reasons not obvious to me) i40e_common.c

So I assumed there must be some other deeper wisdom at work here.

> At the risk of flying my Device Tree ignorance for all to see, I have
> a couple questions on how this is used.
> 
> Since the mac address is specific to the individual device, how does it
> get into the device tree?  I thought the device tree was compiled ahead
> of time for the platform it is used on.  Is this a generic DT pattern
> just in case some platform actually has the mac-address?  Or does the
> device mac-address get saved into the DT on the first time through this
> path so it can be found next time?
> 
> If the Device Tree has a different mac address than the HW, when
> does the kernel tell the HW to use a different mac address?  Does the
> DT management eventually call the ndo_set_mac_address call so the HW
> knows to use a different mac address?

yes, and here I was hoping for some feedback from the intel folks as
well. Commit c762dff24c06 sets hw->mac.perm_addr. I dont know
if there is some similar i40e state that needs to be set.

Please share insights.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] xfrm/crypto: unaligned access fixes

2015-10-19 Thread Sowmini Varadhan
A two-part patchset that fixes some "unaligned access" warnings
that showed up my sparc test machines with ipsec set up.


Sowmini Varadhan (2):
  crypto/x509: Fix unaligned access in x509_get_sig_params()
  Fix unaligned access in xfrm_notify_sa() for DELSA

 crypto/asymmetric_keys/x509_public_key.c |5 +++--
 net/xfrm/xfrm_user.c |2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] crypto/x509: Fix unaligned access in x509_get_sig_params()

2015-10-19 Thread Sowmini Varadhan
x509_get_sig_params() has the same code pattern as the one in
pkcs7_verify() that is fixed by commit 62f57d05e287 ("crypto: pkcs7 - Fix
unaligned access in pkcs7_verify()") so apply a similar fix here: make
sure that desc is pointing at an algined value past the digest_size,
and take alignment values into consideration when doing kzalloc()

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 crypto/asymmetric_keys/x509_public_key.c |5 +++--
 1 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/crypto/asymmetric_keys/x509_public_key.c 
b/crypto/asymmetric_keys/x509_public_key.c
index 1970966..68c3c40 100644
--- a/crypto/asymmetric_keys/x509_public_key.c
+++ b/crypto/asymmetric_keys/x509_public_key.c
@@ -194,14 +194,15 @@ int x509_get_sig_params(struct x509_certificate *cert)
 * digest storage space.
 */
ret = -ENOMEM;
-   digest = kzalloc(digest_size + desc_size, GFP_KERNEL);
+   digest = kzalloc(ALIGN(digest_size, __alignof__(*desc)) + desc_size,
+GFP_KERNEL);
if (!digest)
goto error;
 
cert->sig.digest = digest;
cert->sig.digest_size = digest_size;
 
-   desc = digest + digest_size;
+   desc = PTR_ALIGN(digest + digest_size, __alignof__(*desc));
desc->tfm = tfm;
desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
 
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/2] xfrm: Fix unaligned access in xfrm_notify_sa() for DELSA

2015-10-19 Thread Sowmini Varadhan
On sparc, deleting established SAs (e.g., by restarting ipsec
at the peer) results in unaligned access messages via
xfrm_del_sa -> km_state_notify -> xfrm_send_state_notify().
Use an aligned pointer to xfrm_usersa_info for this case.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/xfrm/xfrm_user.c |2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index a8de9e3..158ef4a 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2659,7 +2659,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, const 
struct km_event *c)
if (attr == NULL)
goto out_free_skb;
 
-   p = nla_data(attr);
+   p = PTR_ALIGN(nla_data(attr), __alignof__(*p));
}
err = copy_to_user_state_extra(x, p, skb);
if (err)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 2/2] xfrm: Fix unaligned access in xfrm_notify_sa() for DELSA

2015-10-21 Thread Sowmini Varadhan
On (10/21/15 06:54), Sowmini Varadhan wrote:
> But __alignof__(*p) is 8 on sparc, and without the patch I get
> all types of unaligned access. So what do you suggest as the fix?

Even though the alignment is, in fact, 8 (and that comes from
struct xfrm_lifetime_cfg), if uspace is firmly attached to the 4 byte
alignment, I think we can retain that behavior and still avoid
unaligned access in the kernel with the following (admittedly ugly hack).
Can you please take a look? I tested it with 'ip x m' and a transport 
mode tunnel on my sparc.


diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 158ef4a..ca4e7f0 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2620,7 +2620,7 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x)
 static int xfrm_notify_sa(struct xfrm_state *x, const struct km_event *c)
 {
struct net *net = xs_net(x);
-   struct xfrm_usersa_info *p;
+   struct xfrm_usersa_info *p, tmp;
struct xfrm_usersa_id *id;
struct nlmsghdr *nlh;
struct sk_buff *skb;
@@ -2659,11 +2659,16 @@ static int xfrm_notify_sa(struct xfrm_state *x, const 
struct km_event *c)
if (attr == NULL)
goto out_free_skb;
 
-   p = PTR_ALIGN(nla_data(attr), __alignof__(*p));
+   p = nla_data(attr);
+   err = copy_to_user_state_extra(x, , skb);
+   if (err)
+   goto out_free_skb;
+   memcpy((u8 *)p, , sizeof(tmp));
+   } else {
+   err = copy_to_user_state_extra(x, p, skb);
+   if (err)
+   goto out_free_skb;
}
-   err = copy_to_user_state_extra(x, p, skb);
-   if (err)
-   goto out_free_skb;
 
nlmsg_end(skb, nlh);


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 2/2] xfrm: Fix unaligned access in xfrm_notify_sa() for DELSA

2015-10-21 Thread Sowmini Varadhan
On (10/21/15 08:57), Steffen Klassert wrote:
> > --- a/net/xfrm/xfrm_user.c
> > +++ b/net/xfrm/xfrm_user.c
> > @@ -2659,7 +2659,7 @@ static int xfrm_notify_sa(struct xfrm_state *x, const 
> > struct km_event *c)
> > if (attr == NULL)
> > goto out_free_skb;
> >  
> > -   p = nla_data(attr);
> > +   p = PTR_ALIGN(nla_data(attr), __alignof__(*p));
> 
> Hm, this breaks userspace notifications on 64-bit systems.
> Userspace expects this to be aligned to 4, with your patch
> it is aligned to 8 on 64-bit.
> 
> Without your patch I get the correct notification when deleting a SA:
> 

But __alignof__(*p) is 8 on sparc, and without the patch I get
all types of unaligned access. So what do you suggest as the fix?

(and openswan/pluto dont flag any errors with the patch, which is
a separate bug).

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] RDS: fix rds-ping deadlock over TCP transport

2015-10-16 Thread Sowmini Varadhan
On (10/16/15 20:26), Santosh Shilimkar wrote:
> 
> diff --git a/net/rds/send.c b/net/rds/send.c
> + if (!test_bit(RDS_LL_SEND_FULL, >c_flags))
> + queue_delayed_work(rds_wq, >c_send_w, 0);

A minor note- it would help to add some comments here explaining 
that the pong has already been added to the sendq earlier.. 
in the case of IB, if RDS_LL_SEND_FULL has been set, it takes some
head-scratching to figure out how the pong gets sent, and a  few
comments could help clarify that.

but other than that, the contents look good to me, thus

Acked-by: Sowmini Varadhan <sowmini.varad...@oracle.com>



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2 net-next] RDS: fix rds-ping deadlock over TCP transport

2015-10-17 Thread Sowmini Varadhan

On 10/16/2015 10:13 PM, Santosh Shilimkar wrote:



But because of above recursive lock hang with RDS TCP,
the send work from rds_send_pong() needs to deferred to
worker to avoid lock up. Given RDS ping is more of connectivity
test than performance critical path, its should be ok even
for transport like IB.



Acked-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] RDS-TCP: Reset tcp callbacks if re-using an outgoing socket in rds_tcp_accept_one()

2015-10-11 Thread Sowmini Varadhan

Consider the following "duelling syn" sequence between two peers A and B:
A   B
SYN1 -->
<-- SYN2
SYN2ACK  -->

Note that the SYN/ACK has already been sent out by TCP before
rds_tcp_accept_one() gets invoked as part of callbacks.

If the inet_addr(A) is numerically less than inet_addr(B),
the arbitration scheme in rds_tcp_accept_one() will prefer the
TCP connection triggered by SYN1, and will send a CLOSE for the
SYN2 (just after the SYN2ACK was sent).

Since B also follows the same arbitration scheme, it will send the SYN-ACK
for SYN1 that will set up a healthy ESTABLISHED connection on both sides.
B will also get a  CLOSE for SYN2, which should result in the cleanup
of the TCP state machine for SYN2, but it should not trigger any
stale RDS-TCP callbacks (such as ->writespace, ->state_change etc),
that would disrupt the progress of the SYN2 based RDS-TCP  connection.

Thus the arbitration scheme in rds_tcp_accept_one() should restore
rds_tcp callbacks for the winner before setting them up for the
new accept socket, and also make sure that conn->c_outgoing
is set to 0 so that we do not trigger any reconnect attempts on the
passive side of the tcp socket in the future, in conformance with
commit c82ac7e69efe ("net/rds: RDS-TCP: only initiate reconnect attempt
on outgoing TCP socket.")

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/tcp_listen.c |3 +++
 1 files changed, 3 insertions(+), 0 deletions(-)

diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
index 1d90240..0936a4a 100644
--- a/net/rds/tcp_listen.c
+++ b/net/rds/tcp_listen.c
@@ -125,6 +125,9 @@ int rds_tcp_accept_one(struct socket *sock)
new_sock = NULL;
ret = 0;
goto out;
+   } else if (rs_tcp->t_sock) {
+   rds_tcp_restore_callbacks(rs_tcp->t_sock, rs_tcp);
+   conn->c_outgoing = 0;
}
 
rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] RDS: Invoke ->laddr_check() in rds_bind() for explicitly bound transports.

2015-10-11 Thread Sowmini Varadhan

The IP address passed to rds_bind() should be vetted by the
transport's ->laddr_check() for a previously bound transport.
This needs to be done to avoid cases where, for example,
the application has asked for an IB transport,
but the IP address passed to bind is only usable on
ethernet interfaces.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/bind.c |9 -
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index bc6b93e..6192566 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -196,7 +196,14 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
goto out;
 
if (rs->rs_transport) { /* previously bound */
-   ret = 0;
+   trans = rs->rs_transport;
+   if (trans->laddr_check(sock_net(sock->sk),
+  sin->sin_addr.s_addr) != 0) {
+   ret = -ENOPROTOOPT;
+   rds_remove_bound(rs);
+   } else {
+   ret = 0;
+   }
goto out;
}
trans = rds_trans_get_preferred(sock_net(sock->sk),
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


a question about the kcm proposal

2015-10-12 Thread Sowmini Varadhan
Thinking back a bit about the kcm proposal:
 https://www.mail-archive.com/netdev@vger.kernel.org/msg78696.html
I had a question:

If the user-space  has decided to encrypt the http/2 header using tls,
the len (and other http/2 fields) is no longer in the clear for the kernel.

My understanding is that http header encryption is common practice/BCP,
since the http hdr may contain a lot of identity, session and tenancy data.
If that's true, then wouldn't this break the BPF/kcm assumptions? 

There is a different but related problem in this space- existing TLS/DTLS
libraries (openssl, gnutls etc) only know how to work with tcp
or udp sockets - they do not know anything about PF_RDS or the
newly proposed kcm socket type.

In theory, it is possible to extend these libraries to handle
RDS/kcm etc, but (as we found out with RDS and IP_PKTINFO/BINDTODEVICE),
some things become tricky because of the many-to-one dgram-over-stream
hybrid.

I've looked at  IPSEC/IKE in transport mode for RDS on the kernel tcp
socket as we discussed at Plumbers in August, and that has some costs..
would be interesting to evaluate against other options..

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v4] net: ipv6: Make address flushing on ifdown optional

2015-10-07 Thread Sowmini Varadhan
On Wed, Oct 7, 2015 at 11:17 AM, David Ahern  wrote:
> Currently, all ipv6 addresses are flushed when the interface is configured
> down, including global, static addresses:
  :
>
> Add a new sysctl to make this behavior optional. The new setting defaults to
> flush all addresses to maintain backwards compatibility. When the setting is
> reset global addresses with no expire times are not flushed:

does src addr selection also need to be modified to know if/when it can/cannot
use this static address as a source addr? Or does the TENTATIVE flag
make it Do The Right Thing per rfc 3484?

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: a question about the kcm proposal

2015-10-12 Thread Sowmini Varadhan
On (10/12/15 15:05), Tom Herbert wrote:
> > There is a different but related problem in this space- existing TLS/DTLS
> > libraries (openssl, gnutls etc) only know how to work with tcp
> > or udp sockets - they do not know anything about PF_RDS or the
> > newly proposed kcm socket type.
> >
> TLS-in-kernel would be a lower layer so it shouldn't have to know
> anything about RDS or KCM. If it makes sent KCM could be used for
> parsing TLS records themselves...

I wouldn't quite jump to that conclusion just yet though :-)

there are a lot of alternatives- you could have a uspace module
that shims between the application and kcm (even something that gets
LD_PRELOADed) and adds the right kcm header as needed. Or you
could use ipsec/ike..

tls in the kernel can be quite complex and history shows that it
can easily become hard to maintain: uspace TLS (both the protocol itself,
and the negotiated crypto) tend to move much faster than kernel changes
(at least that's what the 10+ year long solaris-kssl experiment found).

There is another aspect to this: in the DB world, for example,
I might seriously care about encrypting my payroll-database, but not
care so much about the christmas-potluck-database. Thus allowing the
uspace to select when (and what type of crypto algo) to use is a flexibiility
offered by TLS that a "kernel-TLS" would have a hard time matching.

> The design of TLS in the kernel is that it will be enabled on the TCP
> socket, so that receive and transmit path are below RDS and KCM. We
> have the transmit path for TLS-in-kernel running with good preliminary
> results, we will post that at least as RFC shortly. Receive side still
> seems to be feasible.

yes, please share.

TLS does complex things like mid-session CCS. Such things can result
in a lot of asyncrony in the kernel. Given that ipsec has already crossed 
that bridge, I, for one, would like to understand the trade-offs.

The question in my mind,  is "how does this match up with 
transport mode ipsec/ike", and if it does not, why not? The only 
difference (in theory) is whether you do encryption before, or after,
adding the transport (tcp/udp) header, so if there is a big perf gap,
we need to understand why.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC net-next 3/6] net: Introduce VRF device driver - v2

2015-07-08 Thread Sowmini Varadhan
On Mon, Jul 6, 2015 at 5:03 PM, David Ahern d...@cumulusnetworks.com wrote:
 This driver borrows heavily from IPvlan and teaming drivers.

 Routing domains (VRF-lite) are created by instantiating a device
 and enslaving all routed interfaces that participate in the domain.
 As part of the enslavement, all local routes pointing to enslaved
 devices are re-pointed to the vrf device, thus forcing outgoing
 sockets to bind to the vrf to function.

 Standard FIB rules can then bind the VRF device to tables and regular
 fib rule processing is followed.

Perhaps I misunderstand the design proposal here, but a switch's VRF
is essentially just a separate routing table, whose input and output interfaces
are exclusively bound to the VRF.

Can an application in the model above get visibiltiy into the (enslaved?)
interfaces in the vrf? Can an application use IP_PKTINFO to send a packet out of
a specific interface on a selected VRF? What about receiving
IP_PKTINFO about input interface?
What about setting ipsec policy for interfaces in the vrf?

 Routed traffic through the box, is fwded by using the VRF device as
 the IIF and following the IIF rule to a table which is mated with
 the VRF.

 Locally originated traffic is directed at the VRF device using
 SO_BINDTODEVICE or cmsg headers. This in turn drops the packet into
 the xmit function of the vrf driver, which then completes the ip lookup
 and output.

 This solution is completely orthogonal to namespaces and allow the L3
 equivalent of vlans to exist allowing the routing space to be
 partitioned.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC net-next 3/6] net: Introduce VRF device driver - v2

2015-07-09 Thread Sowmini Varadhan
On Thu, Jul 9, 2015 at 7:19 PM, David Ahern d...@cumulusnetworks.com wrote:

 On the to-do list to use cmsg to specify a VRF for outbound packets using
 non-connected sockets. I do not believe it is going to work, but need to
 look into it.

 What about setting ipsec policy for interfaces in the vrf?

From a purely parochial standpoint, how would rds sockets work in this model?
Would the tcp encaps happen before or after the the vrf driver output?
Same problem for NFS.

From a non-parochial standpoint. There are a *lot* of routing apps that 
actually
need more visibility into many details about the slave interface: e.g., OSPF,
ARP snoop, IPSLA.. the list is pretty long.

I think it's a bad idea to use a driver to represent a table lookup. Too many
hacks will become necessary.

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC net-next 3/6] net: Introduce VRF device driver - v2

2015-07-09 Thread Sowmini Varadhan
On Fri, Jul 10, 2015 at 4:39 AM, David Ahern d...@cumulusnetworks.com wrote:

 If I set the VRF context (ie., set the SO_BINDTODEVICE for all sockets) of
 any RDS, NFS or any other socket app it runs in that VRF context and works
 just fine

What if the application wants to do SO_BINDTODEVICE?
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] RDS: rds_conn_lookup() should factor in the struct net for a match

2015-09-03 Thread Sowmini Varadhan

Only return a conn if the rds_conn_net(conn) matches the struct
net passed to rds_conn_lookup().

Fixes: 467fa15356ac ("RDS-TCP: Support multiple RDS-TCP listen endpoints,
   one per netns.")

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/connection.c |9 +
 1 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index a50e652..9b2de5e 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -70,7 +70,8 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, 
__be32 faddr)
 } while (0)
 
 /* rcu read lock must be held or the connection spinlock */
-static struct rds_connection *rds_conn_lookup(struct hlist_head *head,
+static struct rds_connection *rds_conn_lookup(struct net *net,
+ struct hlist_head *head,
  __be32 laddr, __be32 faddr,
  struct rds_transport *trans)
 {
@@ -78,7 +79,7 @@ static struct rds_connection *rds_conn_lookup(struct 
hlist_head *head,
 
hlist_for_each_entry_rcu(conn, head, c_hash_node) {
if (conn->c_faddr == faddr && conn->c_laddr == laddr &&
-   conn->c_trans == trans) {
+   conn->c_trans == trans && net == rds_conn_net(conn)) {
ret = conn;
break;
}
@@ -132,7 +133,7 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
goto new_conn;
rcu_read_lock();
-   conn = rds_conn_lookup(head, laddr, faddr, trans);
+   conn = rds_conn_lookup(net, head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != _loop_transport &&
laddr == faddr && !is_outgoing) {
/* This is a looped back IB connection, and we're
@@ -239,7 +240,7 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
found = NULL;
else
-   found = rds_conn_lookup(head, laddr, faddr, trans);
+   found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 0/5] net: L2 only interfaces

2015-08-25 Thread Sowmini Varadhan
On Tue, Aug 25, 2015 at 4:52 PM, David Ahern d...@cumulusnetworks.com wrote:


 The VRF driver can check the device when the enslave request happens.


Will this work correctly if I set up a bonding interface or SVI,
and want to put the bond-master or SVI in the vrf (but subsequently
want to get, say, timestamp/other-stats from the L2 slave in the vrf?)

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 0/5] net: L2 only interfaces

2015-08-25 Thread Sowmini Varadhan
On Tue, Aug 25, 2015 at 3:50 PM, Florian Fainelli f.faine...@gmail.com wrote:
 Hi all,

 This patch series implements a L2 only interface concept which basically 
 denies
 any kind of IP address configuration on these interfaces, but still allows 
 them
 to be used as configuration end-points to keep using ethtool and friends.


This is a very interesting idea. A few questions/thoughts: will there
be any eventual restrictions on which types interfaces can be L2_ONLY?
Ideally, it should be possible to let interfaces wink in/out of L2 only
state administratively (as can be done on a typical router, after unwinding
existing config as needed)

I'm assuming something will prevent an L2-only interface from being
part of a vrf.

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IFLA_INET6_[ICMP6]STATS

2015-09-10 Thread Sowmini Varadhan
On (09/10/15 08:43), roopa wrote:
> If you decide to use a flag, there is IFLA_EXT_MASK which is used to
> specify such filters from userspace today.
> 
> /* New extended info filters for IFLA_EXT_MASK */
> #define RTEXT_FILTER_VF (1 << 0)
> #define RTEXT_FILTER_BRVLAN (1 << 1)
> #define RTEXT_FILTER_BRVLAN_COMPRESSED  (1 << 2)

I was actually going to use a NLM_F* flag, which is what
I thought Dave as suggesting (seems a bit simpler
than IFLA_EXT_MASK). 

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IFLA_INET6_[ICMP6]STATS

2015-09-10 Thread Sowmini Varadhan
On (09/10/15 10:13), David Miller wrote:
> I don't think using such a generic netlink flag works best, the
> IFLA_EXT_MASK is definitely more suitable.

Ok, though this more of a IFLA_TRUNCATE_MASK than a IFLA_EXT_MASK.


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCHv2 RFC] RTEXT_FILTER_SKIP_STATS support to avoid dumping inet/inet6 stats

2015-09-10 Thread Sowmini Varadhan

Many commonly used functions like getifaddrs() invoke RTM_GETLINK
to dump the interface information, and do not need the
the AF_INET6 statististics that are always returned by default
from rtnl_fill_ifinfo().

Computing the statistics can be an expensive operation that impacts
scaling, so it is desirable to avoid this if the information is
not needed.

This patch adds a the RTEXT_FILTER_SKIP_STATS extended info flag that
can be passed with netlink_request() to avoid statistics computation
for the ifinfo path.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: David Miller comments: pass u32 ext_filter_mask down.

 include/net/rtnetlink.h|3 ++-
 include/uapi/linux/rtnetlink.h |1 +
 net/core/rtnetlink.c   |2 +-
 net/ipv4/devinet.c |3 ++-
 net/ipv6/addrconf.c|   13 +
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 18fdb98..aff6ceb 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -122,7 +122,8 @@ struct rtnl_af_ops {
int family;
 
int (*fill_link_af)(struct sk_buff *skb,
-   const struct net_device *dev);
+   const struct net_device *dev,
+   u32 ext_filter_mask);
size_t  (*get_link_af_size)(const struct net_device 
*dev);
 
int (*validate_link_af)(const struct net_device 
*dev,
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7020247..434227f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -666,6 +666,7 @@ struct tcamsg {
 #define RTEXT_FILTER_VF(1 << 0)
 #define RTEXT_FILTER_BRVLAN(1 << 1)
 #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2)
+#defineRTEXT_FILTER_SKIP_STATS (1 << 3)
 
 /* End of information exported to user level */
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a466821..e545229 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1272,7 +1272,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
if (!(af = nla_nest_start(skb, af_ops->family)))
goto nla_put_failure;
 
-   err = af_ops->fill_link_af(skb, dev);
+   err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
 
/*
 * Caller may return ENODATA to indicate that there
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2d9cb17..7350084 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1654,7 +1654,8 @@ static size_t inet_get_link_af_size(const struct 
net_device *dev)
return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
 }
 
-static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+u32 ext_filter_mask)
 {
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 99c0f2b..9acbb09 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4760,7 +4760,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev 
*idev, int attrtype,
}
 }
 
-static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
+ u32 ext_filter_mask)
 {
struct nlattr *nla;
struct ifla_cacheinfo ci;
@@ -4780,6 +4781,9 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, 
struct inet6_dev *idev)
 
/* XXX - MC not implemented */
 
+   if (!!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS))
+   return 0;
+
nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
if (!nla)
goto nla_put_failure;
@@ -4815,14 +4819,15 @@ static size_t inet6_get_link_af_size(const struct 
net_device *dev)
return inet6_ifla6_size();
 }
 
-static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device 
*dev)
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device 
*dev,
+ u32 ext_filter_mask)
 {
struct inet6_dev *idev = __in6_dev_get(dev);
 
if (!idev)
return -ENODATA;
 
-   if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+   if (inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0)
return -EMSGSIZE;
 
return 0;
@@ -4977,7 +4982,7 @@ static int inet6_fill_ifinfo(struct sk_buff *skb, struct 
inet6_dev *idev,

[PATCH v3 net-next] rtnetlink: RTEXT_FILTER_SKIP_STATS support to avoid dumping inet/inet6 stats

2015-09-11 Thread Sowmini Varadhan

Many commonly used functions like getifaddrs() invoke RTM_GETLINK
to dump the interface information, and do not need the
the AF_INET6 statististics that are always returned by default
from rtnl_fill_ifinfo().

Computing the statistics can be an expensive operation that impacts
scaling, so it is desirable to avoid this if the information is
not needed.

This patch adds a the RTEXT_FILTER_SKIP_STATS extended info flag that
can be passed with netlink_request() to avoid statistics computation
for the ifinfo path.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: David Miller comments: pass u32 ext_filter_mask down.
v3: non-RFC version of v2.

 include/net/rtnetlink.h|3 ++-
 include/uapi/linux/rtnetlink.h |1 +
 net/core/rtnetlink.c   |2 +-
 net/ipv4/devinet.c |3 ++-
 net/ipv6/addrconf.c|   13 +
 5 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 18fdb98..aff6ceb 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -122,7 +122,8 @@ struct rtnl_af_ops {
int family;
 
int (*fill_link_af)(struct sk_buff *skb,
-   const struct net_device *dev);
+   const struct net_device *dev,
+   u32 ext_filter_mask);
size_t  (*get_link_af_size)(const struct net_device 
*dev);
 
int (*validate_link_af)(const struct net_device 
*dev,
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7020247..434227f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -666,6 +666,7 @@ struct tcamsg {
 #define RTEXT_FILTER_VF(1 << 0)
 #define RTEXT_FILTER_BRVLAN(1 << 1)
 #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2)
+#defineRTEXT_FILTER_SKIP_STATS (1 << 3)
 
 /* End of information exported to user level */
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a466821..e545229 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1272,7 +1272,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
if (!(af = nla_nest_start(skb, af_ops->family)))
goto nla_put_failure;
 
-   err = af_ops->fill_link_af(skb, dev);
+   err = af_ops->fill_link_af(skb, dev, ext_filter_mask);
 
/*
 * Caller may return ENODATA to indicate that there
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2d9cb17..7350084 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1654,7 +1654,8 @@ static size_t inet_get_link_af_size(const struct 
net_device *dev)
return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
 }
 
-static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+u32 ext_filter_mask)
 {
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 99c0f2b..9acbb09 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4760,7 +4760,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev 
*idev, int attrtype,
}
 }
 
-static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
+ u32 ext_filter_mask)
 {
struct nlattr *nla;
struct ifla_cacheinfo ci;
@@ -4780,6 +4781,9 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, 
struct inet6_dev *idev)
 
/* XXX - MC not implemented */
 
+   if (!!(ext_filter_mask & RTEXT_FILTER_SKIP_STATS))
+   return 0;
+
nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
if (!nla)
goto nla_put_failure;
@@ -4815,14 +4819,15 @@ static size_t inet6_get_link_af_size(const struct 
net_device *dev)
return inet6_ifla6_size();
 }
 
-static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device 
*dev)
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device 
*dev,
+ u32 ext_filter_mask)
 {
struct inet6_dev *idev = __in6_dev_get(dev);
 
if (!idev)
return -ENODATA;
 
-   if (inet6_fill_ifla6_attrs(skb, idev) < 0)
+   if (inet6_fill_ifla6_attrs(skb, idev, ext_filter_mask) < 0)
return -EMSGSIZE;
 
return 0;
@@ -4977,7 +4982,7 @@ static int inet6_fill_ifinfo(struct sk_buff *

Re: [PATCHv2 RFC] RTEXT_FILTER_SKIP_STATS support to avoid dumping inet/inet6 stats

2015-09-11 Thread Sowmini Varadhan
On (09/12/15 00:22), Raghavendra K T wrote:
> 
> Sowmini, Thanks for the patch which is more cleaner way without
> breaking current behaviour.
> 
> [ Though RTEXT_FILTER_NEED_STATS flag with reverse effect  would have
> helped immediately :)]

Agree, but existing legacy usage will not set this flag, so I had 
few choices here.

> /me waits for the RTEXT_FILTER_SKIP_STATS to be supported in
> gccgo/golang, so that it can be used in docker newNetlinkRequest() to
> exploit this.

yes, I'm working on lining up the glibc bit as well (thus the cc to Jose..)
I'll send out the non-rfc version in a bit..

thanks for the feedback!

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RFC] RTEXT_FILTER_SKIP_STATS support to avoid dumping inet/inet6 stats

2015-09-10 Thread Sowmini Varadhan

Many commonly used functions like getifaddrs() invoke RTM_GETLINK
to dump the interface information, and do not need the
the AF_INET6 statististics that are always returned by default
from rtnl_fill_ifinfo().

Computing the statistics can be an expensive operation that impacts
scaling, so it is desirable to avoid this if the information is
not needed.

This patch adds a the RTEXT_FILTER_SKIP_STATS extended info flag that
can be passed with netlink_request() to avoid statistics comuputation
for the ifinfo path.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 include/net/rtnetlink.h|3 ++-
 include/uapi/linux/rtnetlink.h |1 +
 net/core/rtnetlink.c   |3 ++-
 net/ipv4/devinet.c |3 ++-
 net/ipv6/addrconf.c|   13 +
 5 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 18fdb98..2219c83 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -122,7 +122,8 @@ struct rtnl_af_ops {
int family;
 
int (*fill_link_af)(struct sk_buff *skb,
-   const struct net_device *dev);
+   const struct net_device *dev,
+   bool skip_af_stats);
size_t  (*get_link_af_size)(const struct net_device 
*dev);
 
int (*validate_link_af)(const struct net_device 
*dev,
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 7020247..434227f 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -666,6 +666,7 @@ struct tcamsg {
 #define RTEXT_FILTER_VF(1 << 0)
 #define RTEXT_FILTER_BRVLAN(1 << 1)
 #define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2)
+#defineRTEXT_FILTER_SKIP_STATS (1 << 3)
 
 /* End of information exported to user level */
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index a466821..958e299 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1054,6 +1054,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
struct nlattr *attr, *af_spec;
struct rtnl_af_ops *af_ops;
struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
+   bool skip_af_stats = ((ext_filter_mask & RTEXT_FILTER_SKIP_STATS) != 0);
 
ASSERT_RTNL();
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
@@ -1272,7 +1273,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
if (!(af = nla_nest_start(skb, af_ops->family)))
goto nla_put_failure;
 
-   err = af_ops->fill_link_af(skb, dev);
+   err = af_ops->fill_link_af(skb, dev, skip_af_stats);
 
/*
 * Caller may return ENODATA to indicate that there
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 2d9cb17..b1ef81f 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1654,7 +1654,8 @@ static size_t inet_get_link_af_size(const struct 
net_device *dev)
return nla_total_size(IPV4_DEVCONF_MAX * 4); /* IFLA_INET_CONF */
 }
 
-static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev)
+static int inet_fill_link_af(struct sk_buff *skb, const struct net_device *dev,
+bool skip_af_stats)
 {
struct in_device *in_dev = rcu_dereference_rtnl(dev->ip_ptr);
struct nlattr *nla;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index 99c0f2b..928f32b 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -4760,7 +4760,8 @@ static void snmp6_fill_stats(u64 *stats, struct inet6_dev 
*idev, int attrtype,
}
 }
 
-static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev)
+static int inet6_fill_ifla6_attrs(struct sk_buff *skb, struct inet6_dev *idev,
+ bool skip_af_stats)
 {
struct nlattr *nla;
struct ifla_cacheinfo ci;
@@ -4780,6 +4781,9 @@ static int inet6_fill_ifla6_attrs(struct sk_buff *skb, 
struct inet6_dev *idev)
 
/* XXX - MC not implemented */
 
+   if (skip_af_stats)
+   return 0;
+
nla = nla_reserve(skb, IFLA_INET6_STATS, IPSTATS_MIB_MAX * sizeof(u64));
if (!nla)
goto nla_put_failure;
@@ -4815,14 +4819,15 @@ static size_t inet6_get_link_af_size(const struct 
net_device *dev)
return inet6_ifla6_size();
 }
 
-static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device 
*dev)
+static int inet6_fill_link_af(struct sk_buff *skb, const struct net_device 
*dev,
+ bool skip_af_stats)
 {
struct inet6_dev *idev = __in6_dev_get(dev);
 

[PATCH v2 net-next 2/3] RDS-TCP: Do not bloat sndbuf/rcvbuf in rds_tcp_tune

2015-09-30 Thread Sowmini Varadhan
Using the value of RDS_TCP_DEFAULT_BUFSIZE (128K)
clobbers efficient use of TSO because it inflates the size_goal
that is computed in tcp_sendmsg/tcp_sendpage and skews packet
latency, and the default values for these parameters actually
results in significantly better performance.

In request-response tests using rds-stress with a packet size of
100K with 16 threads (test parameters -q 10 -a 256 -t16 -d16)
between a single pair of IP addresses achieves a throughput of
6-8 Gbps. Without this patch, throughput maxes at 2-3 Gbps under
equivalent conditions on these platforms.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/tcp.c |   16 
 1 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index c42b60b..9d6ddba 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -67,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock)
set_fs(oldfs);
 }
 
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation. In general these
+ * customizations should be tunable via module_param()
+ */
 void rds_tcp_tune(struct socket *sock)
 {
-   struct sock *sk = sock->sk;
-
rds_tcp_nonagle(sock);
-
-   /*
-* We're trying to saturate gigabit with the default,
-* see svc_sock_setbufsize().
-*/
-   lock_sock(sk);
-   sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
-   sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
-   sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
-   release_sock(sk);
 }
 
 u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 0/3] RDS: RDS-TCP perf enhancements

2015-09-30 Thread Sowmini Varadhan
A 3-part patchset that (a) improves current RDS-TCP perf
by 2X-3X and (b) refactors earlier robustness code for
better observability/scaling.

Patch 1 is an enhancment of earlier robustness fixes 
that had used separate sockets for client and server endpoints to
resolve race conditions. It is possible to have an equivalent
solution that does not use 2 sockets. The benefit of a
single socket solution is that it results in more predictable
and observable behavior for the underlying TCP pipe of an 
RDS connection

Patches 2 and 3 are simple, straightforward perf bug fixes
that align the RDS TCP socket with other parts of the kernel stack.

v2: fix kbuild-test-robot warnings, comments from  Sergei Shtylov
and Santosh Shilimkar. 

Sowmini Varadhan (3):
  Use a single TCP socket for both send and receive.
  Do not bloat sndbuf/rcvbuf in rds_tcp_tune
  Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in
rds_tcp_xmit

 net/rds/connection.c |   22 ++
 net/rds/rds.h|4 +++-
 net/rds/tcp.c|   16 
 net/rds/tcp_listen.c |   22 +-
 net/rds/tcp_send.c   |8 +++-
 5 files changed, 29 insertions(+), 43 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 1/3] RDS: Use a single TCP socket for both send and receive.

2015-09-30 Thread Sowmini Varadhan
Commit f711a6ae062c ("net/rds: RDS-TCP: Always create a new rds_sock
for an incoming connection.") modified rds-tcp so that an incoming SYN
would ignore an existing "client" TCP connection which had the local
port set to the transient port.  The motivation for ignoring the existing
"client" connection in f711a6ae was to avoid race conditions and an
endless duel of reconnect attempts triggered by a restart/abort of one
of the nodes in the TCP connection.

However, having separate sockets for active and passive sides
is avoidable, and the simpler model of a single TCP socket for
both send and receives of all RDS connections associated with
that tcp socket makes for easier observability. We avoid the race
conditions from f711a6ae by attempting reconnects in rds_conn_shutdown
if, and only if, the (new) c_outgoing bit is set for RDS_TRANS_TCP.
The c_outgoing bit is initialized in __rds_conn_create().

A side-effect of re-using the client rds_connection for an incoming
SYN is the potential of encountering duelling SYNs, i.e., we
have an outgoing RDS_CONN_CONNECTING socket when we get the incoming
SYN. The logic to arbitrate this criss-crossing SYN exchange in
rds_tcp_accept_one() has been modified to emulate the BGP state
machine: the smaller IP address should back off from the connection attempt.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: kbuild-test-robot warning around __be32, modify subject line per 
Santosh Shilimkar

 net/rds/connection.c |   22 ++
 net/rds/rds.h|4 +++-
 net/rds/tcp_listen.c |   22 +-
 3 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 49adeef..d456403 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -128,10 +128,7 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
-   struct rds_transport *otrans = trans;
 
-   if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
-   goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(net, head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != _loop_transport &&
@@ -147,7 +144,6 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
if (conn)
goto out;
 
-new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
@@ -207,6 +203,7 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
 
atomic_set(>c_state, RDS_CONN_DOWN);
conn->c_send_gen = 0;
+   conn->c_outgoing = (is_outgoing ? 1 : 0);
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(>c_send_w, rds_send_worker);
INIT_DELAYED_WORK(>c_recv_w, rds_recv_worker);
@@ -243,22 +240,13 @@ static struct rds_connection *__rds_conn_create(struct 
net *net,
/* Creating normal conn */
struct rds_connection *found;
 
-   if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
-   found = NULL;
-   else
-   found = rds_conn_lookup(net, head, laddr, faddr, trans);
+   found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
-   if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
-   (otrans->t_type != RDS_TRANS_TCP)) {
-   /* Only the active side should be added to
-* reconnect list for TCP.
-*/
-   hlist_add_head_rcu(>c_hash_node, head);
-   }
+   hlist_add_head_rcu(>c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
}
@@ -337,7 +325,9 @@ void rds_conn_shutdown(struct rds_connection *conn)
rcu_read_lock();
if (!hlist_unhashed(>c_hash_node)) {
rcu_read_unlock();
-   rds_queue_reconnect(conn);
+   if (conn->c_trans->t_type != RDS_TRANS_TCP ||
+   conn->c_outgoing == 1)
+   rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index afb4048..b4c7ac0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -86,7 +86,9 @@ struct rds_connection {
struct hlist_node   c_hash_node;
__be32  

[PATCH v2 net-next 3/3] RDS-TCP: Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in rds_tcp_xmit

2015-09-30 Thread Sowmini Varadhan
For the same reasons as commit 2f5338442425 ("tcp: allow splice() to
build full TSO packets") and commit 35f9c09fe9c7 ("tcp: tcp_sendpages()
should call tcp_push() once"), rds_tcp_xmit may have multiple pages to
send, so use the MSG_MORE and MSG_SENDPAGE_NOTLAST as hints to
tcp_sendpage()

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2: Sergei Shtylov, Santosh Shilimkar comments (some parens retained for
readability)

 net/rds/tcp_send.c |8 +++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 53b17ca..2894e60 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct 
rds_message *rm,
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
int ret = 0;
+   int more;
 
if (hdr_off == 0) {
/*
@@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct 
rds_message *rm,
goto out;
}
 
+   more = rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0;
while (sg < rm->data.op_nents) {
+   int flags = MSG_DONTWAIT | MSG_NOSIGNAL | more;
+
ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(>data.op_sg[sg]),
rm->data.op_sg[sg].offset + off,
rm->data.op_sg[sg].length - off,
-   MSG_DONTWAIT|MSG_NOSIGNAL);
+   flags);
rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void 
*)sg_page(>data.op_sg[sg]),
 rm->data.op_sg[sg].offset + off, 
rm->data.op_sg[sg].length - off,
 ret);
@@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct 
rds_message *rm,
off = 0;
sg++;
}
+   if (sg == rm->data.op_nents - 1)
+   more = 0;
}
 
 out:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/3] RDS-TCP: Do not bloat sndbuf/rcvbuf in rds_tcp_tune

2015-09-30 Thread Sowmini Varadhan
Using the value of RDS_TCP_DEFAULT_BUFSIZE (128K)
clobbers efficient use of TSO because it inflates the size_goal
that is computed in tcp_sendmsg/tcp_sendpage and skews packet
latency, and the default values for these parameters actually
results in significantly better performance.

In request-response tests using rds-stress with a packet size of
100K with 16 threads (test parameters -q 10 -a 256 -t16 -d16)
between a single pair of IP addresses achieves a throughput of
6-8 Gbps. Without this patch, throughput maxes at 2-3 Gbps under
equivalent conditions on these platforms.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/tcp.c |   16 
 1 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index c42b60b..9d6ddba 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -67,21 +67,13 @@ void rds_tcp_nonagle(struct socket *sock)
set_fs(oldfs);
 }
 
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation. In general these
+ * customizations should be tunable via module_param()
+ */
 void rds_tcp_tune(struct socket *sock)
 {
-   struct sock *sk = sock->sk;
-
rds_tcp_nonagle(sock);
-
-   /*
-* We're trying to saturate gigabit with the default,
-* see svc_sock_setbufsize().
-*/
-   lock_sock(sk);
-   sk->sk_sndbuf = RDS_TCP_DEFAULT_BUFSIZE;
-   sk->sk_rcvbuf = RDS_TCP_DEFAULT_BUFSIZE;
-   sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK;
-   release_sock(sk);
 }
 
 u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 0/3] RDS: RDS-TCP perf enhancements

2015-09-30 Thread Sowmini Varadhan
A 3-part patchset that (a) improves current RDS-TCP perf
by 2X-3X and (b) refactors earlier robustness code for
better observability/scaling.

Patch 1 is an enhancment of earlier robustness fixes 
that had used separate sockets for client and server endpoints to
resolve race conditions. It is possible to have an equivalent
solution that does not use 2 sockets. The benefit of a
single socket solution is that it results in more predictable
and observable behavior for the underlying TCP pipe of an 
RDS connection

Patches 2 and 3 are simple, straightforward perf bug fixes
that align the RDS TCP socket with other parts of the kernel stack.

Sowmini Varadhan (3):
  Use a single TCP socket for both send and receive.
  Do not bloat sndbuf/rcvbuf in rds_tcp_tune
  Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in
rds_tcp_xmit

 net/rds/connection.c |   22 ++
 net/rds/rds.h|4 +++-
 net/rds/tcp.c|   16 
 net/rds/tcp_listen.c |   19 +++
 net/rds/tcp_send.c   |8 +++-
 5 files changed, 27 insertions(+), 42 deletions(-)

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 3/3] RDS-TCP: Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in rds_tcp_xmit

2015-09-30 Thread Sowmini Varadhan
For the same reasons as 2f53384424 and 35f9c09fe9, rds_tcp_xmit
may have multiple pages to send, so use the MSG_MORE and
MSG_SENDPAGE_NOTLAST as hints to tcp_sendpage()

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/tcp_send.c |8 +++-
 1 files changed, 7 insertions(+), 1 deletions(-)

diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c
index 53b17ca..5f3e3fa 100644
--- a/net/rds/tcp_send.c
+++ b/net/rds/tcp_send.c
@@ -83,6 +83,7 @@ int rds_tcp_xmit(struct rds_connection *conn, struct 
rds_message *rm,
struct rds_tcp_connection *tc = conn->c_transport_data;
int done = 0;
int ret = 0;
+   int more;
 
if (hdr_off == 0) {
/*
@@ -116,12 +117,15 @@ int rds_tcp_xmit(struct rds_connection *conn, struct 
rds_message *rm,
goto out;
}
 
+   more = (rm->data.op_nents > 1 ? (MSG_MORE | MSG_SENDPAGE_NOTLAST) : 0);
while (sg < rm->data.op_nents) {
+   int flags = (MSG_DONTWAIT | MSG_NOSIGNAL | more);
+
ret = tc->t_sock->ops->sendpage(tc->t_sock,
sg_page(>data.op_sg[sg]),
rm->data.op_sg[sg].offset + off,
rm->data.op_sg[sg].length - off,
-   MSG_DONTWAIT|MSG_NOSIGNAL);
+   flags);
rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void 
*)sg_page(>data.op_sg[sg]),
 rm->data.op_sg[sg].offset + off, 
rm->data.op_sg[sg].length - off,
 ret);
@@ -134,6 +138,8 @@ int rds_tcp_xmit(struct rds_connection *conn, struct 
rds_message *rm,
off = 0;
sg++;
}
+   if (sg == rm->data.op_nents - 1)
+   more = 0;
}
 
 out:
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/3] net/rds: Use a single TCP socket for both send and receive.

2015-09-30 Thread Sowmini Varadhan
Commit f711a6ae062c ("net/rds: RDS-TCP: Always create a new rds_sock
for an incoming connection.") modified rds-tcp so that an incoming SYN
would ignore an existing "client" TCP connection which had the local
port set to the transient port.  The motivation for ignoring the existing
"client" connection in f711a6ae was to avoid race conditions and an
endless duel of reconnect attempts triggered by a restart/abort of one
of the nodes in the TCP connection.

However, having separate sockets for active and passive sides
is avoidable, and the simpler model of a single TCP socket for
both send and receives of all RDS connections associated with
that tcp socket makes for easier observability. We avoid the race
conditions from f711a6ae by attempting reconnects in rds_conn_shutdown
if, and only if, the (new) c_outgoing bit is set for RDS_TRANS_TCP.
The c_outgoing bit is initialized in __rds_conn_create().

A side-effect of re-using the client rds_connection for an incoming
SYN is the potential of encountering duelling SYNs, i.e., we
have an outgoing RDS_CONN_CONNECTING socket when we get the incoming
SYN. The logic to arbitrate this criss-crossing SYN exchange in
rds_tcp_accept_one() has been modified to emulate the BGP state
machine: the smaller IP address should back off from the connection attempt.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 net/rds/connection.c |   22 ++
 net/rds/rds.h|4 +++-
 net/rds/tcp_listen.c |   19 +++
 3 files changed, 16 insertions(+), 29 deletions(-)

diff --git a/net/rds/connection.c b/net/rds/connection.c
index 49adeef..d456403 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -128,10 +128,7 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
struct rds_transport *loop_trans;
unsigned long flags;
int ret;
-   struct rds_transport *otrans = trans;
 
-   if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
-   goto new_conn;
rcu_read_lock();
conn = rds_conn_lookup(net, head, laddr, faddr, trans);
if (conn && conn->c_loopback && conn->c_trans != _loop_transport &&
@@ -147,7 +144,6 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
if (conn)
goto out;
 
-new_conn:
conn = kmem_cache_zalloc(rds_conn_slab, gfp);
if (!conn) {
conn = ERR_PTR(-ENOMEM);
@@ -207,6 +203,7 @@ static struct rds_connection *__rds_conn_create(struct net 
*net,
 
atomic_set(>c_state, RDS_CONN_DOWN);
conn->c_send_gen = 0;
+   conn->c_outgoing = (is_outgoing ? 1 : 0);
conn->c_reconnect_jiffies = 0;
INIT_DELAYED_WORK(>c_send_w, rds_send_worker);
INIT_DELAYED_WORK(>c_recv_w, rds_recv_worker);
@@ -243,22 +240,13 @@ static struct rds_connection *__rds_conn_create(struct 
net *net,
/* Creating normal conn */
struct rds_connection *found;
 
-   if (!is_outgoing && otrans->t_type == RDS_TRANS_TCP)
-   found = NULL;
-   else
-   found = rds_conn_lookup(net, head, laddr, faddr, trans);
+   found = rds_conn_lookup(net, head, laddr, faddr, trans);
if (found) {
trans->conn_free(conn->c_transport_data);
kmem_cache_free(rds_conn_slab, conn);
conn = found;
} else {
-   if ((is_outgoing && otrans->t_type == RDS_TRANS_TCP) ||
-   (otrans->t_type != RDS_TRANS_TCP)) {
-   /* Only the active side should be added to
-* reconnect list for TCP.
-*/
-   hlist_add_head_rcu(>c_hash_node, head);
-   }
+   hlist_add_head_rcu(>c_hash_node, head);
rds_cong_add_conn(conn);
rds_conn_count++;
}
@@ -337,7 +325,9 @@ void rds_conn_shutdown(struct rds_connection *conn)
rcu_read_lock();
if (!hlist_unhashed(>c_hash_node)) {
rcu_read_unlock();
-   rds_queue_reconnect(conn);
+   if (conn->c_trans->t_type != RDS_TRANS_TCP ||
+   conn->c_outgoing == 1)
+   rds_queue_reconnect(conn);
} else {
rcu_read_unlock();
}
diff --git a/net/rds/rds.h b/net/rds/rds.h
index afb4048..b4c7ac0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -86,7 +86,9 @@ struct rds_connection {
struct hlist_node   c_hash_node;
__be32  c_laddr;
__be32  c_faddr;
-   unsigned intc_loopback:1;
+   un

Re: [PATCH net-next 3/3] RDS-TCP: Set up MSG_MORE and MSG_SENDPAGE_NOTLAST as appropriate in rds_tcp_xmit

2015-09-30 Thread Sowmini Varadhan
On (09/30/15 08:56), santosh shilimkar wrote:
> Your checkpatch.pl should have complained about commit
> reference in the change-log. You might want to fix that
> for consistency.

It didnt. But ok, I'll fix this nit as well.

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/3] net/rds: Use a single TCP socket for both send and receive.

2015-09-30 Thread Sowmini Varadhan
On (09/30/15 08:50), santosh shilimkar wrote:
> > rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
> >-WARN_ON(!rs_tcp || rs_tcp->t_sock);
> >+if (rs_tcp->t_sock && inet->inet_saddr < inet->inet_daddr) {
> >+struct sock *nsk = new_sock->sk;
> >
> Any reason you dropped the WARN_ON. Note that till we got commit
> 74e98eb0 (" RDS: verify the underlying transport exists before creating
> a connection") merged, we had an issue. That guards it now.

That was done deliberately. Now that we have only one tcp socket,
we can run into an rds_tcp_connection for an outgoing connection
that we initiated, thus rs_tcp->t_sock can be non-null - which is
why a new check is added in the newly added line in the patch.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/3] net/rds: Use a single TCP socket for both send and receive.

2015-09-30 Thread Sowmini Varadhan
On (09/30/15 08:50), santosh shilimkar wrote:
> minor nit though not a strict rule. Just to be consistent based on
> what we are following.
> 
> - core RDS patches "RDS:"
> - RDS IB patches "RDS: IB:" or "RDS/IB:"
> - RDS IW patches "RDS: IW:" or
> - RDS TCP can use "RDS: TCP" or "RDS/TCP:"

Ok, but in this case patch 1/3 the changes affect both core and rds-tcp
modules. 

Working on patchv2 that will address Sergei's comments and the
kbuild-test-robot warning as well

> 
> $subject
> s/net/rds:/RDS:
> 
> On 9/30/2015 6:45 AM, Sowmini Varadhan wrote:
> >Commit f711a6ae062c ("net/rds: RDS-TCP: Always create a new rds_sock
> >for an incoming connection.") modified rds-tcp so that an incoming SYN
> >would ignore an existing "client" TCP connection which had the local
> >port set to the transient port.  The motivation for ignoring the existing
> >"client" connection in f711a6ae was to avoid race conditions and an
> >endless duel of reconnect attempts triggered by a restart/abort of one
> >of the nodes in the TCP connection.
> >
> >However, having separate sockets for active and passive sides
> >is avoidable, and the simpler model of a single TCP socket for
> >both send and receives of all RDS connections associated with
> >that tcp socket makes for easier observability. We avoid the race
> >conditions from f711a6ae by attempting reconnects in rds_conn_shutdown
> >if, and only if, the (new) c_outgoing bit is set for RDS_TRANS_TCP.
> >The c_outgoing bit is initialized in __rds_conn_create().
> >
> >A side-effect of re-using the client rds_connection for an incoming
> >SYN is the potential of encountering duelling SYNs, i.e., we
> >have an outgoing RDS_CONN_CONNECTING socket when we get the incoming
> >SYN. The logic to arbitrate this criss-crossing SYN exchange in
> >rds_tcp_accept_one() has been modified to emulate the BGP state
> >machine: the smaller IP address should back off from the connection attempt.
> >
> >Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
> >---
> >  net/rds/connection.c |   22 ++
> >  net/rds/rds.h|4 +++-
> >  net/rds/tcp_listen.c |   19 +++
> >  3 files changed, 16 insertions(+), 29 deletions(-)
> >
> 
> [...]
> 
> >diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c
> >index 444d78d..ee70d13 100644
> >--- a/net/rds/tcp_listen.c
> >+++ b/net/rds/tcp_listen.c
> >@@ -110,28 +110,23 @@ int rds_tcp_accept_one(struct socket *sock)
> > goto out;
> > }
> > /* An incoming SYN request came in, and TCP just accepted it.
> >- * We always create a new conn for listen side of TCP, and do not
> >- * add it to the c_hash_list.
> >  *
> >  * If the client reboots, this conn will need to be cleaned up.
> >  * rds_tcp_state_change() will do that cleanup
> >  */
> > rs_tcp = (struct rds_tcp_connection *)conn->c_transport_data;
> >-WARN_ON(!rs_tcp || rs_tcp->t_sock);
> >+if (rs_tcp->t_sock && inet->inet_saddr < inet->inet_daddr) {
> >+struct sock *nsk = new_sock->sk;
> >
> Any reason you dropped the WARN_ON. Note that till we got commit
> 74e98eb0 (" RDS: verify the underlying transport exists before creating
> a connection") merged, we had an issue. That guards it now.
> 
> Am curious about WARN_ON() and hence the question.
> 
> Rest of the patch looks fine to me.
> Acked-by: Santosh Shilimkar <santosh.shilim...@oracle.com>
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


IFLA_INET6_[ICMP6]STATS

2015-09-09 Thread Sowmini Varadhan

I'm doing some experiments that are trying to simultaneously
scaling the number of CPUs, and the number of processes 
and encountering getifaddrs() weaknesses.

Others have run into similar things in the past,  e.g., 
  http://lists.openwall.net/netdev/2014/01/23/119
and more relevant to my experiment: the findings behind 
the recent commit a3a77372.

In my case, it looks like getifaddrs() doesnt even use the
results of IFLA_INET6_STATS or IFLA_INET6_ICMP6STATS- 
from my scan of glibc, this information is ignored (it only 
looks at IFLA_STATS).  Moreover, if I hack out  all of 
snmp_fold_field() (so that it always returns 0),  it helps 
my cpu utilization and scaling, and no errors are reported.

So the question is- who uses IFLA_INET6_STATS/IFLA_INET6_ICMP6STATS?
Is this intended for some ND/ripngd etc daemon?  Doesnt seem to 
be documented in rtnetlink(7), and couldnt find any users in glibc,
and google did not find any usage. 

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: IFLA_INET6_[ICMP6]STATS

2015-09-09 Thread Sowmini Varadhan
On (09/09/15 14:43), David Miller wrote:
> 
> But what we could do is add a flag in the netlink request which
> elides the stats.  GLIBC et al. could then start setting the flag.
> 

Yes, interestingly that's what I was experimenting with myself 
(though I was using a setsockopt in my version). I'll send out 
a patch rfc later this week for this.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 0/3] kcm: Kernel Connection Multiplexor (KCM)

2015-09-21 Thread Sowmini Varadhan
On (09/21/15 15:36), Tom Herbert wrote:
> segments. What we need to do, which you're probably doing for RDS, is
> do message delineation on the stream as a sequence of:
> 
> 1) Read protocol header to determine message length (BPF used here)

right, that's what rds does- first reads the sizeof(rds_header),
and from that, figures out payload len, to stitch each rds dgram 
together from intermediate tcp segments..

> 2) Read data up to the length of the message
> 3) Deliver message
> 4) Goto #1 (i.e. process next message in the stream).

Thanks for the rest of the responses.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] sunvnet:Invoke SET_NETDEV_DEV() to set up the vdev in vnet_new()

2015-09-18 Thread Sowmini Varadhan
`ls /sys/devices/channel-devices/vnet-port-0-0/net' is missing without
this change, and applications like NetworkManager are looking in
sysfs for the information.

Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
 drivers/net/ethernet/sun/sunvnet.c |   17 +++--
 1 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/sun/sunvnet.c 
b/drivers/net/ethernet/sun/sunvnet.c
index 53fe200..cc106d8 100644
--- a/drivers/net/ethernet/sun/sunvnet.c
+++ b/drivers/net/ethernet/sun/sunvnet.c
@@ -1756,7 +1756,8 @@ static const struct net_device_ops vnet_ops = {
 #endif
 };
 
-static struct vnet *vnet_new(const u64 *local_mac)
+static struct vnet *vnet_new(const u64 *local_mac,
+struct vio_dev *vdev)
 {
struct net_device *dev;
struct vnet *vp;
@@ -1790,6 +1791,8 @@ static struct vnet *vnet_new(const u64 *local_mac)
   NETIF_F_HW_CSUM | NETIF_F_SG;
dev->features = dev->hw_features;
 
+   SET_NETDEV_DEV(dev, >dev);
+
err = register_netdev(dev);
if (err) {
pr_err("Cannot register net device, aborting\n");
@@ -1808,7 +1811,8 @@ static struct vnet *vnet_new(const u64 *local_mac)
return ERR_PTR(err);
 }
 
-static struct vnet *vnet_find_or_create(const u64 *local_mac)
+static struct vnet *vnet_find_or_create(const u64 *local_mac,
+   struct vio_dev *vdev)
 {
struct vnet *iter, *vp;
 
@@ -1821,7 +1825,7 @@ static struct vnet *vnet_find_or_create(const u64 
*local_mac)
}
}
if (!vp)
-   vp = vnet_new(local_mac);
+   vp = vnet_new(local_mac, vdev);
mutex_unlock(_list_mutex);
 
return vp;
@@ -1848,7 +1852,8 @@ static void vnet_cleanup(void)
 static const char *local_mac_prop = "local-mac-address";
 
 static struct vnet *vnet_find_parent(struct mdesc_handle *hp,
-   u64 port_node)
+u64 port_node,
+struct vio_dev *vdev)
 {
const u64 *local_mac = NULL;
u64 a;
@@ -1869,7 +1874,7 @@ static struct vnet *vnet_find_parent(struct mdesc_handle 
*hp,
if (!local_mac)
return ERR_PTR(-ENODEV);
 
-   return vnet_find_or_create(local_mac);
+   return vnet_find_or_create(local_mac, vdev);
 }
 
 static struct ldc_channel_config vnet_ldc_cfg = {
@@ -1923,7 +1928,7 @@ static int vnet_port_probe(struct vio_dev *vdev, const 
struct vio_device_id *id)
 
hp = mdesc_grab();
 
-   vp = vnet_find_parent(hp, vdev->mp);
+   vp = vnet_find_parent(hp, vdev->mp, vdev);
if (IS_ERR(vp)) {
pr_err("Cannot find port parent vnet\n");
err = PTR_ERR(vp);
-- 
1.7.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 0/3] kcm: Kernel Connection Multiplexor (KCM)

2015-09-21 Thread Sowmini Varadhan
On (09/20/15 15:29), Tom Herbert wrote:
> 
> Kernel Connection Multiplexor (KCM) is a facility that provides a
> message based interface over TCP for generic application protocols.
> The motivation for this is based on the observation that although
> TCP is byte stream transport protocol with no concept of message
> boundaries, a common use case is to implement a framed application
> layer protocol running over TCP. To date, most TCP stacks offer
> byte stream API for applications, which places the burden of message
> delineation, message I/O operation atomicity, and load balancing
> in the application. With KCM an application can efficiently send
> and receive application protocol messages over TCP using a
> datagram interface.

A lot of this design is very similar to the PF_RDS/RDS-TCP
design. There too, we have a PF_RDS dgram socket (that already 
supports SEQPACKET semantics today) that can be tunneled over TCP.

The biggest design difference that I see in your proposal is 
that you are using BPF so presumably the demux has more flexibility
than RDS, which does the demux based on RDS port numbers?

Would it make sense to build your solution on top of RDS,
rather than re-invent solutions for many of the challenges
that one encounters when building a dgram-over-stream hybrid
socket (see "lessons learned" list below)?

Some things that were not clear to me from the patch-set:

The doc statses that we re-assemble packets the "stated length" -
but how will the receiver know the "stated length"? 
(fwiw, RDS figures that out from the header len in RDS,
and elsewhere I think you allude to some similar encaps
header - is that a correct understanding?)

not clear from the diagram: Is there one TCP socket per kcm-socket? 
what is the relation (one-one, many-one etc.)  between a kcm-socket and
a psock?  How does the ksock-psock-tcp-sock association get set up? 

the notes say one can "accept()" over a kcm socket- but "accept()"
is itself a connection-oriented concept- one does not accept() on
a dgram socket. So what exactly does this mean, and why not just
use the well-defined TCP socket semantics at that point (with something
like XDR for message boundary marking)?

In the "fwiw" bucket of lessons learned from RDS..  please ignore if
you were already aware of these- 

In the case of RDS, since multiple rds/dgram sockets share a single TCP
socket, some issues that have to be dealt with are

- congestion/starvation: we dont want tcp to start advertising
  zero-window because one dgram socket pair has flooded the pipe
  and the peer is not reading. So the RDS protocol has port-congestion
  RDS control plane messages that track congestion at the RDS port.

- imposes some constraints on the TCP send side- if sock1 and sock2
  are sharing a tcp socket, and both are sending dgrams over the 
  stream, dgrams from sock1 may get interleaved  (see comments above
  rds_send_xmit() for a note on how rds deals witt this). There are ways
  to fan this out over multiple tcp sockets (and I'm working on those,
  to improve the scaling), but just a note that there is some complexity
  to be dealt with here. Not sure if this was considered in the "KCM
  sockets" section in patch2..

- in general the "dgram-over-stream" hybrid has some peculiar issues. E.g.,
  dgram APIs like BINDTODEVICE and IP_PKTINFO cannot be applied
  to the underlying stream. In the typical use case for RDS (database 
  clusters) there's a reasonable workaround for this using network
  namespaces to define bundles of outgoing interfaces, but that solution
  may not always be workable for other use-cases. Thus it might actually
  be more obvious to simply use tcp sockets (and use something like XDR
  for message boundary markers on the stream).

--Sowmini
 


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC 0/3] kcm: Kernel Connection Multiplexor (KCM)

2015-09-21 Thread Sowmini Varadhan
On (09/21/15 10:33), Tom Herbert wrote:
> >
> > Some things that were not clear to me from the patch-set:
> >
> > The doc statses that we re-assemble packets the "stated length" -
> > but how will the receiver know the "stated length"?
> 
> BPF program returns the length of the next message. In my testing so
> far I've been using HTTP/2 which defines a frame format with first 3
> bytes being header length field . The BPF program (using LLVM/Clang--
> thanks Alexei!) is just:

Maybe I dont see something about the mux/demux here (I have to 
take a closer look at reserve_psock/unreserve_psock), but 
will every tcp segment have a 3 byte length in the payload?

Not every TCP segment in the RDS-TCP case will have a RDS header,
thus the comments before rds_send_xmit(), thus applying the bpf filter
to a TCP segment holding some "from-the-middle" piece of the RDS dgram
may not be possible 

> > the notes say one can "accept()" over a kcm socket- but "accept()"
> > is itself a connection-oriented concept- one does not accept() on
:
> The accept method is overloaded on KCM sockets to do the socket
> cloning operation. This is unrelated to TCP semantics, connection
> management is performed on TCP sockets (i.e. before being attached to
> a KCM multiplexor).

If possible,it might be better to use some other
glibc-func/name/syscall/sockopt/whatever
for this, rather than overloading accept().. feels like that would
keep the semantics cleaner, and probably less likely to trip 
up on accept code in the kernel..

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ipsec impact on performance

2015-12-08 Thread Sowmini Varadhan
On (12/08/15 12:32), Steffen Klassert wrote:
> 
> Would be nice if you could share the results. Comments are

Sure, not a problem. Give me some time though, I'm also looking
into the skb_cow_data and other memory-management issues that
were flagged on this thread. 

I'll have all this info by netdev, at the latest.

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ipsec impact on performance

2015-12-02 Thread Sowmini Varadhan
On (12/02/15 12:41), David Laight wrote:
> You are getting 0.7 Gbps with ass-ccm-a-128, scale the esp-null back to
> that and it would use 7/18*71 = 27% of the cpu.
> So 69% of the cpu in the a-128 case is probably caused by the
> encryption itself.
> Even if the rest of the code cost nothing you'd not increase
> above 1Gbps.

Fortunately, the situation is not quite hopeless yet.

Thanks to Rick Jones for supplying the hints for this, but with
some careful manual pinning of irqs and iperf processes to cpus,
I can get to 4.5 Gbps for the esp-null case.

Given that the [clear traffic + GSO without GRO] gets me about 5-7 Gbps,
the 4.5 Gbps is not that far off (and at that point, the nickel-and-dime
tweaks may help even more).

For AES-GCM, I'm able to go from 1.8 Gbps (no GSO) to 2.8 Gbps.
Still not great, but proves that we haven't yet hit any upper bounds
yet.

I think a lot of the manual tweaking of irq/process placement
is needed because the existing rps/rfs flow steering is looking
for TCP/UDP flow numbers to do the steering. It can just as easily
use the IPsec SPI numbers to do this, and that's another place where
we can make this more ipsec-friendly.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC] Stable interface index option

2015-12-01 Thread Sowmini Varadhan
On (12/01/15 15:57), David Miller wrote:

> >> > Also current versions of SNMP provide more useful information about
> >> > network interface slot information in ifDescription
> >> 
> >> Well if they do provide strings, then that is probably a better way
> >> forward than messing with the kernel.
> > 
> > It gives strings based on PCI information but nothing useful
> > on tunnels.
> 
> But at least in theory, that could be extended to do so right?

iirc even for the cisco NOS-es, the snmp ifindex for virtual interfaces
(tunnels, vpc, loopback) etc would not have any slot etc info, but
would have other things (specific to the virtual interface type, e.g.,
FEX interface index had something that was pertinent to fex)

But the bigger reason they had a immutable snmp-ifindex was that
the uspace networking applications could build state based on that
immutable index and hang on to that number, regardless of any renumbering
that happened due to HA/failover.

And, since they did not (in general) have to deal with random third
party apps, they did not have to deal with questions like "what should
POSIX/glibc APIs send - the immutable or the mutable index?" so it
was ok for them to have the complexity of two interface indices.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ipsec impact on performance

2015-12-02 Thread Sowmini Varadhan
On (12/02/15 13:07), Tom Herbert wrote:
> That's easy enough to add to flow dissector, but is SPI really
> intended to be used an L4 entropy value? We would need to consider the

yes. To quote https://en.wikipedia.org/wiki/Security_Parameter_Index
"This works like port numbers in TCP and UDP connections. What it means
 is that there could be different SAs used to provide security to one
 connection. An SA could therefore act as a set of rules."

> effects of running multiple TCP connections over an IPsec. Also, you
> might want to try IPv6, the flow label should provide a good L4 hash
> for RPS/RFS, it would be interesting to see what the effects are with
> IPsec processing. (ESP/UDP could also if RSS/ECMP is critical)

IPv6 would be an interesting academic exercise, but it's going
to be a while before we get RDS-TCP to go over IPv6.

--Sowmini

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ipsec impact on performance

2015-12-02 Thread Sowmini Varadhan
On (12/02/15 14:01), Tom Herbert wrote:
> No, please don't persist is this myopic "we'll get to IPv6 later"
> model! IPv6 is a real protocol, it has significant deployment of the
> Internet, and there are now whole data centers that are IPv6 only
> (e.g. FB), and there are plenty of use cases of IPSEC/IPv6 that could
> benefit for performance improvements just as much IPv4. This vendor
> mentality that IPv6 is still not important simply doesn't help
> matters. :-(

Ok, I'll get you the numbers for this later, and sure, if we do
this, we should solve the ipv6 problem too.

BTW, the ipv6 nov3 paths have severe alignment issues. I flagged
this a long time ago http://www.spinics.net/lists/netdev/msg336257.html

I think all of it is triggered by mld. Someone needs to do
something about that too. I dont think those paths are using 
NET_ALIGN very well, and I dont think this is the most wholesome
thing for perf.

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ipsec impact on performance

2015-12-02 Thread Sowmini Varadhan
On (12/02/15 13:44), Tom Herbert wrote:
> > IPv6 would be an interesting academic exercise, but it's going
> > to be a while before we get RDS-TCP to go over IPv6.
> >
> Huh? Who said anything about RDS-TCP? I thought you were trying to
> improve IPsec performance...

yes, and it would be nice to find out that IPsec for IPv6 is
fast, but I'm afraid there are a lot of IPv4 use cases out there that
need the same thing for IPv4 too (first?).

--Sowmini


--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: ipsec impact on performance

2015-12-03 Thread Sowmini Varadhan
On (12/03/15 09:45), Steffen Klassert wrote:
> pcrypt(echainiv(authenc(hmac(sha1-ssse3),cbc-aes-aesni)))
> 
> Result:
> 
> iperf -c 10.0.0.12 -t 60
> 
> Client connecting to 10.0.0.12, TCP port 5001
> TCP window size: 45.0 KByte (default)
> 
> [  3] local 192.168.0.12 port 39380 connected with 10.0.0.12 port 5001
> [ ID] Interval   Transfer Bandwidth
> [  3]  0.0-60.0 sec  32.8 GBytes  4.70 Gbits/sec
> 
> I provide more informatios as soon as the code is available.

that's pretty good compared to the baseline. 
I'd like to try out our patches, when they are ready.

I think you may get some more improvement if you manually pin the irq 
and iperf to specific cpus (at least that was my observation for transp 
mode) 

--Sowmini
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH RESEND v7] i40e: Look up MAC address in Open Firmware or IDPROM

2015-12-04 Thread Sowmini Varadhan

[Apologies for fat-fingering subject in the other attempt]

This is the i40e equivalent of commit c762dff24c06 ("ixgbe: Look up MAC
address in Open Firmware or IDPROM").

As with that fix, attempt to look up the MAC address in Open Firmware
on systems that support it, and use IDPROM on SPARC if no OF address
is found.

In the case of the i40e there is an assumption that the default mac
address has already been set up as the primary mac filter on probe,
so if this filter is obtained from the Open Firmware or IDPROM, an
explicit write is needed via i40e_aq_mac_address_write() and
i40e_aq_add_macvlan() invocation.

The is_default_mac field in the platform-private i40e_pf structure
tracks whether the mac address was default or not, and in the latter
case, will trigger the calls to i40e_aq_mac_address_write() and
i40e_aq_add_macvlan().

Reviewed-by: Martin K. Petersen <martin.peter...@oracle.com>
Signed-off-by: Sowmini Varadhan <sowmini.varad...@oracle.com>
---
v2, v3: Andy Shevchenko comments
v4: Shannon Nelson review: explicitly set up mac filters before register_netdev
v5: Shannon Nelson code style comments
v6: Shannon Nelson code style comments
v7: Ensure that i40e_macaddr_init() is called only for VSI_MAIN, and only
if the mac address is not the default. Some additional code-refactoring
based on comments from Shannon Nelson

 drivers/net/ethernet/intel/i40e/i40e.h  |2 +
 drivers/net/ethernet/intel/i40e/i40e_main.c |   87 +++
 2 files changed, 89 insertions(+), 0 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 0b9537b..6d41757 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -420,6 +420,8 @@ struct i40e_pf {
 
u32 ioremap_len;
u32 fd_inv;
+
+   u32 is_default_mac:1;
 };
 
 struct i40e_mac_filter {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 9e6268b..40a5d53 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -24,6 +24,15 @@
  *
  
**/
 
+#include 
+#include 
+#include 
+
+#ifdef CONFIG_SPARC
+#include 
+#include 
+#endif
+
 /* Local includes */
 #include "i40e.h"
 #include "i40e_diag.h"
@@ -9387,6 +9396,44 @@ static struct i40e_vsi *i40e_vsi_reinit_setup(struct 
i40e_vsi *vsi)
 }
 
 /**
+ * i40e_macaddr_init - explicitly write the mac address filters.
+ *
+ * @vsi: pointer to the vsi.
+ * @macaddr: the MAC address
+ *
+ * This is needed when the macaddr has been obtained by other
+ * means than the default, e.g., from Open Firmware or IDPROM.
+ * Returns 0 on success, negative on failure
+ **/
+static int i40e_macaddr_init(struct i40e_vsi *vsi, u8 *macaddr)
+{
+   int ret, aq_err;
+   struct i40e_aqc_add_macvlan_element_data element;
+
+   ret = i40e_aq_mac_address_write(>back->hw,
+   I40E_AQC_WRITE_TYPE_LAA_WOL,
+   macaddr, NULL);
+   if (ret) {
+   dev_info(>back->pdev->dev,
+"Addr change for VSI failed: %d\n", ret);
+   return -EADDRNOTAVAIL;
+   }
+
+   memset(, 0, sizeof(element));
+   ether_addr_copy(element.mac_addr, macaddr);
+   element.flags = cpu_to_le16(I40E_AQC_MACVLAN_ADD_PERFECT_MATCH);
+   ret = i40e_aq_add_macvlan(>back->hw, vsi->seid, , 1, NULL);
+   aq_err = vsi->back->hw.aq.asq_last_status;
+   if (aq_err != I40E_AQ_RC_OK) {
+   dev_info(>back->pdev->dev,
+"add filter failed err %s aq_err %s\n",
+i40e_stat_str(>back->hw, ret),
+i40e_aq_str(>back->hw, aq_err));
+   }
+   return ret;
+}
+
+/**
  * i40e_vsi_setup - Set up a VSI by a given type
  * @pf: board private structure
  * @type: VSI type
@@ -9510,6 +9557,14 @@ struct i40e_vsi *i40e_vsi_setup(struct i40e_pf *pf, u8 
type,
switch (vsi->type) {
/* setup the netdev if needed */
case I40E_VSI_MAIN:
+   /* Apply relevant filters if a platform-specific mac
+* address was selected.
+*/
+   if (!pf->is_default_mac) {
+   ret = i40e_macaddr_init(vsi, pf->hw.mac.addr);
+   if (ret)
+   goto err_netdev;
+   }
case I40E_VSI_VMDQ2:
case I40E_VSI_FCOE:
ret = i40e_config_netdev(vsi);
@@ -10340,6 +10395,36 @@ static void i40e_print_features(struct i40e_pf *pf)
 }
 
 /**
+ * i40e_get_platform_mac_addr - get platform-specific MAC address
+ *
+ * @pdev: PCI device information struct
+ * @pf: board private structure

  1   2   3   4   5   6   7   >