Re: [PATCH net-next v6 1/4] net: vhost: lock the vqs one by one

2018-07-25 Thread Tonghao Zhang
On Sun, Jul 22, 2018 at 11:26 PM Michael S. Tsirkin  wrote:
>
> On Sat, Jul 21, 2018 at 11:03:59AM -0700, xiangxia.m@gmail.com wrote:
> > From: Tonghao Zhang 
> >
> > This patch changes the way we lock all vqs at the
> > same time, locking them one by one instead. It will
> > be used by the next patch to avoid a deadlock.
> >
> > Signed-off-by: Tonghao Zhang 
> > Acked-by: Jason Wang 
> > Signed-off-by: Jason Wang 
> > ---
> >  drivers/vhost/vhost.c | 24 +++-
> >  1 file changed, 7 insertions(+), 17 deletions(-)
> >
> > diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> > index a502f1a..a1c06e7 100644
> > --- a/drivers/vhost/vhost.c
> > +++ b/drivers/vhost/vhost.c
> > @@ -294,8 +294,11 @@ static void vhost_vq_meta_reset(struct vhost_dev *d)
> >  {
> >   int i;
> >
> > - for (i = 0; i < d->nvqs; ++i)
> > + for (i = 0; i < d->nvqs; ++i) {
> > + mutex_lock(&d->vqs[i]->mutex);
> >   __vhost_vq_meta_reset(d->vqs[i]);
> > + mutex_unlock(&d->vqs[i]->mutex);
> > + }
> >  }
> >
> >  static void vhost_vq_reset(struct vhost_dev *dev,
> > @@ -890,20 +893,6 @@ static inline void __user *__vhost_get_user(struct vhost_virtqueue *vq,
> >  #define vhost_get_used(vq, x, ptr) \
> >   vhost_get_user(vq, x, ptr, VHOST_ADDR_USED)
> >
> > -static void vhost_dev_lock_vqs(struct vhost_dev *d)
> > -{
> > - int i = 0;
> > - for (i = 0; i < d->nvqs; ++i)
> > - mutex_lock_nested(&d->vqs[i]->mutex, i);
> > -}
> > -
> > -static void vhost_dev_unlock_vqs(struct vhost_dev *d)
> > -{
> > - int i = 0;
> > - for (i = 0; i < d->nvqs; ++i)
> > - mutex_unlock(&d->vqs[i]->mutex);
> > -}
> > -
> >  static int vhost_new_umem_range(struct vhost_umem *umem,
> >   u64 start, u64 size, u64 end,
> >   u64 userspace_addr, int perm)
> > @@ -953,7 +942,10 @@ static void vhost_iotlb_notify_vq(struct vhost_dev *d,
> >   if (msg->iova <= vq_msg->iova &&
> >   msg->iova + msg->size - 1 > vq_msg->iova &&
> >   vq_msg->type == VHOST_IOTLB_MISS) {
> > + mutex_lock(&node->vq->mutex);
> >   vhost_poll_queue(&node->vq->poll);
> > + mutex_unlock(&node->vq->mutex);
> > +
> >   list_del(&node->node);
> >   kfree(node);
> >   }
> > @@ -985,7 +977,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> >   int ret = 0;
> >
> >   mutex_lock(&dev->mutex);
> > - vhost_dev_lock_vqs(dev);
> >   switch (msg->type) {
> >   case VHOST_IOTLB_UPDATE:
> >   if (!dev->iotlb) {
> > @@ -1019,7 +1010,6 @@ static int vhost_process_iotlb_msg(struct vhost_dev *dev,
> >   break;
> >   }
> >
> > - vhost_dev_unlock_vqs(dev);
> >   mutex_unlock(&dev->mutex);
> >
> >   return ret;
>
> I do prefer the finer-grained locking but I remember we
> discussed something like this in the past and Jason saw issues
> with such a locking.
This change was suggested by Jason. Should I send a new version,
since patch 3 has changed?

> Jason?
>
> > --
> > 1.8.3.1


Re: [PATCH net-next v3 3/4] net: vhost: factor out busy polling logic to vhost_net_busy_poll()

2018-07-01 Thread Tonghao Zhang
On Mon, Jul 2, 2018 at 10:29 AM Jason Wang  wrote:
>
>
>
> On 2018-06-30 14:33, xiangxia.m@gmail.com wrote:
> > From: Tonghao Zhang 
> >
> > Factor out the generic busy polling logic; it will be
> > used for the tx path in the next patch. With this patch,
> > qemu can set busyloop_timeout differently for the rx queue.
> >
> > Signed-off-by: Tonghao Zhang 
> > ---
> >   drivers/vhost/net.c | 92 ++---
> >   1 file changed, 53 insertions(+), 39 deletions(-)
> >
> > diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> > index 62bb8e8..458f81d 100644
> > --- a/drivers/vhost/net.c
> > +++ b/drivers/vhost/net.c
> > @@ -429,6 +429,50 @@ static int vhost_net_enable_vq(struct vhost_net *n,
> >   return vhost_poll_start(poll, sock->file);
> >   }
> >
> > +static int sk_has_rx_data(struct sock *sk)
> > +{
> > + struct socket *sock = sk->sk_socket;
> > +
> > + if (sock->ops->peek_len)
> > + return sock->ops->peek_len(sock);
> > +
> > + return skb_queue_empty(&sk->sk_receive_queue);
> > +}
> > +
> > +static void vhost_net_busy_poll(struct vhost_net *net,
> > + struct vhost_virtqueue *rvq,
> > + struct vhost_virtqueue *tvq,
> > + bool rx)
> > +{
> > + unsigned long uninitialized_var(endtime);
> > + struct socket *sock = rvq->private_data;
> > + struct vhost_virtqueue *vq = rx ? tvq : rvq;
> > + unsigned long busyloop_timeout = rx ? rvq->busyloop_timeout :
> > +   tvq->busyloop_timeout;
>
> As simple as vq->busyloop_timeout?
Maybe we should allow the user to set busyloop_timeout differently
for rx and tx. Also, this code should be moved under the mutex.

> > +
> > + mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX: VHOST_NET_VQ_RX);
>
> We need to move sock = rvq->private_data under the protection of
> the vq mutex if rx is false.
Yes, thanks for your review.
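
A sketch of the suggested reordering (illustrative only, not the final
patch): take the vq mutex first, then sample private_data and the
timeout, so a tx-side caller cannot race with an rx queue reset.

	mutex_lock_nested(&vq->mutex, rx ? VHOST_NET_VQ_TX : VHOST_NET_VQ_RX);
	sock = rvq->private_data;
	busyloop_timeout = rx ? rvq->busyloop_timeout : tvq->busyloop_timeout;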

> > + vhost_disable_notify(&net->dev, vq);
> > +
> > + preempt_disable();
> > + endtime = busy_clock() + busyloop_timeout;
> > + while (vhost_can_busy_poll(tvq->dev, endtime) &&
> > +!(sock && sk_has_rx_data(sock->sk)) &&
> > +vhost_vq_avail_empty(tvq->dev, tvq))
> > + cpu_relax();
> > + preempt_enable();
> > +
> > + if ((rx && !vhost_vq_avail_empty(&net->dev, vq)) ||
> > +     (!rx && (sock && sk_has_rx_data(sock->sk)))) {
> > + vhost_poll_queue(&vq->poll);
> > + } else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
> > + vhost_disable_notify(&net->dev, vq);
> > + vhost_poll_queue(&vq->poll);
> > + }
> > +
> > + mutex_unlock(&vq->mutex);
> > +}
> > +
> > +
> >   static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >   struct vhost_virtqueue *vq,
> >   struct iovec iov[], unsigned int iov_size,
> > @@ -621,16 +665,6 @@ static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
> >   return len;
> >   }
> >
> > -static int sk_has_rx_data(struct sock *sk)
> > -{
> > - struct socket *sock = sk->sk_socket;
> > -
> > - if (sock->ops->peek_len)
> > - return sock->ops->peek_len(sock);
> > -
> > - return skb_queue_empty(&sk->sk_receive_queue);
> > -}
> > -
> >   static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >   {
> >   struct vhost_virtqueue *vq = &nvq->vq;
> > @@ -645,39 +679,19 @@ static void vhost_rx_signal_used(struct vhost_net_virtqueue *nvq)
> >
> >   static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk)
> >   {
> > - struct vhost_net_virtqueue *rvq = &net->vqs[VHOST_NET_VQ_RX];
> > - struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> > - struct vhost_virtqueue *vq = &nvq->vq;
> > - unsigned long uninitialized_var(endtime);
> > - int len = peek_head_len(rvq, sk);
> > + struct vhost_net_virtqueue *nvq_rx = &net->vqs[VHOST_NET_VQ_RX];
> > + struct vhost_net_virtqueue *nvq_tx = &net->vqs[VHOST_NET_VQ_TX];
>
> It looks to me that rnvq and tnvq are slightly better.
Yes. Patch 4 will also be updated.

> Otherwise it looks good to me.
>
> Thanks
>

[PATCH] net: virtio: simplify the virtnet_find_vqs

2018-05-31 Thread Tonghao Zhang
Use the common free functions when returning successfully.
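
For reference, a sketch of the resulting control flow (the label chain
mirrors the existing error path in virtnet_find_vqs; treat it as
illustrative): on success, ret is 0 and control simply falls through
to the same kfree()s that the error path already uses.

	/* success: ret == 0, fall through and share the cleanup */
err_find:
	kfree(ctx);
err_ctx:
	kfree(names);
err_names:
	kfree(callbacks);
err_callback:
	kfree(vqs);
err_vq:
	return ret;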

Signed-off-by: Tonghao Zhang 
---
 drivers/net/virtio_net.c | 6 +-
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 8f08a3e..0eee6d6 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2609,12 +2609,8 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
vi->sq[i].vq = vqs[txq2vq(i)];
}
 
-   kfree(names);
-   kfree(callbacks);
-   kfree(vqs);
-   kfree(ctx);
+   /* run here: ret == 0. */
 
-   return 0;
 
 err_find:
kfree(ctx);
-- 
1.8.3.1



[PATCH] net: core: improve the tx_hash calculating

2018-05-31 Thread Tonghao Zhang
Use % instead of a while loop; it simplifies the code and
improves the calculation. real_num_tx_queues has already been
checked when it is allocated and set.
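
The equivalence being relied on (illustrative; the helper names below
are made up): for qcount != 0, both forms reduce a recorded rx queue
index into [0, qcount), but the modulo does it with a single division
instead of a data-dependent loop.

	static u16 reduce_while(u16 hash, u16 qcount)
	{
		while (hash >= qcount)
			hash -= qcount;	/* O(hash / qcount) iterations */
		return hash;
	}

	static u16 reduce_mod(u16 hash, u16 qcount)
	{
		return hash % qcount;	/* one division, bounded time */
	}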

Signed-off-by: Tonghao Zhang 
---
 net/core/dev.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 1844d9b..edc5b75 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2617,15 +2617,13 @@ void netif_device_attach(struct net_device *dev)
  */
 static u16 skb_tx_hash(const struct net_device *dev, struct sk_buff *skb)
 {
-   u32 hash;
u16 qoffset = 0;
u16 qcount = dev->real_num_tx_queues;
 
if (skb_rx_queue_recorded(skb)) {
-   hash = skb_get_rx_queue(skb);
-   while (unlikely(hash >= qcount))
-   hash -= qcount;
-   return hash;
+   /* When setting the real_num_tx_queues, we make sure
+* real_num_tx_queues != 0. */
+   return skb_get_rx_queue(skb) % qcount;
}
 
if (dev->num_tc) {
-- 
1.8.3.1



[PATCH] bonding: introduce link change helper

2018-05-16 Thread Tonghao Zhang
Introduce a new common helper to avoid redundancy.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/bonding/bond_main.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e491..3063a9c 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -2135,6 +2135,24 @@ static int bond_miimon_inspect(struct bonding *bond)
return commit;
 }
 
+static void bond_miimon_link_change(struct bonding *bond,
+   struct slave *slave,
+   char link)
+{
+   switch (BOND_MODE(bond)) {
+   case BOND_MODE_8023AD:
+   bond_3ad_handle_link_change(slave, link);
+   break;
+   case BOND_MODE_TLB:
+   case BOND_MODE_ALB:
+   bond_alb_handle_link_change(bond, slave, link);
+   break;
+   case BOND_MODE_XOR:
+   bond_update_slave_arr(bond, NULL);
+   break;
+   }
+}
+
 static void bond_miimon_commit(struct bonding *bond)
 {
struct list_head *iter;
@@ -2176,16 +2194,7 @@ static void bond_miimon_commit(struct bonding *bond)
slave->speed == SPEED_UNKNOWN ? 0 : slave->speed,
slave->duplex ? "full" : "half");
 
-   /* notify ad that the link status has changed */
-   if (BOND_MODE(bond) == BOND_MODE_8023AD)
-   bond_3ad_handle_link_change(slave, BOND_LINK_UP);
-
-   if (bond_is_lb(bond))
-   bond_alb_handle_link_change(bond, slave,
-   BOND_LINK_UP);
-
-   if (BOND_MODE(bond) == BOND_MODE_XOR)
-   bond_update_slave_arr(bond, NULL);
+   bond_miimon_link_change(bond, slave, BOND_LINK_UP);
 
if (!bond->curr_active_slave || slave == primary)
goto do_failover;
@@ -2207,16 +2216,7 @@ static void bond_miimon_commit(struct bonding *bond)
netdev_info(bond->dev, "link status definitely down for interface %s, disabling it\n",
slave->dev->name);
 
-   if (BOND_MODE(bond) == BOND_MODE_8023AD)
-   bond_3ad_handle_link_change(slave,
-   BOND_LINK_DOWN);
-
-   if (bond_is_lb(bond))
-   bond_alb_handle_link_change(bond, slave,
-   BOND_LINK_DOWN);
-
-   if (BOND_MODE(bond) == BOND_MODE_XOR)
-   bond_update_slave_arr(bond, NULL);
+   bond_miimon_link_change(bond, slave, BOND_LINK_DOWN);
 
if (slave == rcu_access_pointer(bond->curr_active_slave))
goto do_failover;
-- 
1.8.3.1



[PATCH 1/3] bonding: replace the return value type

2018-05-11 Thread Tonghao Zhang
The method ndo_start_xmit is defined as returning a
netdev_tx_t, which is a typedef for an enum type,
but the implementation in this driver returns an int.
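
For reference, the return type in question looks roughly like this
(paraphrased from include/linux/netdevice.h):

	enum netdev_tx {
		__NETDEV_TX_MIN  = INT_MIN,	/* make sure enum is signed */
		NETDEV_TX_OK	 = 0x00,	/* driver took care of packet */
		NETDEV_TX_BUSY	 = 0x10,	/* driver tx path was busy */
	};
	typedef enum netdev_tx netdev_tx_t;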

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/bonding/bond_alb.c  |  8 ++++----
 drivers/net/bonding/bond_main.c | 12 ++++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/net/bonding/bond_alb.c b/drivers/net/bonding/bond_alb.c
index 1ed9529..601f678 100644
--- a/drivers/net/bonding/bond_alb.c
+++ b/drivers/net/bonding/bond_alb.c
@@ -1316,8 +1316,8 @@ void bond_alb_deinitialize(struct bonding *bond)
rlb_deinitialize(bond);
 }
 
-static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
-   struct slave *tx_slave)
+static netdev_tx_t bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
+   struct slave *tx_slave)
 {
struct alb_bond_info *bond_info = &(BOND_ALB_INFO(bond));
struct ethhdr *eth_data = eth_hdr(skb);
@@ -1351,7 +1351,7 @@ static int bond_do_alb_xmit(struct sk_buff *skb, struct bonding *bond,
return NETDEV_TX_OK;
 }
 
-int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
struct bonding *bond = netdev_priv(bond_dev);
struct ethhdr *eth_data;
@@ -1389,7 +1389,7 @@ int bond_tlb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
return bond_do_alb_xmit(skb, bond, tx_slave);
 }
 
-int bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
+netdev_tx_t bond_alb_xmit(struct sk_buff *skb, struct net_device *bond_dev)
 {
struct bonding *bond = netdev_priv(bond_dev);
struct ethhdr *eth_data;
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 718e491..01bca86 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -3805,7 +3805,8 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
return slave_id;
 }
 
-static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_roundrobin(struct sk_buff *skb,
+   struct net_device *bond_dev)
 {
struct bonding *bond = netdev_priv(bond_dev);
struct iphdr *iph = ip_hdr(skb);
@@ -3841,7 +3842,8 @@ static int bond_xmit_roundrobin(struct sk_buff *skb, struct net_device *bond_dev
 /* In active-backup mode, we know that bond->curr_active_slave is always valid if
  * the bond has a usable interface.
  */
-static int bond_xmit_activebackup(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_activebackup(struct sk_buff *skb,
+ struct net_device *bond_dev)
 {
struct bonding *bond = netdev_priv(bond_dev);
struct slave *slave;
@@ -3979,7 +3981,8 @@ int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave)
  * usable slave array is formed in the control path. The xmit function
  * just calculates hash and sends the packet out.
  */
-static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t bond_3ad_xor_xmit(struct sk_buff *skb,
+struct net_device *dev)
 {
struct bonding *bond = netdev_priv(dev);
struct slave *slave;
@@ -3999,7 +4002,8 @@ static int bond_3ad_xor_xmit(struct sk_buff *skb, struct net_device *dev)
 }
 
 /* in broadcast mode, we send everything to all usable interfaces. */
-static int bond_xmit_broadcast(struct sk_buff *skb, struct net_device *bond_dev)
+static netdev_tx_t bond_xmit_broadcast(struct sk_buff *skb,
+  struct net_device *bond_dev)
 {
struct bonding *bond = netdev_priv(bond_dev);
struct slave *slave = NULL;
-- 
1.8.3.1



[PATCH 3/3] net: doc: fix spelling mistake: "modrobe.d" -> "modprobe.d"

2018-05-11 Thread Tonghao Zhang
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 Documentation/networking/bonding.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index 9ba04c0..c13214d 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -140,7 +140,7 @@ bonding module at load time, or are specified via sysfs.
 
Module options may be given as command line arguments to the
 insmod or modprobe command, but are usually specified in either the
-/etc/modrobe.d/*.conf configuration files, or in a distro-specific
+/etc/modprobe.d/*.conf configuration files, or in a distro-specific
 configuration file (some of which are detailed in the next section).
 
Details on bonding support for sysfs is provided in the
-- 
1.8.3.1



[PATCH 2/3] bonding: use the skb_get/set_queue_mapping

2018-05-11 Thread Tonghao Zhang
Use skb_get_queue_mapping, skb_set_queue_mapping
and skb_rx_queue_recorded for the skb queue_mapping in the
bonding driver, instead of accessing the field directly.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/bonding/bond_main.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 01bca86..966d091 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -247,7 +247,7 @@ void bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb,
 
BUILD_BUG_ON(sizeof(skb->queue_mapping) !=
 sizeof(qdisc_skb_cb(skb)->slave_dev_queue_mapping));
-   skb->queue_mapping = qdisc_skb_cb(skb)->slave_dev_queue_mapping;
+   skb_set_queue_mapping(skb, qdisc_skb_cb(skb)->slave_dev_queue_mapping);
 
if (unlikely(netpoll_tx_running(bond->dev)))
bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb);
@@ -4040,12 +4040,12 @@ static inline int bond_slave_override(struct bonding *bond,
struct slave *slave = NULL;
struct list_head *iter;
 
-   if (!skb->queue_mapping)
+   if (!skb_rx_queue_recorded(skb))
return 1;
 
/* Find out if any slaves have the same mapping as this skb. */
bond_for_each_slave_rcu(bond, slave, iter) {
-   if (slave->queue_id == skb->queue_mapping) {
+   if (slave->queue_id == skb_get_queue_mapping(skb)) {
if (bond_slave_is_up(slave) &&
slave->link == BOND_LINK_UP) {
bond_dev_queue_xmit(bond, skb, slave->dev);
@@ -4071,7 +4071,7 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb,
u16 txq = skb_rx_queue_recorded(skb) ? skb_get_rx_queue(skb) : 0;
 
/* Save the original txq to restore before passing to the driver */
-   qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb->queue_mapping;
+   qdisc_skb_cb(skb)->slave_dev_queue_mapping = skb_get_queue_mapping(skb);
 
if (unlikely(txq >= dev->real_num_tx_queues)) {
do {
-- 
1.8.3.1



[PATCH v2 2/2] doc: Change the udp/sctp rmem/wmem default value.

2018-03-13 Thread Tonghao Zhang
The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 Documentation/networking/ip-sysctl.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 783675a..1d11207 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -755,13 +755,13 @@ udp_rmem_min - INTEGER
Minimal size of receive buffer used by UDP sockets in moderation.
Each UDP socket is able to use the size for receiving data, even if
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-   Default: 1 page
+   Default: 4K
 
 udp_wmem_min - INTEGER
Minimal size of send buffer used by UDP sockets in moderation.
Each UDP socket is able to use the size for sending data, even if
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-   Default: 1 page
+   Default: 4K
 
 CIPSOv4 Variables:
 
@@ -2101,7 +2101,7 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
It is guaranteed to each SCTP socket (but not association) even
under moderate memory pressure.
 
-   Default: 1 page
+   Default: 4K
 
 sctp_wmem  - vector of 3 INTEGERs: min, default, max
Currently this tunable has no effect.
-- 
1.8.3.1



[PATCH v2 1/2] udp: Move the udp sysctl to namespace.

2018-03-13 Thread Tonghao Zhang
This patch moves udp_rmem_min and udp_wmem_min into the
namespace and initializes udp_l3mdev_accept explicitly.

udp_rmem_min/udp_wmem_min affect the udp rx/tx queues;
with this patch, namespaces can set them differently.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 v2: use a common helper to avoid the code duplication.
---
 include/net/netns/ipv4.h   |  3 ++
 net/ipv4/sysctl_net_ipv4.c | 32 -
 net/ipv4/udp.c | 86 +++---
 net/ipv6/udp.c | 52 ++--
 4 files changed, 96 insertions(+), 77 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 3a970e4..382bfd7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -168,6 +168,9 @@ struct netns_ipv4 {
atomic_t tfo_active_disable_times;
unsigned long tfo_active_disable_stamp;
 
+   int sysctl_udp_wmem_min;
+   int sysctl_udp_rmem_min;
+
 #ifdef CONFIG_NET_L3_MASTER_DEV
int sysctl_udp_l3mdev_accept;
 #endif
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 011de9a..5b72d97 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -520,22 +520,6 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
.mode   = 0644,
.proc_handler   = proc_doulongvec_minmax,
},
-   {
-   .procname   = "udp_rmem_min",
-   .data   = &sysctl_udp_rmem_min,
-   .maxlen = sizeof(sysctl_udp_rmem_min),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = &one
-   },
-   {
-   .procname   = "udp_wmem_min",
-   .data   = &sysctl_udp_wmem_min,
-   .maxlen = sizeof(sysctl_udp_wmem_min),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = &one
-   },
{ }
 };
 
@@ -1167,6 +1151,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
.proc_handler   = proc_dointvec_minmax,
.extra1 = ,
},
+   {
+   .procname   = "udp_rmem_min",
+   .data   = &init_net.ipv4.sysctl_udp_rmem_min,
+   .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = &one
+   },
+   {
+   .procname   = "udp_wmem_min",
+   .data   = &init_net.ipv4.sysctl_udp_wmem_min,
+   .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = &one
+   },
{ }
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3013404..908fc02 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,12 +122,6 @@
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
-int sysctl_udp_rmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_rmem_min);
-
-int sysctl_udp_wmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_wmem_min);
-
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
@@ -2533,35 +2527,35 @@ int udp_abort(struct sock *sk, int err)
 EXPORT_SYMBOL_GPL(udp_abort);
 
 struct proto udp_prot = {
-   .name  = "UDP",
-   .owner = THIS_MODULE,
-   .close = udp_lib_close,
-   .connect   = ip4_datagram_connect,
-   .disconnect= udp_disconnect,
-   .ioctl = udp_ioctl,
-   .init  = udp_init_sock,
-   .destroy   = udp_destroy_sock,
-   .setsockopt= udp_setsockopt,
-   .getsockopt= udp_getsockopt,
-   .sendmsg   = udp_sendmsg,
-   .recvmsg   = udp_recvmsg,
-   .sendpage  = udp_sendpage,
-   .release_cb= ip4_datagram_release_cb,
-   .hash  = udp_lib_hash,
-   .unhash= udp_lib_unhash,
-   .rehash= udp_v4_rehash,
-   .get_port  = udp_v4_get_port,
-   .memory_allocated  = &udp_memory_allocated,
-   .sysctl_mem= sysctl_udp_mem,
-   .sysctl_wmem   = &sysctl_udp_wmem_min,
-   .sysctl_rmem   = &sysctl_udp_rmem_min,
-   .obj_size  = sizeof(struct udp_sock),
-   .h.udp_table   = &udp_table,
+   .name   = "UDP",
+   .owner  = THIS_MODULE,
+   .close  = udp_lib_close,
+   .connect= ip4_datagram_connect,
+   .disconnect = udp_disconnect,
+   .ioctl  = udp_ioctl,

Re: [PATCH 1/2] udp: Move the udp sysctl to namespace.

2018-03-13 Thread Tonghao Zhang
On Tue, Mar 13, 2018 at 7:36 PM, Paolo Abeni <pab...@redhat.com> wrote:
> Hi,
>
> On Tue, 2018-03-13 at 02:57 -0700, Tonghao Zhang wrote:
>> This patch moves the udp_rmem_min, udp_wmem_min
>> to namespace and init the udp_l3mdev_accept explicitly.
>
> Can you please be a little more descriptive on why this is
> needed/helpful?
Thanks for your reply. On our machines there are many Docker
containers, and different containers may run different services
with different rx queue requirements.

All the containers on a machine share .sysctl_mem, so we can move
udp_wmem_min/udp_rmem_min, which affect the rx queue, into the
namespace; then each container can set them differently.
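
A minimal sketch of what the move buys us (illustrative; the helper
name udp_rmem_min_of is made up here): code paths can read the
namespace-local value instead of a single global.

	static inline int udp_rmem_min_of(const struct sock *sk)
	{
		return sock_net(sk)->ipv4.sysctl_udp_rmem_min;
	}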

>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>> ---
>>  include/net/netns/ipv4.h   |  3 ++
>>  net/ipv4/sysctl_net_ipv4.c | 32 -
>>  net/ipv4/udp.c | 86 +++---
>>  net/ipv6/udp.c | 52 ++--
>>  4 files changed, 96 insertions(+), 77 deletions(-)
>>
>> diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
>> index 3a970e4..382bfd7 100644
>> --- a/include/net/netns/ipv4.h
>> +++ b/include/net/netns/ipv4.h
>> @@ -168,6 +168,9 @@ struct netns_ipv4 {
>>   atomic_t tfo_active_disable_times;
>>   unsigned long tfo_active_disable_stamp;
>>
>> + int sysctl_udp_wmem_min;
>> + int sysctl_udp_rmem_min;
>> +
>>  #ifdef CONFIG_NET_L3_MASTER_DEV
>>   int sysctl_udp_l3mdev_accept;
>>  #endif
>> diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
>> index 011de9a..5b72d97 100644
>> --- a/net/ipv4/sysctl_net_ipv4.c
>> +++ b/net/ipv4/sysctl_net_ipv4.c
>> @@ -520,22 +520,6 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
>>   .mode   = 0644,
>>   .proc_handler   = proc_doulongvec_minmax,
>>   },
>> - {
>> - .procname   = "udp_rmem_min",
>> - .data   = &sysctl_udp_rmem_min,
>> - .maxlen = sizeof(sysctl_udp_rmem_min),
>> - .mode   = 0644,
>> - .proc_handler   = proc_dointvec_minmax,
>> - .extra1 = &one
>> - },
>> - {
>> - .procname   = "udp_wmem_min",
>> - .data   = &sysctl_udp_wmem_min,
>> - .maxlen = sizeof(sysctl_udp_wmem_min),
>> - .mode   = 0644,
>> - .proc_handler   = proc_dointvec_minmax,
>> - .extra1 = &one
>> - },
>>   { }
>>  };
>>
>> @@ -1167,6 +1151,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
>>   .proc_handler   = proc_dointvec_minmax,
>>   .extra1 = ,
>>   },
>> + {
>> + .procname   = "udp_rmem_min",
>> + .data   = &init_net.ipv4.sysctl_udp_rmem_min,
>> + .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
>> + .mode   = 0644,
>> + .proc_handler   = proc_dointvec_minmax,
>> + .extra1 = &one
>> + },
>> + {
>> + .procname   = "udp_wmem_min",
>> + .data   = &init_net.ipv4.sysctl_udp_wmem_min,
>> + .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min),
>> + .mode   = 0644,
>> + .proc_handler   = proc_dointvec_minmax,
>> + .extra1 = &one
>> + },
>>   { }
>>  };
>>
>> diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
>> index 3013404..7ae77f2 100644
>> --- a/net/ipv4/udp.c
>> +++ b/net/ipv4/udp.c
>> @@ -122,12 +122,6 @@
>>  long sysctl_udp_mem[3] __read_mostly;
>>  EXPORT_SYMBOL(sysctl_udp_mem);
>>
>> -int sysctl_udp_rmem_min __read_mostly;
>> -EXPORT_SYMBOL(sysctl_udp_rmem_min);
>> -
>> -int sysctl_udp_wmem_min __read_mostly;
>> -EXPORT_SYMBOL(sysctl_udp_wmem_min);
>> -
>>  atomic_long_t udp_memory_allocated;
>>  EXPORT_SYMBOL(udp_memory_allocated);
>>
>> @@ -2533,35 +2527,35 @@ int udp_abort(struct sock *sk, int err)
>>  EXPORT_SYMBOL_GPL(udp_abort);
>>
>>  struct proto udp_prot = {
>> - .name  = "UDP",
>> - .owner = THIS_MODULE,
>> - .close = udp_lib_close,
>> - .connect   = ip4_datagram_connect,
>> - .

[PATCH 1/2] udp: Move the udp sysctl to namespace.

2018-03-13 Thread Tonghao Zhang
This patch moves udp_rmem_min and udp_wmem_min into the
namespace and initializes udp_l3mdev_accept explicitly.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/net/netns/ipv4.h   |  3 ++
 net/ipv4/sysctl_net_ipv4.c | 32 -
 net/ipv4/udp.c | 86 +++---
 net/ipv6/udp.c | 52 ++--
 4 files changed, 96 insertions(+), 77 deletions(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 3a970e4..382bfd7 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -168,6 +168,9 @@ struct netns_ipv4 {
atomic_t tfo_active_disable_times;
unsigned long tfo_active_disable_stamp;
 
+   int sysctl_udp_wmem_min;
+   int sysctl_udp_rmem_min;
+
 #ifdef CONFIG_NET_L3_MASTER_DEV
int sysctl_udp_l3mdev_accept;
 #endif
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 011de9a..5b72d97 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -520,22 +520,6 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
.mode   = 0644,
.proc_handler   = proc_doulongvec_minmax,
},
-   {
-   .procname   = "udp_rmem_min",
-   .data   = &sysctl_udp_rmem_min,
-   .maxlen = sizeof(sysctl_udp_rmem_min),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = &one
-   },
-   {
-   .procname   = "udp_wmem_min",
-   .data   = &sysctl_udp_wmem_min,
-   .maxlen = sizeof(sysctl_udp_wmem_min),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec_minmax,
-   .extra1 = &one
-   },
{ }
 };
 
@@ -1167,6 +1151,22 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write,
.proc_handler   = proc_dointvec_minmax,
.extra1 = ,
},
+   {
+   .procname   = "udp_rmem_min",
+   .data   = &init_net.ipv4.sysctl_udp_rmem_min,
+   .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = &one
+   },
+   {
+   .procname   = "udp_wmem_min",
+   .data   = &init_net.ipv4.sysctl_udp_wmem_min,
+   .maxlen = sizeof(init_net.ipv4.sysctl_udp_wmem_min),
+   .mode   = 0644,
+   .proc_handler   = proc_dointvec_minmax,
+   .extra1 = &one
+   },
{ }
 };
 
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3013404..7ae77f2 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -122,12 +122,6 @@
 long sysctl_udp_mem[3] __read_mostly;
 EXPORT_SYMBOL(sysctl_udp_mem);
 
-int sysctl_udp_rmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_rmem_min);
-
-int sysctl_udp_wmem_min __read_mostly;
-EXPORT_SYMBOL(sysctl_udp_wmem_min);
-
 atomic_long_t udp_memory_allocated;
 EXPORT_SYMBOL(udp_memory_allocated);
 
@@ -2533,35 +2527,35 @@ int udp_abort(struct sock *sk, int err)
 EXPORT_SYMBOL_GPL(udp_abort);
 
 struct proto udp_prot = {
-   .name  = "UDP",
-   .owner = THIS_MODULE,
-   .close = udp_lib_close,
-   .connect   = ip4_datagram_connect,
-   .disconnect= udp_disconnect,
-   .ioctl = udp_ioctl,
-   .init  = udp_init_sock,
-   .destroy   = udp_destroy_sock,
-   .setsockopt= udp_setsockopt,
-   .getsockopt= udp_getsockopt,
-   .sendmsg   = udp_sendmsg,
-   .recvmsg   = udp_recvmsg,
-   .sendpage  = udp_sendpage,
-   .release_cb= ip4_datagram_release_cb,
-   .hash  = udp_lib_hash,
-   .unhash= udp_lib_unhash,
-   .rehash= udp_v4_rehash,
-   .get_port  = udp_v4_get_port,
-   .memory_allocated  = &udp_memory_allocated,
-   .sysctl_mem= sysctl_udp_mem,
-   .sysctl_wmem   = &sysctl_udp_wmem_min,
-   .sysctl_rmem   = &sysctl_udp_rmem_min,
-   .obj_size  = sizeof(struct udp_sock),
-   .h.udp_table   = &udp_table,
+   .name   = "UDP",
+   .owner  = THIS_MODULE,
+   .close  = udp_lib_close,
+   .connect= ip4_datagram_connect,
+   .disconnect = udp_disconnect,
+   .ioctl  = udp_ioctl,
+   .init   = udp_init_sock,
+   .destroy= udp_destroy_sock,
+   .setsockopt = udp_setsockopt,
+   .getsockopt  

[PATCH 2/2] doc: Change the udp/sctp rmem/wmem default value.

2018-03-13 Thread Tonghao Zhang
The SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 Documentation/networking/ip-sysctl.txt | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 783675a..1d11207 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -755,13 +755,13 @@ udp_rmem_min - INTEGER
Minimal size of receive buffer used by UDP sockets in moderation.
Each UDP socket is able to use the size for receiving data, even if
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-   Default: 1 page
+   Default: 4K
 
 udp_wmem_min - INTEGER
Minimal size of send buffer used by UDP sockets in moderation.
Each UDP socket is able to use the size for sending data, even if
total pages of UDP sockets exceed udp_mem pressure. The unit is byte.
-   Default: 1 page
+   Default: 4K
 
 CIPSOv4 Variables:
 
@@ -2101,7 +2101,7 @@ sctp_rmem - vector of 3 INTEGERs: min, default, max
It is guaranteed to each SCTP socket (but not association) even
under moderate memory pressure.
 
-   Default: 1 page
+   Default: 4K
 
 sctp_wmem  - vector of 3 INTEGERs: min, default, max
Currently this tunable has no effect.
-- 
1.8.3.1



Re: [PATCH] tcp: Support tcp socket allocated counter in namespace.

2018-02-28 Thread Tonghao Zhang
On Wed, Feb 28, 2018 at 10:41 PM, David Miller <da...@davemloft.net> wrote:
> From: Tonghao Zhang <xiangxia.m@gmail.com>
> Date: Wed, 28 Feb 2018 21:01:52 +0800
>
>>> The amount of new conditional tests in these fast paths are not
>>> justified for this new counter which is of debatable usefulness.
>> Sorry for the late reply. Did you mean this counter will affect
>> performance? I tested the patch with netperf.
>
> A single flow TCP session with no loss doesn't tell us much.
>
> I just know from how these things go that under load, and with
> all kinds of different flows with different characterstics,
> every conditional test in the fast paths matter.
>
> If you want to have a chance at your change getting accepted,
> running all kinds of benchmarks won't do it.
>
> Instead, please find a way to make your feature work without all of
> the newly added tests (whilst not adding another cost at the same
> time).
Thanks for your explanation.

> Thank you.


Re: [PATCH] tcp: Support tcp socket allocated counter in namespace.

2018-02-28 Thread Tonghao Zhang
On Wed, Feb 14, 2018 at 1:19 AM, David Miller <da...@davemloft.net> wrote:
> From: Tonghao Zhang <xiangxia.m@gmail.com>
> Date: Mon, 12 Feb 2018 18:44:00 -0800
>
>> Sometimes, we want to know how many tcp sockets are in use
>> different _net_ namespaces. It's a key resource metric. With
>> this patch, we can get it via /proc/net/sockstat.
>>
>> The 'alloc' show in /proc/net/sockstat is the total tcp
>> sockets in the kernel. This patch moves it to namespace,
>> via adding a counter because the previous counter is used
>> in proto(e.g tcp, udp and sctp) memory management.
>>
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>  ...
>> @@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
>>   sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
>>
>>   sk_sockets_allocated_inc(sk);
>> + if (likely(sk->sk_net_refcnt))
>> + tcp_sock_allocated_add(sock_net(sk), 1);
>>  }
>>  EXPORT_SYMBOL(tcp_init_sock);
>>
>  ...
>> @@ -1928,6 +1928,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
>>   tcp_saved_syn_free(tp);
>>
>>   sk_sockets_allocated_dec(sk);
>> + if (likely(sk->sk_net_refcnt))
>> + tcp_sock_allocated_add(sock_net(sk), -1);
>>  }
>>  EXPORT_SYMBOL(tcp_v4_destroy_sock);
>>
>  ...
>> @@ -559,6 +559,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
>>   newtp->rack.reo_wnd_persist = 0;
>>   newtp->rack.dsack_seen = 0;
>>
>> + if (likely(newsk->sk_net_refcnt))
>> + tcp_sock_allocated_add(sock_net(newsk), 1);
>> +
>>   __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
>
> The amount of new conditional tests in these fast paths are not
> justified for this new counter which is of debatable usefulness.
Sorry for the late reply. Did you mean this counter will affect performance?
I tested the patch with netperf.

Before patch:
# netperf -H 192.168.3.5 -t TCP_CRR -l 60 -- -O "THROUGHPUT,
THROUGHPUT_UNITS, MIN_LATENCY, MAX_LATENCY, MEAN_LATENCY"
MIGRATED TCP Connect/Request/Response TEST from 0.0.0.0 (0.0.0.0) port
0 AF_INET to 192.168.3.5 () port 0 AF_INET
Throughput  Throughput  Minimum       Maximum       Mean
            Units       Latency       Latency       Latency
                        Microseconds  Microseconds  Microseconds
7719.86     Trans/s     70            660           129.32


# netperf -H 192.168.3.5 -t TCP_CRR -l 60 -- -O "THROUGHPUT,
THROUGHPUT_UNITS, MIN_LATENCY, MAX_LATENCY, MEAN_LATENCY"
MIGRATED TCP Connect/Request/Response TEST from 0.0.0.0 (0.0.0.0) port
0 AF_INET to 192.168.3.5 () port 0 AF_INET
Throughput  Throughput  Minimum       Maximum       Mean
            Units       Latency       Latency       Latency
                        Microseconds  Microseconds  Microseconds
7574.00     Trans/s     86            13016         131.82


# netperf -H 192.168.3.5 -t TCP_CRR -l 60 -- -O "THROUGHPUT,
THROUGHPUT_UNITS, MIN_LATENCY, MAX_LATENCY, MEAN_LATENCY"
MIGRATED TCP Connect/Request/Response TEST from 0.0.0.0 (0.0.0.0) port
0 AF_INET to 192.168.3.5 () port 0 AF_INET
Throughput  Throughput  Minimum       Maximum       Mean
            Units       Latency       Latency       Latency
                        Microseconds  Microseconds  Microseconds
7556.66     Trans/s     87            15152         132.06


After patch:

# netperf -H 192.168.3.5 -t TCP_CRR -l 60 -- -O "THROUGHPUT,
THROUGHPUT_UNITS, MIN_LATENCY, MAX_LATENCY, MEAN_LATENCY"
MIGRATED TCP Connect/Request/Response TEST from 0.0.0.0 (0.0.0.0) port
0 AF_INET to 192.168.3.5 () port 0 AF_INET
Throughput  Throughput  Minimum       Maximum       Mean
            Units       Latency       Latency       Latency
                        Microseconds  Microseconds  Microseconds
7277.86     Trans/s     86            14884         137.19



# netperf -H 192.168.3.5 -t TCP_CRR -l 60 -- -O "THROUGHPUT,
THROUGHPUT_UNITS, MIN_LATENCY, MAX_LATENCY, MEAN_LATENCY"
MIGRATED TCP Connect/Request/Response TEST from 0.0.0.0 (0.0.0.0) port
0 AF_INET to 192.168.3.5 () port 0 AF_INET
Throughput  Throughput  Minimum       Maximum       Mean
            Units       Latency       Latency       Latency
                        Microseconds  Microseconds  Microseconds
7534.73     Trans/s     92            7531          132.50

# netperf -H 192.168.3.5 -t TCP_CRR -l 60 -- -O "THROUGHPUT,
THROUGHPUT_UNITS, MIN_LATENCY, MAX_LATENCY, MEAN_LATENCY"
MIGRATED TCP Connect/Request/Response TEST from 0.0.0.0 (0.0.0.0) port
0 AF_INET to 192.168.3.5 () port 0 AF_INET
Throughput  Throughput  Minimum       Maximum       Mean
            Units       Latency       Latency       Latency
                        Microseconds  Microseconds  Microseconds
7622.17     Trans/s     89            2370          130.98


> I'm not applying this, sorry.


[PATCH] tcp: Support tcp socket allocated counter in namespace.

2018-02-12 Thread Tonghao Zhang
Sometimes we want to know how many tcp sockets are in use in
different net namespaces. It's a key resource metric. With
this patch, we can get it via /proc/net/sockstat.

The 'alloc' shown in /proc/net/sockstat is the total number of tcp
sockets in the kernel. This patch moves it into the namespace by
adding a new counter, because the previous counter is used for
proto (e.g. tcp, udp and sctp) memory management.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/net/netns/ipv4.h |  3 +++
 include/net/tcp.h|  2 ++
 net/ipv4/proc.c  |  2 +-
 net/ipv4/tcp.c   |  2 ++
 net/ipv4/tcp_ipv4.c  | 34 ++
 net/ipv4/tcp_minisocks.c |  3 +++
 6 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 44668c2..85e91bd 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -68,6 +68,9 @@ struct netns_ipv4 {
 
struct inet_peer_base   *peers;
struct sock  * __percpu *tcp_sk;
+#ifdef CONFIG_PROC_FS
+   int __percpu *tcp_sock_allocated;
+#endif
struct netns_frags  frags;
 #ifdef CONFIG_NETFILTER
struct xt_table *iptable_filter;
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 093e967..4b24b6e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1780,6 +1780,8 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
 int tcp_gro_complete(struct sk_buff *skb);
 
 void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr);
+void tcp_sock_allocated_add(struct net *net, int val);
+int tcp_sock_allocated_get(struct net *net);
 
 static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp)
 {
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index dc5edc8..8a147f7 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -58,7 +58,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v)
int orphans, sockets;
 
orphans = percpu_counter_sum_positive(&tcp_orphan_count);
-   sockets = proto_sockets_allocated_sum_positive(&tcp_prot);
+   sockets = tcp_sock_allocated_get(net);
 
socket_seq_show(seq);
seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n",
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 874c931..77fe4a5 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -453,6 +453,8 @@ void tcp_init_sock(struct sock *sk)
sk->sk_rcvbuf = sock_net(sk)->ipv4.sysctl_tcp_rmem[1];
 
sk_sockets_allocated_inc(sk);
+   if (likely(sk->sk_net_refcnt))
+   tcp_sock_allocated_add(sock_net(sk), 1);
 }
 EXPORT_SYMBOL(tcp_init_sock);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 95738aa..a7bd0c4 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1928,6 +1928,8 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_saved_syn_free(tp);
 
sk_sockets_allocated_dec(sk);
+   if (likely(sk->sk_net_refcnt))
+   tcp_sock_allocated_add(sock_net(sk), -1);
 }
 EXPORT_SYMBOL(tcp_v4_destroy_sock);
 
@@ -2446,6 +2448,28 @@ struct proto tcp_prot = {
 };
 EXPORT_SYMBOL(tcp_prot);
 
+void tcp_sock_allocated_add(struct net *net, int val)
+{
+#ifdef CONFIG_PROC_FS
+   this_cpu_add(*net->ipv4.tcp_sock_allocated, val);
+#endif
+}
+EXPORT_SYMBOL(tcp_sock_allocated_add);
+
+int tcp_sock_allocated_get(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+   int cpu, res = 0;
+
+   for_each_possible_cpu(cpu)
+   res += *per_cpu_ptr(net->ipv4.tcp_sock_allocated, cpu);
+   return res;
+#else
+   return 0;
+#endif
+}
+EXPORT_SYMBOL(tcp_sock_allocated_get);
+
 static void __net_exit tcp_sk_exit(struct net *net)
 {
int cpu;
@@ -2455,6 +2479,10 @@ static void __net_exit tcp_sk_exit(struct net *net)
for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
free_percpu(net->ipv4.tcp_sk);
+
+#ifdef CONFIG_PROC_FS
+   free_percpu(net->ipv4.tcp_sock_allocated);
+#endif
 }
 
 static int __net_init tcp_sk_init(struct net *net)
@@ -2465,6 +2493,12 @@ static int __net_init tcp_sk_init(struct net *net)
if (!net->ipv4.tcp_sk)
return -ENOMEM;
 
+#ifdef CONFIG_PROC_FS
+   net->ipv4.tcp_sock_allocated = alloc_percpu(int);
+   if (!net->ipv4.tcp_sock_allocated)
+   goto fail;
+#endif
+
for_each_possible_cpu(cpu) {
struct sock *sk;
 
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index a8384b0c..573fe43 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -559,6 +559,9 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rack.reo_wnd_persist = 0;
newtp->rack.dsack_seen = 0;
 
+   if (likely(newsk->sk_net_refcnt))
+   tcp_sock_allocated_add(sock_net(newsk), 1);
+
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
return newsk;
-- 
1.8.3.1



[PATCH] doc: Change the min default value of tcp_wmem/tcp_rmem.

2018-02-04 Thread Tonghao Zhang
SK_MEM_QUANTUM was changed from PAGE_SIZE to 4096, and the
tcp_wmem/tcp_rmem min default values are now 4096.

Fixes: bd68a2a854ad ("net: set SK_MEM_QUANTUM to 4096")
Cc: Eric Dumazet <eduma...@google.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 Documentation/networking/ip-sysctl.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 3f2c40d..a553d4e 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -508,7 +508,7 @@ tcp_rmem - vector of 3 INTEGERs: min, default, max
min: Minimal size of receive buffer used by TCP sockets.
It is guaranteed to each TCP socket, even under moderate memory
pressure.
-   Default: 1 page
+   Default: 4K
 
default: initial size of receive buffer used by TCP sockets.
This value overrides net.core.rmem_default used by other protocols.
@@ -667,7 +667,7 @@ tcp_window_scaling - BOOLEAN
 tcp_wmem - vector of 3 INTEGERs: min, default, max
min: Amount of memory reserved for send buffers for TCP sockets.
Each TCP socket has rights to use it due to fact of its birth.
-   Default: 1 page
+   Default: 4K
 
default: initial size of send buffer used by TCP sockets.  This
value overrides net.core.wmem_default used by other protocols.
-- 
1.8.3.1



Re: Compare length of req sock queue with sk_max_ack_backlog

2018-02-01 Thread Tonghao Zhang
On Fri, Feb 2, 2018 at 12:19 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Fri, 2018-02-02 at 09:55 +0800, Tonghao Zhang wrote:
>> On Thu, Feb 1, 2018 at 10:00 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
>> > On Thu, 2018-02-01 at 20:34 +0800, Tonghao Zhang wrote:
>> > > Hi Eric
>> > > One question for you, In the patch ef547f2ac16 ("tcp: remove
>> > > max_qlen_log"),  why we compared the length of req sock queue with
>> > > sk_max_ack_backlog. If we remove the max_qlen_log, we should check the
>> > > length of req sock queue with tcp_max_syn_backlog, right ?
>> > >
>> > > With this patch, the option "/proc/sys/net/ipv4/tcp_max_syn_backlog"
>> > > will be unused, right?
>> >
>> > Not right.
>> >
>> > Please "git grep -n sysctl_max_syn_backlog" to convince it is still
>> > used.
>> >
>>
>> In tcp_conn_request(), we call inet_csk_reqsk_queue_is_full()
>> to check the length of the req sock queue. But
>> inet_csk_reqsk_queue_is_full()
>> compares it with sk_max_ack_backlog.
>>
>> inet_csk_reqsk_queue_is_full
>>  inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
>>
>>
>> inet_csk_reqsk_queue_len
>>  reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);
>>
>> If we set sysctl_max_syn_backlog to 1024 via
>> /proc/sys/net/ipv4/tcp_max_syn_backlog, and the
>> backlog (sk_max_ack_backlog) to 128 via listen(), tcp_conn_request
>> will drop packets when the
>> length of the req sock queue > 128, not 1024.
>>
>> tcp_conn_request
>> if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
>>  inet_csk_reqsk_queue_is_full(sk)) && !isn) {
>> want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
>> if (!want_cookie)
>> goto drop;  // drop the packets
>> }
>
> It seems to work as intended and documented in the changelog.
>
> A socket listen backlog is determined at the time listen() system call
> is issued, using the standard somaxconn as safe guard, for all socket
> types.
>
> We want to get rid of tcp_max_syn_backlog eventually, since TCP
> listeners can use listen(fd, 1000) these days if they want,
> it is a matter of provisioning the needed memory.

In some cases, the user may use different values for tcp_max_syn_backlog
and sk_max_ack_backlog.
Then in tcp_conn_request, we should compare the length of the req sock
queue with tcp_max_syn_backlog, not sk_max_ack_backlog.
We could update tcp_max_syn_backlog for a specific socket via ioctl,
just like listen() updates the ack backlog.

inet_csk_reqsk_queue_is_full
  inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;

to:
inet_csk_reqsk_queue_is_full
  inet_csk_reqsk_queue_len(sk) >= tcp_max_syn_backlog; // for example


The check in sk_acceptq_is_full() itself is fine:

sk_acceptq_is_full
 sk->sk_ack_backlog > sk->sk_max_ack_backlog;

> Old mechanism was not allowing a listener to reduce or increase its
> backlog dynamically (listener was using a private hash table, sized at
> the time of first listen() and not resized later)
>
> Having a global sysctl to magically change behavior on all TCP
> listeners is not very practical, unless you had dedicated hosts to deal
> with one service.
I got it.


Re: Compare length of req sock queue with sk_max_ack_backlog

2018-02-01 Thread Tonghao Zhang
On Thu, Feb 1, 2018 at 10:00 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Thu, 2018-02-01 at 20:34 +0800, Tonghao Zhang wrote:
>> Hi Eric
>> One question for you, In the patch ef547f2ac16 ("tcp: remove
>> max_qlen_log"),  why we compared the length of req sock queue with
>> sk_max_ack_backlog. If we remove the max_qlen_log, we should check the
>> length of req sock queue with tcp_max_syn_backlog, right ?
>>
>> With this patch, the option "/proc/sys/net/ipv4/tcp_max_syn_backlog"
>> will be unused, right?
>
> Not right.
>
> Please "git grep -n sysctl_max_syn_backlog" to convince it is still
> used.
>
In tcp_conn_request(), we call inet_csk_reqsk_queue_is_full()
to check the length of the req sock queue. But
inet_csk_reqsk_queue_is_full()
compares it with sk_max_ack_backlog.

inet_csk_reqsk_queue_is_full
 inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;


inet_csk_reqsk_queue_len
reqsk_queue_len(&inet_csk(sk)->icsk_accept_queue);

If we set sysctl_max_syn_backlog to 1024 via
/proc/sys/net/ipv4/tcp_max_syn_backlog, and the
backlog (sk_max_ack_backlog) to 128 via listen(), tcp_conn_request
will drop packets when the
length of the req sock queue > 128, not 1024.

tcp_conn_request
if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
 inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
if (!want_cookie)
goto drop;  // drop the packets
}


>> I think we should:
>> 1. Add a variable to the sock struct, such as sk_max_syn_backlog. Then the req
>> sock queue and the accept sock queue can control their max values. When we
>> create the sock, we can store the tcp_max_syn_backlog value in
>> sk_max_syn_backlog.
>>
>
> There is a single /proc/sys/net/ipv4/tcp_max_syn_backlog value shared
> by all sockets of a given network namespace.
>
> But you can have thousands of TCP listeners, each of them having a
> distinct listen() backlog
>
>> 2. Update the backlog: we can update it via ioctl.
>
> No need to add an ugly ioctl.
>
> Simply call listen() again with another backlog value.
>
>
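
A minimal user-space illustration of that suggestion (not from the
thread; assumes fd is an already-listening TCP socket):

	#include <sys/socket.h>

	void resize_backlog(int fd)
	{
		listen(fd, 128);	/* initial accept backlog */
		/* ... later, under load ... */
		listen(fd, 1024);	/* re-issue listen() to grow sk_max_ack_backlog */
	}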


Compare length of req sock queue with sk_max_ack_backlog

2018-02-01 Thread Tonghao Zhang
Hi Eric
One question for you: in the patch ef547f2ac16 ("tcp: remove
max_qlen_log"), why do we compare the length of the req sock queue with
sk_max_ack_backlog? If we remove max_qlen_log, we should check the
length of the req sock queue against tcp_max_syn_backlog, right?

With this patch, the option "/proc/sys/net/ipv4/tcp_max_syn_backlog"
will be unused, right?

I think we should:
1. Add a variable to the sock struct, such as sk_max_syn_backlog. Then the req
sock queue and the accept sock queue can control their max values. When we
create the sock, we can store the tcp_max_syn_backlog value in
sk_max_syn_backlog.

2. Update the backlog: we can update it via ioctl.

Hope for your reply.


[PATCH] ipv4: Get the address of interface correctly.

2018-01-28 Thread Tonghao Zhang
When using ioctl to get the address of an interface, we can't
get it anymore. For example, the command is shown below.

# ifconfig eth0

Since the patch ("03aef17bb79b3"), devinet_ioctl does not
return a suitable value, even though the address can be found in
the kernel. Fix it now.
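
A minimal user-space reproducer sketch (illustrative; the interface
name eth0 is assumed), which is essentially what ifconfig does:

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <sys/socket.h>
	#include <net/if.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	int main(void)
	{
		struct ifreq ifr = { 0 };
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);
		if (ioctl(fd, SIOCGIFADDR, &ifr) < 0)	/* fails without this fix */
			perror("SIOCGIFADDR");
		else
			printf("%s\n", inet_ntoa(
			       ((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr));
		close(fd);
		return 0;
	}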

Fixes: 03aef17bb79b3 ("devinet_ioctl(): take copyin/copyout to caller")
Cc: Al Viro <v...@zeniv.linux.org.uk>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/ipv4/devinet.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index e056c00..40f0017 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1048,18 +1048,22 @@ int devinet_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr)
 
switch (cmd) {
case SIOCGIFADDR:   /* Get interface address */
+   ret = 0;
sin->sin_addr.s_addr = ifa->ifa_local;
break;
 
case SIOCGIFBRDADDR:/* Get the broadcast address */
+   ret = 0;
sin->sin_addr.s_addr = ifa->ifa_broadcast;
break;
 
case SIOCGIFDSTADDR:/* Get the destination address */
+   ret = 0;
sin->sin_addr.s_addr = ifa->ifa_address;
break;
 
case SIOCGIFNETMASK:/* Get the netmask for the interface */
+   ret = 0;
sin->sin_addr.s_addr = ifa->ifa_mask;
break;
 
-- 
1.8.3.1



[PATCH 2/2] ixgbe: Set rss key in order.

2018-01-23 Thread Tonghao Zhang
If we use ethtool to set the rss key, for example,
we expect 0x6d to be K[0], and 0x5a to be K[1]. But
the ixgbe driver sets 0xda as K[0].

ethtool -X eth0 hkey 6d:5a:56:da:25:5b:0e:c2:41:67:25:3d
:43:a3:8f:b0:d0:ca:2b:cb:ae:7b:30:b4:77:cb:2d:a3
:80:30:f2:0c:6a:42:b7:3b:be:ac:01:fa

The key is the original Microsoft's key.

7.1.2.8.1 RSS Hash Function: (82599 datasheet)
Given an array K with k bytes, our nomenclature assumes
that the array is laid out as follows:

K[0] K[1] K[2] ... K[k-1]

K[0] is the left-most byte, and the MSB of K[0] is the
left-most bit. K[k-1] is the right-most byte, and the
LSB of K[k-1] is the right-most bit.

8.2.3.7.14 RSS Random Key Register:
The K[0] is the 7:0 bit of RSSRK (RSS Random Key Register).
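
A worked example of the packing the patch introduces (illustrative):
with the first four key bytes 6d 5a 56 da,

	u8  rss_key[4] = { 0x6d, 0x5a, 0x56, 0xda };
	u32 key = rss_key[0] | (rss_key[1] << 8) |
		  (rss_key[2] << 16) | ((u32)rss_key[3] << 24);
	/* key == 0xda565a6d: K[0] = 0x6d lands in RSSRK bits 7:0,
	 * as the datasheet requires. */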

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 +++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 95aba97..0ca2e0e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3777,10 +3777,18 @@ u32 ixgbe_rss_indir_tbl_entries(struct ixgbe_adapter *adapter)
 void ixgbe_store_key(struct ixgbe_adapter *adapter)
 {
struct ixgbe_hw *hw = &adapter->hw;
-   int i;
+   u8 *rss_key = (u8 *)adapter->rss_key;
+   u32 key, i;
 
-   for (i = 0; i < 10; i++)
-   IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), adapter->rss_key[i]);
+   /* Fill out hash function seeds */
+   for (i = 0; i < 10; i++) {
+   key  = rss_key[(i * 4)];
+   key |= rss_key[(i * 4) + 1] << 8;
+   key |= rss_key[(i * 4) + 2] << 16;
+   key |= rss_key[(i * 4) + 3] << 24;
+
+   IXGBE_WRITE_REG(hw, IXGBE_RSSRK(i), key);
+   }
 }
 
 /**
-- 
1.8.3.1



[PATCH 1/2] ixgbe: Avoid to write the RETA table when unnecessary.

2018-01-23 Thread Tonghao Zhang
If indir == 0 in ixgbe_set_rxfh(), it is unnecessary
to write the HW, because the redirection table has not changed.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
index 0aaf70b..0f1d642 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ethtool.c
@@ -3060,6 +3060,8 @@ static int ixgbe_set_rxfh(struct net_device *netdev, const u32 *indir,
 
for (i = 0; i < reta_entries; i++)
adapter->rss_indir_tbl[i] = indir[i];
+
+   ixgbe_store_reta(adapter);
}
 
/* Fill out the rss hash key */
@@ -3068,7 +3070,6 @@ static int ixgbe_set_rxfh(struct net_device *netdev, const u32 *indir,
ixgbe_store_key(adapter);
}
 
-   ixgbe_store_reta(adapter);
 
return 0;
 }
-- 
1.8.3.1



[PATCH v2] vhost: Remove the unused variable.

2018-01-09 Thread Tonghao Zhang
The patch (7235acdb1) changed the way work flushing is done:
the queued seq, done seq, and flushing fields are not used
anymore, so remove them now.
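
After the removal, the structure shrinks to (resulting
definition, taken from the diff below):

	struct vhost_work {
		struct llist_node node;
		vhost_work_fn_t   fn;
		unsigned long     flags;
	};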

Fixes: 7235acdb1 ("vhost: simplify work flushing")
Cc: Jason Wang <jasow...@redhat.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
Acked-by: Jason Wang <jasow...@redhat.com>
---
 drivers/vhost/vhost.c | 1 -
 drivers/vhost/vhost.h | 4 
 2 files changed, 5 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 33ac2b186b85..9b04cad91d65 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -181,7 +181,6 @@ void vhost_work_init(struct vhost_work *work, 
vhost_work_fn_t fn)
 {
clear_bit(VHOST_WORK_QUEUED, >flags);
work->fn = fn;
-   init_waitqueue_head(>done);
 }
 EXPORT_SYMBOL_GPL(vhost_work_init);
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 79c6e7a60a5e..749fe13e061c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -20,10 +20,6 @@ typedef void (*vhost_work_fn_t)(struct vhost_work *work);
 struct vhost_work {
struct llist_node node;
vhost_work_fn_t   fn;
-   wait_queue_head_t done;
-   int   flushing;
-   unsigned  queue_seq;
-   unsigned  done_seq;
unsigned long flags;
 };
 
-- 
2.13.6



[PATCH] vhost: Remove the unused variable.

2018-01-07 Thread Tonghao Zhang
The patch (7235acdb1) changed the way work flushing is done:
the queued seq, done seq, and flushing fields are not used
anymore, so remove them now.

Fixes: 7235acdb1 ("vhost: simplify work flushing")
Cc: Jason Wang <jasow...@redhat.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/vhost/vhost.c | 1 -
 drivers/vhost/vhost.h | 4 
 2 files changed, 5 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 33ac2b186b85..9b04cad91d65 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -181,7 +181,6 @@ void vhost_work_init(struct vhost_work *work, 
vhost_work_fn_t fn)
 {
clear_bit(VHOST_WORK_QUEUED, >flags);
work->fn = fn;
-   init_waitqueue_head(>done);
 }
 EXPORT_SYMBOL_GPL(vhost_work_init);
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 79c6e7a60a5e..749fe13e061c 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -20,10 +20,6 @@ typedef void (*vhost_work_fn_t)(struct vhost_work *work);
 struct vhost_work {
struct llist_node node;
vhost_work_fn_t   fn;
-   wait_queue_head_t done;
-   int   flushing;
-   unsigned  queue_seq;
-   unsigned  done_seq;
unsigned long flags;
 };
 
-- 
2.13.6



[PATCH v2] sctp: Replace use of sockets_allocated with specified macro.

2017-12-22 Thread Tonghao Zhang
The patch (180d8cd942ce) replaced all uses of the struct sock
fields memory_pressure, memory_allocated, sockets_allocated, and
sysctl_mem with accessor macros, but the sockets_allocated
counter of the sctp sock was not converted. Convert it now to
unify the code.
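
For context, the accessor is a thin inline over the same per-cpu
counter (abridged sketch of the include/net/sock.h definition of
that era, shown here for reference; sctp_prot wires
.sockets_allocated to &sctp_sockets_allocated, so the behavior is
unchanged):

	static inline void sk_sockets_allocated_inc(struct sock *sk)
	{
		percpu_counter_inc(sk->sk_prot->sockets_allocated);
	}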

Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.")
Cc: Glauber Costa <glom...@parallels.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
fix typo.
---
 net/sctp/socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aadcd4244d9b..a5e2150ab013 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4569,7 +4569,7 @@ static int sctp_init_sock(struct sock *sk)
SCTP_DBG_OBJCNT_INC(sock);
 
local_bh_disable();
-   percpu_counter_inc(_sockets_allocated);
+   sk_sockets_allocated_inc(sk);
sock_prot_inuse_add(net, sk->sk_prot, 1);
 
/* Nothing can fail after this block, otherwise
@@ -4613,7 +4613,7 @@ static void sctp_destroy_sock(struct sock *sk)
}
sctp_endpoint_free(sp->ep);
local_bh_disable();
-   percpu_counter_dec(_sockets_allocated);
+   sk_sockets_allocated_dec(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
local_bh_enable();
 }
-- 
2.13.6



[PATCH] stcp: Replace use of sockets_allocated with specified macro.

2017-12-22 Thread Tonghao Zhang
The patch (180d8cd942ce) replaced all uses of the struct sock
fields memory_pressure, memory_allocated, sockets_allocated, and
sysctl_mem with accessor macros, but the sockets_allocated
counter of the sctp sock was not converted. Convert it now to
unify the code.

Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.")
Cc: Glauber Costa <glom...@parallels.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 net/sctp/socket.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aadcd4244d9b..a5e2150ab013 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4569,7 +4569,7 @@ static int sctp_init_sock(struct sock *sk)
SCTP_DBG_OBJCNT_INC(sock);
 
local_bh_disable();
-   percpu_counter_inc(_sockets_allocated);
+   sk_sockets_allocated_inc(sk);
sock_prot_inuse_add(net, sk->sk_prot, 1);
 
/* Nothing can fail after this block, otherwise
@@ -4613,7 +4613,7 @@ static void sctp_destroy_sock(struct sock *sk)
}
sctp_endpoint_free(sp->ep);
local_bh_disable();
-   percpu_counter_dec(_sockets_allocated);
+   sk_sockets_allocated_dec(sk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
local_bh_enable();
 }
-- 
2.13.6




[PATCH v7 2/3] sock: Move the socket inuse to namespace.

2017-12-14 Thread Tonghao Zhang
In some cases, we want to know how many sockets are in use in
different net namespaces. It's a key resource metric.

This patch adds a member to struct netns_core: a counter of
sockets in use in the net namespace. The counter is updated in
sk_alloc, sk_clone_lock and __sk_free.

Sockets created in the kernel are not counted: it is not very
useful for userspace to know how many kernel sockets exist.

The main reasons for doing this are:

1. When Linux calls do_exit for a process, exit_task_namespaces
and exit_task_work are called sequentially. exit_task_namespaces
may already have destroyed the net namespace, but sock_release,
called from exit_task_work, would still touch that namespace if
we counted sockets in sock_release.

2. socket and sock come in pairs and, more importantly, sock
holds the net namespace. Counting in sock avoids holding the net
namespace again in socket, and keeps the code easy to maintain.
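
Once applied, the per-namespace count is visible as the first
line of /proc/net/sockstat (illustrative session; the exact count
depends on what is running in the namespace):

	# unshare -n sh -c 'head -1 /proc/net/sockstat'
	sockets: used 0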

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  3 +++
 include/net/sock.h   |  1 +
 net/core/sock.c  | 47 +--
 net/socket.c | 21 ++---
 4 files changed, 51 insertions(+), 21 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 45cfb5d..a5e8a66 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,9 @@ struct netns_core {
 
int sysctl_somaxconn;
 
+#ifdef CONFIG_PROC_FS
+   int __percpu *sock_inuse;
+#endif
struct prot_inuse __percpu *prot_inuse;
 };
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 9a90472..0a32f3c 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1262,6 +1262,7 @@ static inline void sk_sockets_allocated_inc(struct sock 
*sk)
 /* Called with local bh disabled */
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
 int sock_prot_inuse_get(struct net *net, struct proto *proto);
+int sock_inuse_get(struct net *net);
 #else
 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
int inc)
diff --git a/net/core/sock.c b/net/core/sock.c
index c2dd2d3..72d14b2 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,8 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+static void sock_inuse_add(struct net *net, int val);
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t 
priority,
sk->sk_kern_sock = kern;
sock_lock_init(sk);
sk->sk_net_refcnt = kern ? 0 : 1;
-   if (likely(sk->sk_net_refcnt))
+   if (likely(sk->sk_net_refcnt)) {
get_net(net);
+   sock_inuse_add(net, 1);
+   }
+
sock_net_set(sk, net);
refcount_set(>sk_wmem_alloc, 1);
 
@@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+   if (likely(sk->sk_net_refcnt))
+   sock_inuse_add(sock_net(sk), -1);
+
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
sock_diag_broadcast_destroy(sk);
else
@@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(>sk_cookie, 0);
+   if (likely(newsk->sk_net_refcnt))
+   sock_inuse_add(sock_net(newsk), 1);
 
/*
 * Before updating sk_refcnt, we must commit prior changes to 
memory
@@ -3061,15 +3071,44 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+   this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+   int cpu, res = 0;
+
+   for_each_possible_cpu(cpu)
+   res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+   return res;
+}
+
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
 static int __net_init sock_inuse_init_net(struct net *net)
 {
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-   return net->core.prot_inuse ? 0 : -ENOMEM;
+   if (net->core.prot_inuse == NULL)
+   return -ENOMEM;
+
+   net->core.sock_inuse = alloc_percpu(int);
+   if (net->core.sock_inuse == NULL)
+   goto out;
+
+   return 0;
+
+out:
+   free_percpu(net->core.prot_inuse);
+   return -ENOMEM;
+}

[PATCH v7 1/3] sock: Change the netns_core member name.

2017-12-14 Thread Tonghao Zhang
Changing the member name makes the code more readable. The
renamed field will be used by the next patch.

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  2 +-
 net/core/sock.c  | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 0ad4d0c..45cfb5d 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,7 +11,7 @@ struct netns_core {
 
int sysctl_somaxconn;
 
-   struct prot_inuse __percpu *inuse;
+   struct prot_inuse __percpu *prot_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index c0b5b2f..c2dd2d3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3045,7 +3045,7 @@ struct prot_inuse {
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-   __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+   __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3055,7 +3055,7 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
int res = 0;
 
for_each_possible_cpu(cpu)
-   res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+   res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
return res >= 0 ? res : 0;
 }
@@ -3063,13 +3063,13 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-   net->core.inuse = alloc_percpu(struct prot_inuse);
-   return net->core.inuse ? 0 : -ENOMEM;
+   net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+   return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-   free_percpu(net->core.inuse);
+   free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
1.8.3.1



[PATCH v7 3/3] sock: Hide unused variable when !CONFIG_PROC_FS.

2017-12-14 Thread Tonghao Zhang
When CONFIG_PROC_FS is disabled, the prot_inuse counter is not
used. Add an #ifdef to hide the variable definition in that case.
This is not a bugfix, but it saves bytes when there are many
network namespaces.
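
Resulting layout (from the diff below):

	struct netns_core {
		int sysctl_somaxconn;

	#ifdef CONFIG_PROC_FS
		int __percpu *sock_inuse;
		struct prot_inuse __percpu *prot_inuse;
	#endif
	};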

Cc: Pavel Emelyanov <xe...@openvz.org>
Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index a5e8a66..36c2d99 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -13,8 +13,8 @@ struct netns_core {
 
 #ifdef CONFIG_PROC_FS
int __percpu *sock_inuse;
-#endif
struct prot_inuse __percpu *prot_inuse;
+#endif
 };
 
 #endif
-- 
1.8.3.1



Re: [PATCH v6 2/3] sock: Move the socket inuse to namespace.

2017-12-13 Thread Tonghao Zhang
On Wed, Dec 13, 2017 at 2:04 AM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> On Sun, Dec 10, 2017 at 7:12 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
>> index b797832..6c191fb 100644
>> --- a/net/core/net_namespace.c
>> +++ b/net/core/net_namespace.c
>> @@ -363,6 +363,13 @@ static struct net *net_alloc(void)
>> if (!net)
>> goto out_free;
>>
>> +#ifdef CONFIG_PROC_FS
>> +   net->core.sock_inuse = alloc_percpu(int);
>> +   if (!net->core.sock_inuse) {
>> +   kmem_cache_free(net_cachep, net);
>> +   goto out_free;
>> +   }
>> +#endif
>> rcu_assign_pointer(net->gen, ng);
>>  out:
>> return net;
>> @@ -374,6 +381,9 @@ static struct net *net_alloc(void)
>>
>>  static void net_free(struct net *net)
>>  {
>> +#ifdef CONFIG_PROC_FS
>> +   free_percpu(net->core.sock_inuse);
>> +#endif
>> kfree(rcu_access_pointer(net->gen));
>> kmem_cache_free(net_cachep, net);
>>  }
>
> Putting socket code in net_namespace.c doesn't look good.
Hi Cong,
Thanks for your review. If we don't do the allocation in
net_alloc, it's better to count the socks created for userspace,
while the socks created in the kernel are omitted.


[PATCH net-next] tcp: Remove tcp_low_latency sysctl.

2017-12-12 Thread Tonghao Zhang
The prequeue has been removed from the kernel. Leaving
tcp_low_latency in place may confuse users, and if users can't
find the option, the documentation tells them why. So remove the
tcp_low_latency sysctl as well.
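
The user-visible effect (illustrative session; the exact error
text depends on the sysctl tool version):

	Before: # sysctl net.ipv4.tcp_low_latency
	        net.ipv4.tcp_low_latency = 0
	After:  # sysctl net.ipv4.tcp_low_latency
	        sysctl: cannot stat /proc/sys/net/ipv4/tcp_low_latency: No such file or directory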

Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 Documentation/networking/ip-sysctl.txt |  2 +-
 net/ipv4/sysctl_net_ipv4.c | 10 --
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.txt 
b/Documentation/networking/ip-sysctl.txt
index 77f4de59dc9c..ac35f5b59739 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -356,7 +356,7 @@ tcp_l3mdev_accept - BOOLEAN
compiled with CONFIG_NET_L3_MASTER_DEV.
 
 tcp_low_latency - BOOLEAN
-   This is a legacy option, it has no effect anymore.
+   This is an obsolete option, it has been removed from kernel.
 
 tcp_max_orphans - INTEGER
Maximal number of TCP sockets not attached to any user file handle,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4602af6d5358..dfebbfcccb87 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -45,9 +45,6 @@ static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
 static int ip_ping_group_range_min[] = { 0, 0 };
 static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
-/* obsolete */
-static int sysctl_tcp_low_latency __read_mostly;
-
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
 {
@@ -438,13 +435,6 @@ static struct ctl_table ipv4_table[] = {
.extra1 = ,
},
{
-   .procname   = "tcp_low_latency",
-   .data   = _tcp_low_latency,
-   .maxlen = sizeof(int),
-   .mode   = 0644,
-   .proc_handler   = proc_dointvec
-   },
-   {
.procname   = "tcp_congestion_control",
.mode   = 0644,
.maxlen = TCP_CA_NAME_MAX,
-- 
2.13.6



Re: [PATCH v5 2/2] sock: Move the socket inuse to namespace.

2017-12-08 Thread Tonghao Zhang
On Sat, Dec 9, 2017 at 6:09 AM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> On Thu, Dec 7, 2017 at 9:28 PM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>>
>> Release the netlink sock created in kernel(not hold the _net_ namespace):
>>
>
> You can avoid counting kernel sock by testing 'kern' in sk_alloc()
> and testing 'sk->sk_net_refcnt' in __sk_free().
Hi Cong, if we do it in this way, we will not count the socks
created in the kernel, right?


Re: [PATCH v5 2/2] sock: Move the socket inuse to namespace.

2017-12-08 Thread Tonghao Zhang
On Fri, Dec 8, 2017 at 9:24 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Fri, 2017-12-08 at 19:29 +0800, Tonghao Zhang wrote:
>> hi all. we can add synchronize_rcu and rcu_barrier in
>> sock_inuse_exit_net to
>> ensure there are no outstanding rcu callbacks using this network
>> namespace.
>> we will not have to test if net->core.sock_inuse is NULL or not from
>> sock_inuse_add(). :)
>>
>>  static void __net_exit sock_inuse_exit_net(struct net *net)
>>  {
>> free_percpu(net->core.prot_inuse);
>> +
>> +   synchronize_rcu();
>> +   rcu_barrier();
>> +
>> +   free_percpu(net->core.sock_inuse);
>>  }
>
>
> Oh well. Do you have any idea of the major problem this would add ?
>
> Try the following, before and after your patches :
>
> for i in `seq 1 40`
> do
>  (for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) &
> done
> wait
>
> ( Check commit 8ca712c373a462cfa1b62272870b6c2c74aa83f9 )
>
Yes, I did the test. The patches degrade performance.
Before patch:
# time ./add_del_unshare.sh
net_namespace     97    125   6016    5    8 : tunables    0    0    0 : slabdata     25     25      0

real 8m19.665s
user 0m4.268s
sys 0m6.477s

After :
# time ./add_del_unshare.sh
net_namespace    102    130   6016    5    8 : tunables    0    0    0 : slabdata     26     26      0

real 8m52.563s
user 0m4.040s
sys 0m7.558s

>
> This is a complex problem, we wont accept patches that kill network
> namespaces dismantling performance by adding brute force
> synchronize_rcu() or rcu_barrier() calls.
>
> Why not freeing net->core.sock_inuse right before feeing net itself in
> net_free() ?
I tried this approach: alloc core.sock_inuse in net_alloc(), free
it in net_free(). It does not degrade performance, and we no
longer have to check core.sock_inuse in sock_inuse_add().

After :
# time ./add_del_unshare.sh
net_namespace    109    135   6016    5    8 : tunables    0    0    0 : slabdata     27     27      0

real 8m19.265s
user 0m4.090s
sys 0m8.185s

> You do not have to hijack sock_inuse_exit_net() just because it has a
> misleading name.
>
>


Re: [PATCH v5 2/2] sock: Move the socket inuse to namespace.

2017-12-08 Thread Tonghao Zhang
Hi all, we can add synchronize_rcu and rcu_barrier in
sock_inuse_exit_net to ensure there are no outstanding RCU
callbacks using this network namespace. Then we will not have to
test whether net->core.sock_inuse is NULL in sock_inuse_add(). :)

 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
free_percpu(net->core.prot_inuse);
+
+   synchronize_rcu();
+   rcu_barrier();
+
+   free_percpu(net->core.sock_inuse);
 }


On Fri, Dec 8, 2017 at 5:52 PM, Tonghao Zhang <xiangxia.m@gmail.com> wrote:
> On Fri, Dec 8, 2017 at 1:40 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
>> On Fri, 2017-12-08 at 13:28 +0800, Tonghao Zhang wrote:
>>> On Fri, Dec 8, 2017 at 1:20 AM, Eric Dumazet <eric.duma...@gmail.com>
>>> wrote:
>>> > On Thu, 2017-12-07 at 08:45 -0800, Tonghao Zhang wrote:
>>> > > In some case, we want to know how many sockets are in use in
>>> > > different _net_ namespaces. It's a key resource metric.
>>> > >
>>> >
>>> > ...
>>> >
>>> > > +static void sock_inuse_add(struct net *net, int val)
>>> > > +{
>>> > > + if (net->core.prot_inuse)
>>> > > + this_cpu_add(*net->core.sock_inuse, val);
>>> > > +}
>>> >
>>> > This is very confusing.
>>> >
>>> > Why testing net->core.prot_inuse for NULL is needed at all ?
>>> >
>>> > Why not testing net->core.sock_inuse instead ?
>>> >
>>>
>>> Hi Eric and Cong, oh it's a typo. it's net->core.sock_inuse there.
>>> Why
>>> we should check the net->core.sock_inuse
>>> Now show you the code:
>>>
>>> cleanup_net will call all of the network namespace exit methods,
>>> rcu_barrier, and then remove the _net_ namespace.
>>>
>>> cleanup_net:
>>> list_for_each_entry_reverse(ops, _list, list)
>>>  ops_exit_list(ops, _exit_list);
>>>
>>> rcu_barrier(); /* for netlink sock, the ‘deferred_put_nlk_sk’
>>> will
>>> be called. But sock_inuse has been released. */
>>
>>
>> Thats would be a bug.
>>
>> Please find another way, but we want ultimately to check that before
>> net->core.sock_inuse is freed, folding the inuse count on all cpus is
>> 0, to make sure we do not have a bug somewhere.
>
> Yes, I am aware of this issue even we will destroy the network namespace.
> By the way, we can counter the socket-inuse in sock_alloc or sock_release.
> In this way, we have to hold the network namespace again(via
> get_net()) while sock
> may hold it.
>
> what do you think of this idea?
>
>> We should not have to test if net->core.sock_inuse is NULL or not from
>> sock_inuse_add(). Pointer must be there all the time.
>>
>> The freeing should only happen once we are sure sock_inuse_add() can
>> not be called anymore.
>>
>>>
>>>
>>> /* Finally it is safe to free my network namespace structure */
>>> list_for_each_entry_safe(net, tmp, _exit_list, exit_list) {}
>>>
>>>
>>>
>>> Release the netlink sock created in kernel(not hold the _net_
>>> namespace):
>>>
>>> netlink_release
>>>call_rcu(>rcu, deferred_put_nlk_sk);
>>>
>>> deferred_put_nlk_sk
>>>sk_free(sk);
>>>
>>>
>>> I may add a comment for sock_inuse_add in v6.
>>
>>


Re: [PATCH v5 2/2] sock: Move the socket inuse to namespace.

2017-12-08 Thread Tonghao Zhang
On Fri, Dec 8, 2017 at 1:40 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Fri, 2017-12-08 at 13:28 +0800, Tonghao Zhang wrote:
>> On Fri, Dec 8, 2017 at 1:20 AM, Eric Dumazet <eric.duma...@gmail.com>
>> wrote:
>> > On Thu, 2017-12-07 at 08:45 -0800, Tonghao Zhang wrote:
>> > > In some case, we want to know how many sockets are in use in
>> > > different _net_ namespaces. It's a key resource metric.
>> > >
>> >
>> > ...
>> >
>> > > +static void sock_inuse_add(struct net *net, int val)
>> > > +{
>> > > + if (net->core.prot_inuse)
>> > > + this_cpu_add(*net->core.sock_inuse, val);
>> > > +}
>> >
>> > This is very confusing.
>> >
>> > Why testing net->core.prot_inuse for NULL is needed at all ?
>> >
>> > Why not testing net->core.sock_inuse instead ?
>> >
>>
>> Hi Eric and Cong, oh it's a typo. it's net->core.sock_inuse there.
>> Why
>> we should check the net->core.sock_inuse
>> Now show you the code:
>>
>> cleanup_net will call all of the network namespace exit methods,
>> rcu_barrier, and then remove the _net_ namespace.
>>
>> cleanup_net:
>> list_for_each_entry_reverse(ops, _list, list)
>>  ops_exit_list(ops, _exit_list);
>>
>> rcu_barrier(); /* for netlink sock, the ‘deferred_put_nlk_sk’
>> will
>> be called. But sock_inuse has been released. */
>
>
> Thats would be a bug.
>
> Please find another way, but we want ultimately to check that before
> net->core.sock_inuse is freed, folding the inuse count on all cpus is
> 0, to make sure we do not have a bug somewhere.

Yes, I am aware of this issue when we destroy the network
namespace. By the way, we could count the socket-inuse in
sock_alloc or sock_release instead, but in that way we have to
hold the network namespace again (via get_net()) while sock
already holds it.

What do you think of this idea?

> We should not have to test if net->core.sock_inuse is NULL or not from
> sock_inuse_add(). Pointer must be there all the time.
>
> The freeing should only happen once we are sure sock_inuse_add() can
> not be called anymore.
>
>>
>>
>> /* Finally it is safe to free my network namespace structure */
>> list_for_each_entry_safe(net, tmp, _exit_list, exit_list) {}
>>
>>
>>
>> Release the netlink sock created in kernel(not hold the _net_
>> namespace):
>>
>> netlink_release
>>call_rcu(>rcu, deferred_put_nlk_sk);
>>
>> deferred_put_nlk_sk
>>sk_free(sk);
>>
>>
>> I may add a comment for sock_inuse_add in v6.
>
>


Re: [PATCH v5 2/2] sock: Move the socket inuse to namespace.

2017-12-07 Thread Tonghao Zhang
On Fri, Dec 8, 2017 at 1:20 AM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Thu, 2017-12-07 at 08:45 -0800, Tonghao Zhang wrote:
>> In some case, we want to know how many sockets are in use in
>> different _net_ namespaces. It's a key resource metric.
>>
>
> ...
>
>> +static void sock_inuse_add(struct net *net, int val)
>> +{
>> + if (net->core.prot_inuse)
>> + this_cpu_add(*net->core.sock_inuse, val);
>> +}
>
> This is very confusing.
>
> Why testing net->core.prot_inuse for NULL is needed at all ?
>
> Why not testing net->core.sock_inuse instead ?
>
Hi Eric and Cong, oh, it's a typo: it should be
net->core.sock_inuse there. As for why we should check
net->core.sock_inuse, let me show you the code:

cleanup_net will call all of the network namespace exit methods,
rcu_barrier, and then remove the _net_ namespace.

cleanup_net:
list_for_each_entry_reverse(ops, _list, list)
 ops_exit_list(ops, _exit_list);

rcu_barrier(); /* for netlink sock, the ‘deferred_put_nlk_sk’ will
be called. But sock_inuse has been released. */


/* Finally it is safe to free my network namespace structure */
list_for_each_entry_safe(net, tmp, _exit_list, exit_list) {}



Release the netlink sock created in kernel(not hold the _net_ namespace):

netlink_release
   call_rcu(>rcu, deferred_put_nlk_sk);

deferred_put_nlk_sk
   sk_free(sk);


I may add a comment for sock_inuse_add in v6.


Re: [PATCH v4 2/2] sock: Move the socket inuse to namespace.

2017-12-07 Thread Tonghao Zhang
On Sat, Nov 25, 2017 at 10:53 PM, David Miller <da...@davemloft.net> wrote:
> From: Tonghao Zhang <xiangxia.m@gmail.com>
> Date: Wed, 22 Nov 2017 17:51:25 -0800
>
>> This patch add a member in struct netns_core. And this is
>> a counter for socket_inuse in the _net_ namespace. The patch
>> will add/sub counter in the sk_alloc or sk_free. Because socket and
>> sock is in pair. It's a easy way to maintain the code and help
>> developer to review. More important, it avoids holding the _net_
>> namespace again.
>>
>> Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
>> Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
>
> First, it is extremely unclear why this is better.  You do not explain
> the reason at all.
Sorry for the late reply. I have sent v5, whose commit message
should explain the reason.

> Second:
>
>> @@ -2646,17 +2646,8 @@ static int __init sock_init(void)
>>  #ifdef CONFIG_PROC_FS
>>  void socket_seq_show(struct seq_file *seq)
>>  {
>> - int cpu;
>> - int counter = 0;
>> -
>> - for_each_possible_cpu(cpu)
>> - counter += per_cpu(sockets_in_use, cpu);
>> -
>> - /* It can be negative, by the way. 8) */
>> - if (counter < 0)
>> - counter = 0;
>> -
>> - seq_printf(seq, "sockets: used %d\n", counter);
>
> You've deleted the only use of "sockets_in_use" but you have not
> removed that per-cpu variable and it's maintainence.
So sorry :(

> But do not even bother fixing this if you cannot explain properly
> why these changes are an improvement.  I do not understant it,
> and until I do I cannot consider these changes seriously.
Yes, I should have checked the patch more carefully and added
more detail; I will do better. The v5 patch should help. Thanks
so much.

> Thank you.


[PATCH v5 2/2] sock: Move the socket inuse to namespace.

2017-12-07 Thread Tonghao Zhang
In some cases, we want to know how many sockets are in use in
different net namespaces. It's a key resource metric.

This patch adds a member to struct netns_core: a counter of
sockets in use in the net namespace. The counter is updated in
sk_alloc, sk_clone_lock and __sk_free.

The main reasons for doing this are:

1. When Linux calls do_exit for a process, exit_task_namespaces
and exit_task_work are called sequentially. exit_task_namespaces
may already have destroyed the net namespace, but sock_release,
called from exit_task_work, would still touch that namespace if
we counted sockets in sock_release (see the call chain sketched
below).

2. socket and sock come in pairs and, more importantly, sock
holds the net namespace. Counting in sock avoids holding the net
namespace again in socket, and keeps the code easy to maintain.
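
The hazard in reason 1, as an abridged call chain (assuming the
socket's final fput is flushed from exit_task_work()):

	do_exit()
	  exit_task_namespaces()	/* may free the net namespace */
	  exit_task_work()
	    __fput() -> sock_release()	/* would still need that
					   namespace if we counted here */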

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  1 +
 include/net/sock.h   |  1 +
 net/core/sock.c  | 52 ++--
 net/socket.c | 21 ++-
 4 files changed, 54 insertions(+), 21 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 45cfb5d..d1b4748f 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,7 @@ struct netns_core {
 
int sysctl_somaxconn;
 
+   int __percpu *sock_inuse;
struct prot_inuse __percpu *prot_inuse;
 };
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 79e1a2c..0809b31 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1266,6 +1266,7 @@ static inline void sk_sockets_allocated_inc(struct sock 
*sk)
 /* Called with local bh disabled */
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
 int sock_prot_inuse_get(struct net *net, struct proto *proto);
+int sock_inuse_get(struct net *net);
 #else
 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
int inc)
diff --git a/net/core/sock.c b/net/core/sock.c
index c2dd2d3..a11680a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,8 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+static void sock_inuse_add(struct net *net, int val);
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1534,6 +1536,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t 
priority,
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
+   sock_inuse_add(net, 1);
refcount_set(>sk_wmem_alloc, 1);
 
mem_cgroup_sk_alloc(sk);
@@ -1595,6 +1598,8 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+   sock_inuse_add(sock_net(sk), -1);
+
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
sock_diag_broadcast_destroy(sk);
else
@@ -1716,6 +1721,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
newsk->sk_priority = 0;
newsk->sk_incoming_cpu = raw_smp_processor_id();
atomic64_set(>sk_cookie, 0);
+   sock_inuse_add(sock_net(newsk), 1);
 
/*
 * Before updating sk_refcnt, we must commit prior changes to 
memory
@@ -3061,15 +3067,53 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+   if (net->core.prot_inuse)
+   this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+   int cpu, res = 0;
+
+   if (!net->core.prot_inuse)
+   return 0;
+
+   for_each_possible_cpu(cpu)
+   res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+   return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
 static int __net_init sock_inuse_init_net(struct net *net)
 {
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-   return net->core.prot_inuse ? 0 : -ENOMEM;
+   if (!net->core.prot_inuse)
+   return -ENOMEM;
+
+   net->core.sock_inuse = alloc_percpu(int);
+   if (!net->core.sock_inuse)
+   goto out;
+
+   return 0;
+out:
+   free_percpu(net->core.prot_inuse);
+   return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-   free_percpu(net->core.prot_inuse);
+   if (net->core.prot_inuse) {
+   free_percpu(net->core.prot_inuse);
+   net->core.prot_inuse = NULL;
+   }
+
+   if (net->core.sock_inuse) {
+   free_percpu(net->core.sock_inuse);
+   net->core.sock_inuse = NULL;
+   }
+}

[PATCH v5 1/2] sock: Change the netns_core member name.

2017-12-07 Thread Tonghao Zhang
Changing the member name makes the code more readable. The
renamed field will be used by the next patch.

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  2 +-
 net/core/sock.c  | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 0ad4d0c..45cfb5d 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,7 +11,7 @@ struct netns_core {
 
int sysctl_somaxconn;
 
-   struct prot_inuse __percpu *inuse;
+   struct prot_inuse __percpu *prot_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index c0b5b2f..c2dd2d3 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3045,7 +3045,7 @@ struct prot_inuse {
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-   __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+   __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3055,7 +3055,7 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
int res = 0;
 
for_each_possible_cpu(cpu)
-   res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+   res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
return res >= 0 ? res : 0;
 }
@@ -3063,13 +3063,13 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-   net->core.inuse = alloc_percpu(struct prot_inuse);
-   return net->core.inuse ? 0 : -ENOMEM;
+   net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+   return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-   free_percpu(net->core.inuse);
+   free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
1.8.3.1



[PATCH v4 2/2] sock: Move the socket inuse to namespace.

2017-11-22 Thread Tonghao Zhang
This patch adds a member to struct netns_core: a counter of
sockets in use in the net namespace. The counter is updated in
sk_alloc and sk_free. Because socket and sock come in pairs, this
is an easy way to maintain the code, and it helps reviewers. More
importantly, it avoids holding the net namespace again.

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
v3 --> v4:
1. add a noop function for the !CONFIG_PROC_FS case.
2. change __this_cpu_add to this_cpu_add; this was reported by lkp.
reported at: http://patchwork.ozlabs.org/patch/837424/
---
 include/net/netns/core.h |  1 +
 include/net/sock.h   |  1 +
 net/core/sock.c  | 40 +++-
 net/socket.c | 13 ++---
 4 files changed, 43 insertions(+), 12 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 6490b79..1de41f3 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,7 @@ struct netns_core {
int sysctl_somaxconn;
 
struct prot_inuse __percpu *prot_inuse;
+   int __percpu *sock_inuse;
 };
 
 #endif
diff --git a/include/net/sock.h b/include/net/sock.h
index 6f1be97..169a26f 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1263,6 +1263,7 @@ static inline void sk_sockets_allocated_inc(struct sock 
*sk)
 /* Called with local bh disabled */
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
 int sock_prot_inuse_get(struct net *net, struct proto *proto);
+int sock_inuse_get(struct net *net);
 #else
 static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
int inc)
diff --git a/net/core/sock.c b/net/core/sock.c
index b899d86..04bbab1 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,8 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+static void sock_inuse_add(struct net *net, int val);
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1536,6 +1538,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t 
priority,
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
+   sock_inuse_add(net, 1);
refcount_set(>sk_wmem_alloc, 1);
 
mem_cgroup_sk_alloc(sk);
@@ -1597,6 +1600,8 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+   sock_inuse_add(sock_net(sk), -1);
+
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
sock_diag_broadcast_destroy(sk);
else
@@ -1665,6 +1670,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
newsk->sk_backlog.len = 0;
 
+   sock_inuse_add(sock_net(newsk), 1);
atomic_set(>sk_rmem_alloc, 0);
/*
 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
@@ -3060,15 +3066,43 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+   this_cpu_add(*net->core.sock_inuse, val);
+}
+
+int sock_inuse_get(struct net *net)
+{
+   int cpu, res = 0;
+
+   for_each_possible_cpu(cpu)
+   res += *per_cpu_ptr(net->core.sock_inuse, cpu);
+
+   return res >= 0 ? res : 0;
+}
+
+EXPORT_SYMBOL_GPL(sock_inuse_get);
+
 static int __net_init sock_inuse_init_net(struct net *net)
 {
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-   return net->core.prot_inuse ? 0 : -ENOMEM;
+   if (!net->core.prot_inuse)
+   return -ENOMEM;
+
+   net->core.sock_inuse = alloc_percpu(int);
+   if (!net->core.sock_inuse)
+   goto out;
+
+   return 0;
+out:
+   free_percpu(net->core.prot_inuse);
+   return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
free_percpu(net->core.prot_inuse);
+   free_percpu(net->core.sock_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
@@ -3111,6 +3145,10 @@ static inline void assign_proto_idx(struct proto *prot)
 static inline void release_proto_idx(struct proto *prot)
 {
 }
+
+static void sock_inuse_add(struct net *net, int val)
+{
+}
 #endif
 
 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
diff --git a/net/socket.c b/net/socket.c
index b085f14..1bdf364 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2646,17 +2646,8 @@ static int __init sock_init(void)
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-   int cpu;
-   int counter = 0;
-
-   for_each_possible_cpu(cpu)
-  

[PATCH v4 1/2] sock: Change the netns_core member name.

2017-11-22 Thread Tonghao Zhang
Changing the member name makes the code more readable. The
renamed field will be used by the next patch.

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  2 +-
 net/core/sock.c  | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 78eb1ff..6490b79 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -10,7 +10,7 @@ struct netns_core {
 
int sysctl_somaxconn;
 
-   struct prot_inuse __percpu *inuse;
+   struct prot_inuse __percpu *prot_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 03e1b1e..b899d86 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3044,7 +3044,7 @@ struct prot_inuse {
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-   __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+   __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3054,7 +3054,7 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
int res = 0;
 
for_each_possible_cpu(cpu)
-   res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+   res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
return res >= 0 ? res : 0;
 }
@@ -3062,13 +3062,13 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-   net->core.inuse = alloc_percpu(struct prot_inuse);
-   return net->core.inuse ? 0 : -ENOMEM;
+   net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+   return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-   free_percpu(net->core.inuse);
+   free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
1.8.3.1



Re: [PATCH v3 2/2] sock: Move the socket inuse to namespace.

2017-11-16 Thread Tonghao Zhang
On Fri, Nov 17, 2017 at 4:20 AM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> On Wed, Nov 15, 2017 at 7:36 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> diff --git a/net/core/sock.c b/net/core/sock.c
>> index b899d8669388..f01ed0b41bde 100644
>> --- a/net/core/sock.c
>> +++ b/net/core/sock.c
>> @@ -145,6 +145,10 @@
>>  static DEFINE_MUTEX(proto_list_mutex);
>>  static LIST_HEAD(proto_list);
>>
>> +#ifdef CONFIG_PROC_FS
>> +static void sock_inuse_add(struct net *net, int val);
>> +#endif
>> +
>>  /**
>>   * sk_ns_capable - General socket capability test
>>   * @sk: Socket to use a capability on or through
>> @@ -1536,6 +1540,7 @@ struct sock *sk_alloc(struct net *net, int family, 
>> gfp_t priority,
>> if (likely(sk->sk_net_refcnt))
>> get_net(net);
>> sock_net_set(sk, net);
>> +   sock_inuse_add(net, 1);
>
> You don't need to define a nop for sock_inuse_add() in
> !CONFIG_PROC_FS case?
Yes, we should. But CONFIG_PROC_FS cannot be toggled in 'make
menuconfig', so !CONFIG_PROC_FS is a rare case; that is why I
didn't check it there, just like the other counter,
sock_prot_inuse_add. A patch will be sent to handle
!CONFIG_PROC_FS, and v4 will be sent too. Thanks a lot, Cong.
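
The fix that landed in v4 is a noop stub in the #else branch (see
the v4 2/2 diff earlier in this archive):

	#else
	static void sock_inuse_add(struct net *net, int val)
	{
	}
	#endif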


[PATCH v3 1/2] sock: Change the netns_core member name.

2017-11-15 Thread Tonghao Zhang
From: Tonghao Zhang <xiangxia.m@gmail.com>

Changing the member name makes the code more readable. The
renamed field will be used by the next patch.

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  2 +-
 net/core/sock.c  | 10 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 78eb1ff75475..6490b79881d2 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -10,7 +10,7 @@ struct netns_core {
 
int sysctl_somaxconn;
 
-   struct prot_inuse __percpu *inuse;
+   struct prot_inuse __percpu *prot_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index 03e1b1ec0d5b..b899d8669388 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3044,7 +3044,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
 
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-   __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
+   __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
@@ -3054,7 +3054,7 @@ int sock_prot_inuse_get(struct net *net, struct proto 
*prot)
int res = 0;
 
for_each_possible_cpu(cpu)
-   res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+   res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
 
return res >= 0 ? res : 0;
 }
@@ -3062,13 +3062,13 @@ EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 
 static int __net_init sock_inuse_init_net(struct net *net)
 {
-   net->core.inuse = alloc_percpu(struct prot_inuse);
-   return net->core.inuse ? 0 : -ENOMEM;
+   net->core.prot_inuse = alloc_percpu(struct prot_inuse);
+   return net->core.prot_inuse ? 0 : -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
-   free_percpu(net->core.inuse);
+   free_percpu(net->core.prot_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
-- 
2.13.6



[PATCH v3 2/2] sock: Move the socket inuse to namespace.

2017-11-15 Thread Tonghao Zhang
From: Tonghao Zhang <xiangxia.m@gmail.com>

This patch adds a member to struct netns_core: a counter of
sockets in use in the net namespace. The counter is updated in
sk_alloc and sk_free. Because socket and sock come in pairs, this
is an easy way to maintain the code, and it helps reviewers. More
importantly, it avoids holding the net namespace again.

Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 include/net/netns/core.h |  1 +
 net/core/sock.c  | 26 +-
 net/socket.c | 17 +
 3 files changed, 35 insertions(+), 9 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 6490b79881d2..1de41f3a72a1 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,7 @@ struct netns_core {
int sysctl_somaxconn;
 
struct prot_inuse __percpu *prot_inuse;
+   int __percpu *sock_inuse;
 };
 
 #endif
diff --git a/net/core/sock.c b/net/core/sock.c
index b899d8669388..f01ed0b41bde 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -145,6 +145,10 @@
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
+#ifdef CONFIG_PROC_FS
+static void sock_inuse_add(struct net *net, int val);
+#endif
+
 /**
  * sk_ns_capable - General socket capability test
  * @sk: Socket to use a capability on or through
@@ -1536,6 +1540,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t 
priority,
if (likely(sk->sk_net_refcnt))
get_net(net);
sock_net_set(sk, net);
+   sock_inuse_add(net, 1);
refcount_set(>sk_wmem_alloc, 1);
 
mem_cgroup_sk_alloc(sk);
@@ -1597,6 +1602,8 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
+   sock_inuse_add(sock_net(sk), -1);
+
if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
sock_diag_broadcast_destroy(sk);
else
@@ -1665,6 +1672,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
newsk->sk_backlog.len = 0;
 
+   sock_inuse_add(sock_net(newsk), 1);
atomic_set(>sk_rmem_alloc, 0);
/*
 * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
@@ -3048,6 +3056,11 @@ void sock_prot_inuse_add(struct net *net, struct proto 
*prot, int val)
 }
 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
+static void sock_inuse_add(struct net *net, int val)
+{
+   __this_cpu_add(*net->core.sock_inuse, val);
+}
+
 int sock_prot_inuse_get(struct net *net, struct proto *prot)
 {
int cpu, idx = prot->inuse_idx;
@@ -3063,12 +3076,23 @@ EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
 static int __net_init sock_inuse_init_net(struct net *net)
 {
net->core.prot_inuse = alloc_percpu(struct prot_inuse);
-   return net->core.prot_inuse ? 0 : -ENOMEM;
+   if (!net->core.prot_inuse)
+   return -ENOMEM;
+
+   net->core.sock_inuse = alloc_percpu(int);
+   if (!net->core.sock_inuse)
+   goto out;
+
+   return 0;
+out:
+   free_percpu(net->core.prot_inuse);
+   return -ENOMEM;
 }
 
 static void __net_exit sock_inuse_exit_net(struct net *net)
 {
free_percpu(net->core.prot_inuse);
+   free_percpu(net->core.sock_inuse);
 }
 
 static struct pernet_operations net_inuse_ops = {
diff --git a/net/socket.c b/net/socket.c
index c729625eb5d3..90767eb1731d 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -2643,19 +2643,20 @@ static int __init sock_init(void)
 core_initcall(sock_init);  /* early initcall */
 
 #ifdef CONFIG_PROC_FS
-void socket_seq_show(struct seq_file *seq)
+static int socket_inuse_get(struct net *net)
 {
-   int cpu;
-   int counter = 0;
+   int cpu, res = 0;
 
for_each_possible_cpu(cpu)
-   counter += per_cpu(sockets_in_use, cpu);
+   res += *per_cpu_ptr(net->core.sock_inuse, cpu);
 
-   /* It can be negative, by the way. 8) */
-   if (counter < 0)
-   counter = 0;
+   return res >= 0 ? res : 0;
+}
 
-   seq_printf(seq, "sockets: used %d\n", counter);
+void socket_seq_show(struct seq_file *seq)
+{
+   seq_printf(seq, "sockets: used %d\n",
+  socket_inuse_get(seq->private));
 }
 #endif /* CONFIG_PROC_FS */
 
-- 
2.13.6



Re: [PATCH v2 net-next] socket: Move the socket inuse to namespace.

2017-11-15 Thread Tonghao Zhang
On Wed, Nov 15, 2017 at 12:48 AM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> On Tue, Nov 14, 2017 at 3:17 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> Hi cong, we increase the socket inuse in sk_allock ?
>
> How about sock_init_data()?
>
> Or at least you can replace get_net() with read_pnet()?
Hi Cong, I think it is better to count socket-inuse in sk_alloc:
it keeps the counter easy to maintain, and we don't hold the
refcnt again. I have sent v3; can you review it again? Thanks.


Re: [PATCH v2 net-next] socket: Move the socket inuse to namespace.

2017-11-14 Thread Tonghao Zhang
Hi Cong, should we increase the socket-inuse count in sk_alloc?

On Tue, Nov 14, 2017 at 1:58 PM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> On Mon, Nov 13, 2017 at 7:23 PM, 张军伟(基础平台部)
> <zhangjunweimar...@didichuxing.com> wrote:
>>
>>> On 14 Nov 2017, at 10:09, Cong Wang <xiyou.wangc...@gmail.com> wrote:
>>>
>>> On Mon, Nov 13, 2017 at 4:36 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
>>> wrote:
>>>> From: Tonghao Zhang <xiangxia.m@gmail.com>
>>>>
>>>> This patch add a member in struct netns_core. and this is
>>>> a counter for socket_inuse in the _net_ namespace. The patch
>>>> will add/sub counter in the sock_alloc or sock_release. In
>>>> the sock_alloc, the private data of inode saves the special
>>>> _net_. When releasing it, we can access the special _net_ and
>>>> dec the counter of socket in that namespace.
>>>>
>>>> By the way, we dont use the 'current->nsproxy->net_ns' in the
>>>> sock_release. In one case,when one task exits, the 'do_exit'
>>>> may set the current->nsproxy NULL, and then call the sock_release.
>>>> Use the private data of inode, saving few bytes.
>>>
>>> Why do you need to hold netns refcnt for socket? sock already holds
>>> it, so you can just access it by sock_net(sock->sk) ?
>> I think you suggestion is
>> replace get_net(net) ===> with sock_net(sock->sk)
>
> Not literally, but essentially yes.
>
>
>>
>> If thus, I think it could not work.
>> Because sock->sk is assigned after sock_alloc.
>> What we change is done in sock_alloc.
>
> You don't have to do it in sock_alloc().
>
>
>> By the way, sock->sk may has been released(NULL) in sock_release.
>
> My point is since sock is always paired with a sk and sk already
> holds refcnt, you don't need to hold it again, it looks unnecessary.


Re: [PATCH v2 net-next] socket: Move the socket inuse to namespace.

2017-11-13 Thread Tonghao Zhang
Yes, thanks for your advice. I will modify it, test it, and then
submit v3.

On Tue, Nov 14, 2017 at 10:09 AM, Cong Wang <xiyou.wangc...@gmail.com> wrote:
> On Mon, Nov 13, 2017 at 4:36 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> From: Tonghao Zhang <xiangxia.m@gmail.com>
>>
>> This patch add a member in struct netns_core. and this is
>> a counter for socket_inuse in the _net_ namespace. The patch
>> will add/sub counter in the sock_alloc or sock_release. In
>> the sock_alloc, the private data of inode saves the special
>> _net_. When releasing it, we can access the special _net_ and
>> dec the counter of socket in that namespace.
>>
>> By the way, we dont use the 'current->nsproxy->net_ns' in the
>> sock_release. In one case,when one task exits, the 'do_exit'
>> may set the current->nsproxy NULL, and then call the sock_release.
>> Use the private data of inode, saving few bytes.
>
> Why do you need to hold netns refcnt for socket? sock already holds
> it, so you can just access it by sock_net(sock->sk) ?


[PATCH v2 net-next] socket: Move the socket inuse to namespace.

2017-11-13 Thread Tonghao Zhang
From: Tonghao Zhang <xiangxia.m@gmail.com>

This patch adds a member to struct netns_core: a counter of
sockets in use in the net namespace. The counter is incremented
in sock_alloc and decremented in sock_release. In sock_alloc, the
inode's private data saves the owning net; when the socket is
released, we can reach that net and decrement its socket counter
(see the round trip sketched below).

Note that we don't use 'current->nsproxy->net_ns' in
sock_release: when a task exits, 'do_exit' may set
current->nsproxy to NULL before sock_release is called. Using the
inode's private data also saves a few bytes.
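
The round trip, abridged from the diff below:

	sock_alloc():	inode->i_private = get_net(current->nsproxy->net_ns);
			socket_inuse_add(net, 1);
	sock_release():	socket_inuse_add(SOCK_INODE(sock)->i_private, -1);
			put_net(SOCK_INODE(sock)->i_private);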

Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
---
fix typo and comment.
---
 include/net/netns/core.h |  1 +
 net/socket.c | 79 
 2 files changed, 61 insertions(+), 19 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 78eb1ff75475..bef1bc8a1721 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,7 @@ struct netns_core {
int sysctl_somaxconn;
 
struct prot_inuse __percpu *inuse;
+   int __percpu *socket_inuse;
 };
 
 #endif
diff --git a/net/socket.c b/net/socket.c
index c729625eb5d3..8a873d3fb2ab 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -163,12 +163,6 @@ static DEFINE_SPINLOCK(net_family_lock);
 static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
 
 /*
- * Statistics counters of the socket lists
- */
-
-static DEFINE_PER_CPU(int, sockets_in_use);
-
-/*
  * Support routines.
  * Move socket addresses back and forth across the kernel/user
  * divide and look after the messy bits.
@@ -549,6 +543,50 @@ static const struct inode_operations sockfs_inode_ops = {
.setattr = sockfs_setattr,
 };
 
+#ifdef CONFIG_PROC_FS
+static void socket_inuse_add(struct net *net, int val)
+{
+   __this_cpu_add(*net->core.socket_inuse, val);
+}
+
+static int socket_inuse_get(struct net *net)
+{
+   int cpu, res = 0;
+
+   for_each_possible_cpu(cpu)
+   res += *per_cpu_ptr(net->core.socket_inuse, cpu);
+
+   return res >= 0 ? res : 0;
+}
+
+static int __net_init socket_inuse_init_net(struct net *net)
+{
+   net->core.socket_inuse = alloc_percpu(int);
+   return net->core.socket_inuse ? 0 : -ENOMEM;
+}
+
+static void __net_exit socket_inuse_exit_net(struct net *net)
+{
+   free_percpu(net->core.socket_inuse);
+}
+
+static struct pernet_operations socket_inuse_ops = {
+   .init = socket_inuse_init_net,
+   .exit = socket_inuse_exit_net,
+};
+
+static __init int socket_inuse_init(void)
+{
+   if (register_pernet_subsys(_inuse_ops))
+   panic("Cannot initialize socket inuse counters");
+
+   return 0;
+}
+
+core_initcall(socket_inuse_init);
+#endif /* CONFIG_PROC_FS */
+
+
 /**
  * sock_alloc  -   allocate a socket
  *
@@ -561,6 +599,7 @@ struct socket *sock_alloc(void)
 {
struct inode *inode;
struct socket *sock;
+   struct net *net;
 
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
@@ -575,7 +614,15 @@ struct socket *sock_alloc(void)
inode->i_gid = current_fsgid();
inode->i_op = _inode_ops;
 
-   this_cpu_add(sockets_in_use, 1);
+   net = current->nsproxy->net_ns;
+   /*
+* Save the _net_ to private data of inode. When we destroy the
+* socket, we can use it to access the _net_ and dec socket_inuse
+* counter.
+*/
+   inode->i_private = get_net(net);
+   socket_inuse_add(net, 1);
+
return sock;
 }
 EXPORT_SYMBOL(sock_alloc);
@@ -602,7 +649,10 @@ void sock_release(struct socket *sock)
if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
 
-   this_cpu_sub(sockets_in_use, 1);
+   /* inode->i_private saves the _net_ address. */
+   socket_inuse_add(SOCK_INODE(sock)->i_private, -1);
+   put_net(SOCK_INODE(sock)->i_private);
+
if (!sock->file) {
iput(SOCK_INODE(sock));
return;
@@ -2645,17 +2695,8 @@ core_initcall(sock_init);/* early initcall */
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-   int cpu;
-   int counter = 0;
-
-   for_each_possible_cpu(cpu)
-   counter += per_cpu(sockets_in_use, cpu);
-
-   /* It can be negative, by the way. 8) */
-   if (counter < 0)
-   counter = 0;
-
-   seq_printf(seq, "sockets: used %d\n", counter);
+   seq_printf(seq, "sockets: used %d\n",
+  socket_inuse_get(seq->private));
 }
 #endif /* CONFIG_PROC_FS */
 
-- 
2.13.6
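
A note on the clamp in socket_inuse_get() above: with per-CPU
counters, a socket allocated on one CPU but released on another
leaves +1 and -1 in different slots, so a reader racing a release
can compute a transiently negative sum. A minimal userspace sketch
(illustrative only, not kernel code):

#include <stdio.h>

/* Emulate two per-CPU slots. The "racy partial" read models a
 * reader that has summed only one slot so far, which is why
 * socket_inuse_get() clamps its result to zero.
 */
int main(void)
{
	int percpu[2] = { 0, 0 };
	int partial, clamped;

	percpu[0] += 1;		/* sock_alloc() ran on CPU 0 */
	percpu[1] -= 1;		/* sock_release() ran on CPU 1 */

	partial = percpu[1];			/* saw only CPU 1 so far */
	clamped = partial >= 0 ? partial : 0;	/* the clamp in the patch */

	printf("full sum=%d, racy partial=%d, clamped=%d\n",
	       percpu[0] + percpu[1], partial, clamped);
	return 0;
}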



[PATCH] socket: Move the socket inuse to namespace.

2017-11-13 Thread Tonghao Zhang
From: Tonghao Zhang <xiangxia.m@gmail.com>

This patch adds a member to struct netns_core: a counter for
sockets in use in the _net_ namespace. The counter is incremented
in sock_alloc() and decremented in sock_release(). In sock_alloc(),
the private data of the inode saves the owning _net_; when the
socket is released, we can access that _net_ and decrement the
socket counter of that namespace.

By the way, we don't use 'current->nsproxy->net_ns' in
sock_release. In one case, when a task exits, 'do_exit' may set
current->nsproxy to NULL and then call sock_release. Using the
private data of the inode also saves a few bytes.

Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
Signed-off-by: Martin Zhang <zhangjunweimar...@didichuxing.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/net/netns/core.h |  1 +
 net/socket.c | 80 
 2 files changed, 62 insertions(+), 19 deletions(-)

diff --git a/include/net/netns/core.h b/include/net/netns/core.h
index 78eb1ff75475..bef1bc8a1721 100644
--- a/include/net/netns/core.h
+++ b/include/net/netns/core.h
@@ -11,6 +11,7 @@ struct netns_core {
int sysctl_somaxconn;
 
struct prot_inuse __percpu *inuse;
+   int __percpu *socket_inuse;
 };
 
 #endif
diff --git a/net/socket.c b/net/socket.c
index c729625eb5d3..c54f16f06c77 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -163,12 +163,6 @@ static DEFINE_SPINLOCK(net_family_lock);
 static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;
 
 /*
- * Statistics counters of the socket lists
- */
-
-static DEFINE_PER_CPU(int, sockets_in_use);
-
-/*
  * Support routines.
  * Move socket addresses back and forth across the kernel/user
  * divide and look after the messy bits.
@@ -549,6 +543,51 @@ static const struct inode_operations sockfs_inode_ops = {
.setattr = sockfs_setattr,
 };
 
+#ifdef CONFIG_PROC_FS
+static void socket_inuse_add(struct net *net, int val)
+{
+   __this_cpu_add(*net->core.socket_inuse, val);
+}
+
+static int socket_inuse_get(struct net *net)
+{
+   int cpu, res = 0;
+
+   for_each_possible_cpu(cpu) {
+   res += *per_cpu_ptr(net->core.socket_inuse, cpu);
+   }
+
+   return res >= 0 ? res : 0;
+}
+
+static int __net_init socket_inuse_init_net(struct net *net)
+{
+   net->core.socket_inuse = alloc_percpu(int);
+   return net->core.socket_inuse ? 0 : -ENOMEM;
+}
+
+static void __net_exit socket_inuse_exit_net(struct net *net)
+{
+   free_percpu(net->core.socket_inuse);
+}
+
+static struct pernet_operations socket_inuse_ops = {
+   .init = socket_inuse_init_net,
+   .exit = socket_inuse_exit_net,
+};
+
+static __init int socket_inuse_init(void)
+{
+   if (register_pernet_subsys(&socket_inuse_ops))
+   panic("Cannot initialize socket inuse counters");
+
+   return 0;
+}
+
+core_initcall(socket_inuse_init);
+#endif /* CONFIG_PROC_FS */
+
+
 /**
  * sock_alloc  -   allocate a socket
  *
@@ -561,6 +600,7 @@ struct socket *sock_alloc(void)
 {
struct inode *inode;
struct socket *sock;
+   struct net *net;
 
inode = new_inode_pseudo(sock_mnt->mnt_sb);
if (!inode)
@@ -575,7 +615,15 @@ struct socket *sock_alloc(void)
inode->i_gid = current_fsgid();
inode->i_op = &sockfs_inode_ops;
 
-   this_cpu_add(sockets_in_use, 1);
+   net = current->nsproxy->net_ns;
+   /*
+* Save the _net_ to private data of inode. When we destroy the
+* socket, we can use it to access the _net_ and dec socket_inuse
+* counter.
+*/
+   inode->i_private = get_net(net);
+   socket_inuse_add(net, 1);
+
return sock;
 }
 EXPORT_SYMBOL(sock_alloc);
@@ -602,7 +650,10 @@ void sock_release(struct socket *sock)
if (rcu_dereference_protected(sock->wq, 1)->fasync_list)
pr_err("%s: fasync list not empty!\n", __func__);
 
-   this_cpu_sub(sockets_in_use, 1);
+   /* inode->i_private saves the _net_ address. */
+   socket_inuse_add(SOCK_INODE(sock)->i_private, -1);
+   put_net(SOCK_INODE(sock)->i_private);
+
if (!sock->file) {
iput(SOCK_INODE(sock));
return;
@@ -2645,17 +2696,8 @@ core_initcall(sock_init);/* early initcall */
 #ifdef CONFIG_PROC_FS
 void socket_seq_show(struct seq_file *seq)
 {
-   int cpu;
-   int counter = 0;
-
-   for_each_possible_cpu(cpu)
-   counter += per_cpu(sockets_in_use, cpu);
-
-   /* It can be negative, by the way. 8) */
-   if (counter < 0)
-   counter = 0;
-
-   seq_printf(seq, "sockets: used %d\n", counter);
+   seq_printf(seq, "sockets: used %d\n",
+  socket_inuse_get(seq->private));
 }
 #endif /* CONFIG_PROC_FS */
 
-- 
2.13.6
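
The lifetime pairing both versions of this patch rely on can be
shown with a small userspace sketch; struct net, get_net() and
put_net() are emulated here and are not the kernel's definitions:

#include <stdio.h>

/* sock_alloc() takes a reference on the struct net and stashes it
 * in inode->i_private, so sock_release() can decrement the per-net
 * counter without touching current->nsproxy, which do_exit() may
 * already have cleared.
 */
struct net { int refcount; };

static struct net *get_net(struct net *net)
{
	net->refcount++;
	return net;
}

static void put_net(struct net *net)
{
	if (--net->refcount == 0)
		puts("net namespace freed");
}

int main(void)
{
	struct net init_net = { .refcount = 1 };
	void *i_private;		/* stands in for inode->i_private */

	i_private = get_net(&init_net);	/* at sock_alloc() time */
	printf("refcount after alloc: %d\n", init_net.refcount);

	put_net(i_private);		/* at sock_release() time */
	put_net(&init_net);		/* drop the initial reference */
	return 0;
}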



[PATCH] socket: add THIS_MODULE to sock_fs_type.

2017-11-10 Thread Tonghao Zhang
As ->owner is an important member of file_system_type, set it
for sock_fs_type.

Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 net/socket.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/socket.c b/net/socket.c
index c729625..b085f14 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -370,6 +370,7 @@ static struct dentry *sockfs_mount(struct file_system_type 
*fs_type,
 static struct vfsmount *sock_mnt __read_mostly;
 
 static struct file_system_type sock_fs_type = {
+   .owner = THIS_MODULE,
.name = "sockfs",
.mount = sockfs_mount,
.kill_sb =  kill_anon_super,
-- 
1.8.3.1



[PATCH] sock: Remove the global prot_inuse counter.

2017-11-09 Thread Tonghao Zhang
The per-cpu counter for init_net is prepared in a core_initcall.
The patches 7d720c3e ("percpu: add __percpu sparse annotations to net")
and d6d9ca0fe ("net: this_cpu_xxx conversions") optimized these
routines, so remove the old counter now.

Cc: Pavel Emelyanov <xe...@openvz.org>
Signed-off-by: Tonghao Zhang <zhangtong...@didichuxing.com>
---
 net/core/sock.c | 22 --
 1 file changed, 22 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 7594000..03e1b1e 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -3042,7 +3042,6 @@ struct prot_inuse {
 
 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
 
-#ifdef CONFIG_NET_NS
 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
__this_cpu_add(net->core.inuse->val[prot->inuse_idx], val);
@@ -3086,27 +3085,6 @@ static __init int net_inuse_init(void)
 }
 
 core_initcall(net_inuse_init);
-#else
-static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
-
-void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
-{
-   __this_cpu_add(prot_inuse.val[prot->inuse_idx], val);
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-
-int sock_prot_inuse_get(struct net *net, struct proto *prot)
-{
-   int cpu, idx = prot->inuse_idx;
-   int res = 0;
-
-   for_each_possible_cpu(cpu)
-   res += per_cpu(prot_inuse, cpu).val[idx];
-
-   return res >= 0 ? res : 0;
-}
-EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-#endif
 
 static void assign_proto_idx(struct proto *prot)
 {
-- 
1.8.3.1



[PATCH] ixgbe: Remove an obsolete comment about ITR.

2017-10-30 Thread Tonghao Zhang
From: Tonghao Zhang <xiangxia.m@gmail.com>

The InterruptThrottleRate module parameter has been removed from
ixgbe, so update the comment accordingly.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 7f503d3..2a4ff0c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -2534,8 +2534,6 @@ enum latency_range {
  *  based on theoretical maximum wire speed and thresholds were set based
  *  on testing data as well as attempting to minimize response time
  *  while increasing bulk throughput.
- *  this functionality is controlled by the InterruptThrottleRate module
- *  parameter (see ixgbe_param.c)
  **/
 static void ixgbe_update_itr(struct ixgbe_q_vector *q_vector,
 struct ixgbe_ring_container *ring_container)
-- 
1.8.3.1



Re: [PATCH] datapath: Fix an error handling path in 'ovs_nla_init_match_and_action()'

2017-09-12 Thread Tonghao Zhang
On Tue, Sep 12, 2017 at 3:20 AM, Christophe JAILLET
 wrote:
> All other error handling paths in this function go through the 'error'
> label. This one should do the same.
>
> Fixes: 9cc9a5cb176c ("datapath: Avoid using stack larger than 1024.")
> Signed-off-by: Christophe JAILLET 
> ---
> I think that the comment above the function could be improved. It looks
> like the commit log which has introduced this function.
>
> I'm also not sure that commit 9cc9a5cb176c is of any help. It is
> supposed to remove a warning, and I guess it does. But 
> 'ovs_nla_init_match_and_action()'
> is called unconditionnaly from 'ovs_flow_cmd_set()'. So even if the stack
> used by each function is reduced, the overall stack should be the same, if
> not larger.
>
> So this commit sounds like adding a bug where the code was fine and states
> to fix an issue but, at the best, only hides it.
>
> Instead of fixing the code with the proposed patch, reverting the initial
> commit could also be considered.
> ---
>  net/openvswitch/datapath.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
> index 76cf273a56c7..c3aec6227c91 100644
> --- a/net/openvswitch/datapath.c
> +++ b/net/openvswitch/datapath.c
> @@ -1112,7 +1112,8 @@ static int ovs_nla_init_match_and_action(struct net 
> *net,
> if (!a[OVS_FLOW_ATTR_KEY]) {
> OVS_NLERR(log,
>   "Flow key attribute not present in set 
> flow.");
> -   return -EINVAL;
> +   error = -EINVAL;
> +   goto error;

Thanks for your report, but I really don't understand the problem.
In 'ovs_nla_init_match_and_action', we only initialize 'match' when
OVS_FLOW_ATTR_KEY is set.
If 'OVS_FLOW_ATTR_ACTIONS' is set but 'OVS_FLOW_ATTR_KEY' is not, we
can return directly, because the match is not initialized yet and it
is unnecessary to set its mask to NULL. Then ovs_flow_cmd_set can act
on the returned value; the sketch after the quoted diff below
illustrates this flow.


> }
>
> *acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key,
> --
> 2.11.0
>
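
A runnable sketch of the control flow described in the reply above;
the names are illustrative, not the openvswitch code. The early
return is taken before 'match' is initialized, so there is nothing
for the error label to undo:

#include <stdio.h>

static int parse_key(int *match)
{
	*match = 1;		/* pretend the key parsed successfully */
	return 0;
}

static int init_match_and_action(int have_key, int have_actions,
				 int *match)
{
	int error;

	if (have_actions && !have_key)
		return -22;	/* -EINVAL: match untouched, nothing to undo */

	error = parse_key(match);
	if (error)
		goto error;
	return 0;

error:
	*match = 0;		/* undo only what was set up above */
	return error;
}

int main(void)
{
	int match = 0;

	printf("key+actions:  %d\n", init_match_and_action(1, 1, &match));
	printf("actions only: %d\n", init_match_and_action(0, 1, &match));
	return 0;
}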


[PATCH net-next v3 2/2] tcp: Remove the unused parameter for tcp_try_fastopen.

2017-08-22 Thread Tonghao Zhang
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/net/tcp.h   | 3 +--
 net/ipv4/tcp_fastopen.c | 6 ++
 net/ipv4/tcp_input.c| 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index afdab3781425..a995004ae946 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1533,8 +1533,7 @@ int tcp_fastopen_reset_cipher(void *key, unsigned int 
len);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst);
+ struct tcp_fastopen_cookie *foc);
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 struct tcp_fastopen_cookie *cookie);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ce9c7fef200f..e3c33220c418 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -171,7 +171,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff 
*skb)
 
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
  struct sk_buff *skb,
- struct dst_entry *dst,
  struct request_sock *req)
 {
struct tcp_sock *tp;
@@ -278,8 +277,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
  */
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+ struct tcp_fastopen_cookie *foc)
 {
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
@@ -312,7 +310,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct 
sk_buff *skb,
 * data in SYN_RECV state.
 */
 fastopen:
-   child = tcp_fastopen_create_child(sk, skb, dst, req);
+   child = tcp_fastopen_create_child(sk, skb, req);
if (child) {
foc->len = -1;
NET_INC_STATS(sock_net(sk),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 9397e989e1d1..6bd1f0f56e8b 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6151,7 +6151,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
-   fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+   fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
-- 
2.13.4



[PATCH net-next v3 1/2] tcp: Get a proper dst before checking it.

2017-08-22 Thread Tonghao Zhang
tcp_peer_is_proven() needs a proper route to make the
determination, but dst is always NULL. This bug appears
to have been there since the beginning of the git tree.
It does not look serious enough to deserve backports to
stable versions.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/ipv4/tcp_input.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73903fe8c83..9397e989e1d1 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6112,6 +6112,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (tmp_opt.tstamp_ok)
tcp_rsk(req)->ts_off = af_ops->init_ts_off(net, skb);
 
+   dst = af_ops->route_req(sk, &fl, req);
+   if (!dst)
+   goto drop_and_free;
+
if (!want_cookie && !isn) {
/* Kill the following clause, if you dislike this way. */
if (!net->ipv4.sysctl_tcp_syncookies &&
@@ -6132,11 +6136,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
isn = af_ops->init_seq(skb);
}
-   if (!dst) {
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-   }
 
tcp_ecn_create_request(req, skb, sk, dst);
 
-- 
2.13.4



[PATCH net-next v3 0/2] Simplify the tcp_conn_request.

2017-08-22 Thread Tonghao Zhang
Just simplify the tcp_conn_request function.

Tonghao Zhang (2):
  tcp: Get a proper dst before checking it.
  tcp: Remove the unused parameter for tcp_try_fastopen.

 include/net/tcp.h   |  3 +--
 net/ipv4/tcp_fastopen.c |  6 ++
 net/ipv4/tcp_input.c| 11 +--
 3 files changed, 8 insertions(+), 12 deletions(-)

-- 
2.13.4



Re: [PATCH net-next v2 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-21 Thread Tonghao Zhang
On Mon, Aug 21, 2017 at 10:56 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> Please do not top post.
Got it, thanks.

> On Mon, 2017-08-21 at 21:24 +0800, Tonghao Zhang wrote:
>> Thanks, yes, this is a bug. I found it exists from 3.17 through 4.13.
>> The commit is d94e0417
>>
>
> This bug was there at the beginning of git tree.
>
>
>> One question: should I send a patch for each kernel version because
>> of code conflicts?
>>
>> a patch for v4.12
>> a patch for v4.11
>> a patch for v4.10~v4.7
>> a patch for v4.6~v3.17
>>
>> and
>> a patch for  net-next, because tcp_tw_recycle has been removed.
>>
>
> Given this bug only would matter if syncookies are disabled, I would not
> bother and only target net-next. This does not look serious enough to
> deserve backports to stable versions.
>
OK, thanks again.

>> Thanks very much.
>>
>> On Sun, Aug 20, 2017 at 12:25 PM, David Miller <da...@davemloft.net> wrote:
>> > From: Tonghao Zhang <xiangxia.m@gmail.com>
>> > Date: Wed, 16 Aug 2017 20:02:45 -0700
>> >
>> >> Because we remove the tcp_tw_recycle support in the commit
>> >> 4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
>> >> the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
>> >> Now when we call the 'af_ops->route_req', the dst always is
>> >> NULL, and we remove the unnecessary check.
>> >>
>> >> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>> >
>> > This is a bug actually, rather than something to paper over
>> > by removing the check.
>> >
>> > Code earlier in this function needs a proper 'dst' in order to operate
>> > properly.
>> >
>> > There is a call to tcp_peer_is_proven() which must have a proper route
>> > to make the determination yet it will always be NULL.
>> >
>> > Please investigate what the code is doing and how a test became
>> > "unnecessary" over time before blindly removing it, thank you.
>
>


Re: [PATCH net-next v2 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-21 Thread Tonghao Zhang
Thanks, yes, this is a bug. I found it exists from 3.17 through 4.13.
The commit is d94e0417

One question: should I send a patch for each kernel version because
of code conflicts?

a patch for v4.12
a patch for v4.11
a patch for v4.10~v4.7
a patch for v4.6~v3.17

and
a patch for  net-next, because tcp_tw_recycle has been removed.

Thanks very much.

On Sun, Aug 20, 2017 at 12:25 PM, David Miller <da...@davemloft.net> wrote:
> From: Tonghao Zhang <xiangxia.m@gmail.com>
> Date: Wed, 16 Aug 2017 20:02:45 -0700
>
>> Because we remove the tcp_tw_recycle support in the commit
>> 4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
>> the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
>> Now when we call the 'af_ops->route_req', the dst always is
>> NULL, and we remove the unnecessary check.
>>
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>
> This is a bug actually, rather than something to paper over
> by removing the check.
>
> Code earlier in this function needs a proper 'dst' in order to operate
> properly.
>
> There is a call to tcp_peer_is_proven() which must have a proper route
> to make the determination yet it will always be NULL.
>
> Please investigate what the code is doing and how a test became
> "unnecessary" over time before blindly removing it, thank you.


[PATCH net-next v2 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
Because we removed the tcp_tw_recycle support in commit
4396e46187c ('tcp: remove tcp_tw_recycle'), and also deleted
the 'af_ops->route_req' code for sysctl_tw_recycle in tcp_conn_request,
dst is always NULL when we now call 'af_ops->route_req', so
remove the unnecessary check.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/ipv4/tcp_input.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73903f..7eee2c7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6132,11 +6132,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
isn = af_ops->init_seq(skb);
}
-   if (!dst) {
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-   }
+
+   dst = af_ops->route_req(sk, &fl, req);
+   if (!dst)
+   goto drop_and_free;
 
tcp_ecn_create_request(req, skb, sk, dst);
 
-- 
1.8.3.1



[PATCH net-next v2 2/2] tcp: Remove the unused parameter for tcp_try_fastopen.

2017-08-16 Thread Tonghao Zhang
The parameter dst of tcp_try_fastopen and tcp_fastopen_create_child
is not used anymore; remove it.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/net/tcp.h   | 3 +--
 net/ipv4/tcp_fastopen.c | 6 ++
 net/ipv4/tcp_input.c| 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index afdab37..a995004 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1533,8 +1533,7 @@ struct tcp_fastopen_request {
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst);
+ struct tcp_fastopen_cookie *foc);
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 struct tcp_fastopen_cookie *cookie);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ce9c7fe..e3c3322 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -171,7 +171,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff 
*skb)
 
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
  struct sk_buff *skb,
- struct dst_entry *dst,
  struct request_sock *req)
 {
struct tcp_sock *tp;
@@ -278,8 +277,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
  */
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+ struct tcp_fastopen_cookie *foc)
 {
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
@@ -312,7 +310,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct 
sk_buff *skb,
 * data in SYN_RECV state.
 */
 fastopen:
-   child = tcp_fastopen_create_child(sk, skb, dst, req);
+   child = tcp_fastopen_create_child(sk, skb, req);
if (child) {
foc->len = -1;
NET_INC_STATS(sock_net(sk),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7eee2c7..21df086 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6151,7 +6151,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
-   fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
+   fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc);
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
-- 
1.8.3.1



[PATCH net-next v2 0/2] simplify the tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
These patches are not bugfixes; they just simplify the
tcp_conn_request function.

These patches are based on Davem's net-next tree.

Tonghao Zhang (2):
  tcp: Remove unnecessary dst check in tcp_conn_request.
  tcp: Remove the unused parameter for tcp_try_fastopen.

 include/net/tcp.h   |  3 +--
 net/ipv4/tcp_fastopen.c |  6 ++
 net/ipv4/tcp_input.c| 11 +--
 3 files changed, 8 insertions(+), 12 deletions(-)

-- 
1.8.3.1



Re: [PATCH 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
On Wed, Aug 16, 2017 at 10:44 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Wed, 2017-08-16 at 06:31 -0700, Tonghao Zhang wrote:
>> Because we remove the tcp_tw_recycle support in the commit
>
>
>> 4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
>> the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
>> Now when we call the 'af_ops->route_req', the dst always is
>> NULL, and we remove the unnecessary check.
>
> Thanks for these patches.
>
> You forgot :
>
> 1) a cover letter ( [PATCH net-next 0/2] tcp: 
>
> 2) clearly state which tree you are targeting
> ( read Documentation/networking/netdev-FAQ.txt )
Thanks, I do this in v2

> 3) Also, I would also have removed tcp_peer_is_proven()
> since it is also called with dst=NULL
Thanks for your work.

>
>


[PATCH net-next v2 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
Because we removed the tcp_tw_recycle support in commit
4396e46187c ('tcp: remove tcp_tw_recycle'), and also deleted
the 'af_ops->route_req' code for sysctl_tw_recycle in tcp_conn_request,
dst is always NULL when we now call 'af_ops->route_req', so
remove the unnecessary check.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/ipv4/tcp_input.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73903f..7eee2c7 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6132,11 +6132,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
isn = af_ops->init_seq(skb);
}
-   if (!dst) {
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-   }
+
+   dst = af_ops->route_req(sk, &fl, req);
+   if (!dst)
+   goto drop_and_free;
 
tcp_ecn_create_request(req, skb, sk, dst);
 
-- 
1.8.3.1



[PATCH net-next v2 0/2] simplify the tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
These patches are not bugfixes; they just simplify the
tcp_conn_request function.

These patches are based on Davem's net-next tree.

Tonghao Zhang (2):
  tcp: Remove unnecessary dst check in tcp_conn_request.
  tcp: Remove the unused parameter for tcp_try_fastopen.

 include/net/tcp.h   |  3 +--
 net/ipv4/tcp_fastopen.c |  6 ++
 net/ipv4/tcp_input.c| 11 +--
 3 files changed, 8 insertions(+), 12 deletions(-)

-- 
1.8.3.1



[PATCH 2/2] tcp: Remove the unused parameter for tcp_try_fastopen.

2017-08-16 Thread Tonghao Zhang
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/net/tcp.h   | 3 +--
 net/ipv4/tcp_fastopen.c | 6 ++
 net/ipv4/tcp_input.c| 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index afdab3781425..a995004ae946 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1533,8 +1533,7 @@ int tcp_fastopen_reset_cipher(void *key, unsigned int 
len);
 void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb);
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst);
+ struct tcp_fastopen_cookie *foc);
 void tcp_fastopen_init_key_once(bool publish);
 bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
 struct tcp_fastopen_cookie *cookie);
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index ce9c7fef200f..e3c33220c418 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -171,7 +171,6 @@ void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff 
*skb)
 
 static struct sock *tcp_fastopen_create_child(struct sock *sk,
  struct sk_buff *skb,
- struct dst_entry *dst,
  struct request_sock *req)
 {
struct tcp_sock *tp;
@@ -278,8 +277,7 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
  */
 struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
  struct request_sock *req,
- struct tcp_fastopen_cookie *foc,
- struct dst_entry *dst)
+ struct tcp_fastopen_cookie *foc)
 {
struct tcp_fastopen_cookie valid_foc = { .len = -1 };
bool syn_data = TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq + 1;
@@ -312,7 +310,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct 
sk_buff *skb,
 * data in SYN_RECV state.
 */
 fastopen:
-   child = tcp_fastopen_create_child(sk, skb, dst, req);
+   child = tcp_fastopen_create_child(sk, skb, req);
if (child) {
foc->len = -1;
NET_INC_STATS(sock_net(sk),
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 7eee2c7ddb7a..21df0868f206 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6151,7 +6151,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
-   fastopen_sk = tcp_try_fastopen(sk, skb, req, , dst);
+   fastopen_sk = tcp_try_fastopen(sk, skb, req, );
}
if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, , req,
-- 
2.13.4



[PATCH 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-16 Thread Tonghao Zhang
Because we removed the tcp_tw_recycle support in commit
4396e46187c ('tcp: remove tcp_tw_recycle'), and also deleted
the 'af_ops->route_req' code for sysctl_tw_recycle in tcp_conn_request,
dst is always NULL when we now call 'af_ops->route_req', so
remove the unnecessary check.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/ipv4/tcp_input.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index d73903fe8c83..7eee2c7ddb7a 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -6132,11 +6132,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
 
isn = af_ops->init_seq(skb);
}
-   if (!dst) {
-   dst = af_ops->route_req(sk, &fl, req);
-   if (!dst)
-   goto drop_and_free;
-   }
+
+   dst = af_ops->route_req(sk, &fl, req);
+   if (!dst)
+   goto drop_and_free;
 
tcp_ecn_create_request(req, skb, sk, dst);
 
-- 
2.13.4



[PATCH] net: Fix a typo in comment about sock flags.

2017-08-15 Thread Tonghao Zhang
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 include/linux/net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index b5c15b3..d97d80d 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -37,7 +37,7 @@
 
 /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located
  * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected.
- * Eventually all flags will be in sk->sk_wq_flags.
+ * Eventually all flags will be in sk->sk_wq->flags.
  */
 #define SOCKWQ_ASYNC_NOSPACE   0
 #define SOCKWQ_ASYNC_WAITDATA  1
-- 
1.8.3.1



[PATCH net-next] net: skb_needs_check() removes CHECKSUM_UNNECESSARY check for tx.

2017-08-10 Thread Tonghao Zhang
Because we removed the UFO support, also remove the
CHECKSUM_UNNECESSARY check in skb_needs_check().

Cc: Willem de Bruijn <will...@google.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/core/dev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 3f69f6e..1024d37 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2731,8 +2731,7 @@ struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 {
if (tx_path)
-   return skb->ip_summed != CHECKSUM_PARTIAL &&
-  skb->ip_summed != CHECKSUM_UNNECESSARY;
+   return skb->ip_summed != CHECKSUM_PARTIAL;
 
return skb->ip_summed == CHECKSUM_NONE;
 }
-- 
1.8.3.1



Re: [PATCH net-next] net: skb_needs_check() removes CHECKSUM_NONE check for tx.

2017-08-10 Thread Tonghao Zhang
Hi Willem, because we changed CHECKSUM_NONE to CHECKSUM_UNNECESSARY
in skb_needs_check(), I can't cleanly revert 6e7bc478c9a0
("net: skb_needs_check() accepts CHECKSUM_NONE for tx"). I changed it
directly and resubmitted it.

On Fri, Aug 11, 2017 at 1:03 AM, Willem de Bruijn
<willemdebruijn.ker...@gmail.com> wrote:
> On Wed, Aug 9, 2017 at 6:02 PM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> Thanks for your work.
>
> You, too.
>
>> On Thu, Aug 10, 2017 at 2:30 AM, Willem de Bruijn
>> <willemdebruijn.ker...@gmail.com> wrote:
>>> On Wed, Aug 9, 2017 at 5:04 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
>>> wrote:
>>>> This patch reverts the commit 6e7bc478c9a0
>>>> ("net: skb_needs_check() accepts CHECKSUM_NONE for tx"),
>>>> because we removed the UFO support.
>>>>
>>>> Cc: Eric Dumazet <eduma...@google.com>
>>>> Cc: Willem de Bruijn <will...@google.com>
>>>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>>>
>>>
>>> I would wait until net is merged into net-next. This will cause a conflict.
>>>
>>> Also, while logically equivalent, it is not a real revert (as in `git
>>> revert $SHA1`) of that patch.
>>>
>>> Aside from those concerns, I agree that the original patch is no
>>> longer needed now that UFO is reverted.
>
> Please do resubmit the revert patch once net has been merged into net-next.


Re: [PATCH net-next] skbuff: Add BUG_ON in skb_init.

2017-08-10 Thread Tonghao Zhang
Thanks a lot. I found it when reviewing this code, and a BUG_ON is
also added in init_inodecache(). Should we remove it or not?

diff --git a/net/socket.c b/net/socket.c
index b332d1e..ebee3ee 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -296,7 +296,6 @@ static void init_inodecache(void)
   SLAB_RECLAIM_ACCOUNT |
   SLAB_MEM_SPREAD | SLAB_ACCOUNT),
  init_once);
-   BUG_ON(sock_inode_cachep == NULL);
 }

 static const struct super_operations sockfs_ops = {

On Thu, Aug 10, 2017 at 1:49 PM, David Miller <da...@davemloft.net> wrote:
> From: Tonghao Zhang <xiangxia.m@gmail.com>
> Date: Wed,  9 Aug 2017 05:04:38 -0700
>
>> When initializing the skbuff SLAB cache, we should make
>> sure it is successful. Adding BUG_ON to check it and
>> init_inodecache() is in the same case.
>>
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>> ---
>>  net/core/skbuff.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index 42b62c716a33..9513de519870 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -3904,6 +3904,8 @@ void __init skb_init(void)
>>   0,
>>   SLAB_HWCACHE_ALIGN|SLAB_PANIC,
>>   NULL);
>> + BUG_ON(skbuff_head_cache == NULL);
>> + BUG_ON(skbuff_fclone_cache == NULL);
>>  }
>
> I know you guys want every allocation to be explicitly checked so that
> everything is consistent for static code analysis checkers.
>
> But this is just wasted code.
>
> The first allocation will take a NULL dereference and the backtrace
> will make it completely clear which SLAB cache was NULL and couldn't
> be allocated.
>
> So there is no real value to adding these checks.
>
> So I'm not applying this, sorry.
>
> The same logic goes for your other patch of this nature.
>


Re: [PATCH net-next] net: skb_needs_check() removes CHECKSUM_NONE check for tx.

2017-08-09 Thread Tonghao Zhang
Thanks for your work.

On Thu, Aug 10, 2017 at 2:30 AM, Willem de Bruijn
<willemdebruijn.ker...@gmail.com> wrote:
> On Wed, Aug 9, 2017 at 5:04 AM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> This patch reverts the commit 6e7bc478c9a0
>> ("net: skb_needs_check() accepts CHECKSUM_NONE for tx"),
>> because we removed the UFO support.
>>
>> Cc: Eric Dumazet <eduma...@google.com>
>> Cc: Willem de Bruijn <will...@google.com>
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>
>
> I would wait until net is merged into net-next. This will cause a conflict.
>
> Also, while logically equivalent, it is not a real revert (as in `git
> revert $SHA1`) of that patch.
>
> Aside from those concerns, I agree that the original patch is no
> longer needed now that UFO is reverted.


Re: [PATCH] net: Reduce skb_warn_bad_offload() noise.

2017-08-09 Thread Tonghao Zhang
Thanks, I send a patch, which will revert commit 6e7bc478c9a0 in net-next.

On Wed, Aug 9, 2017 at 1:46 AM, Willem de Bruijn
 wrote:
>>> @@ -2670,6 +2670,7 @@ static inline bool skb_needs_check(struct
>>> sk_buff *skb, bool tx_path)
>>>  {
>>> if (tx_path)
>>> return skb->ip_summed != CHECKSUM_PARTIAL &&
>>> +  skb->ip_summed != CHECKSUM_UNNECESSARY &&
>>>skb->ip_summed != CHECKSUM_NONE;
>>
>> Good catch. Only, the CHECKSUM_NONE case was added specifically to
>> work around this UFO issue on the tx path in commit 6e7bc478c9a0
>> ("net: skb_needs_check() accepts CHECKSUM_NONE for tx"). If we change
>> the value generated by UFO, we can remove that statement, so
>>
>> +  skb->ip_summed != CHECKSUM_UNNECESSARY;
>> -skb->ip_summed != CHECKSUM_NONE;
>>
>> Else the entire check becomes a NOOP. These are the only three valid
>> states on tx. With very few codepaths generating CHECKSUM_UNNECESSARY
>> to begin with, it arguably already is practically a NOOP. I need to
>> look more closely what the statement is intended to protect against,
>> before we relax it even further.
>
> On transmit, packets entering skb_gso_segment are expected to always
> have ip_summed CHECKSUM_PARTIAL. This check was added to track down
> unexpected exceptions in commit 67fd1a731ff1 ("net: Add debug info to
> track down GSO checksum bug").
>
> Only when called for the second time, after skb_mac_gso_segment, do we
> have to possibly handle the case where the GSO layer computes the
> checksum and changes ip_summed.
>
> Since this only goes into 4.11 to 4.13, making two separate
> skb_needs_check variants for these two call sites seems overkill. I
> will send the simple fix to convert CHECKSUM_NONE to
> CHECKSUM_UNNECESSARY.
>
> As a side effect of removing UFO in 4.14-rc1, we can also revert
> commit 6e7bc478c9a0 ("net: skb_needs_check() accepts CHECKSUM_NONE for
> tx") in net-next.


[PATCH net-next] net: skb_needs_check() removes CHECKSUM_NONE check for tx.

2017-08-09 Thread Tonghao Zhang
This patch reverts the commit 6e7bc478c9a0
("net: skb_needs_check() accepts CHECKSUM_NONE for tx"),
because we removed the UFO support.

Cc: Eric Dumazet <eduma...@google.com>
Cc: Willem de Bruijn <will...@google.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/core/dev.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 1d75499add72..1024d3741d12 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2731,8 +2731,7 @@ EXPORT_SYMBOL(skb_mac_gso_segment);
 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
 {
if (tx_path)
-   return skb->ip_summed != CHECKSUM_PARTIAL &&
-  skb->ip_summed != CHECKSUM_NONE;
+   return skb->ip_summed != CHECKSUM_PARTIAL;
 
return skb->ip_summed == CHECKSUM_NONE;
 }
-- 
2.13.4



[PATCH net-next] skbuff: Add BUG_ON in skb_init.

2017-08-09 Thread Tonghao Zhang
When initializing the skbuff SLAB caches, we should make
sure the allocations are successful. Add BUG_ON to check them;
init_inodecache() does the same.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/core/skbuff.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 42b62c716a33..9513de519870 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3904,6 +3904,8 @@ void __init skb_init(void)
0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC,
NULL);
+   BUG_ON(skbuff_head_cache == NULL);
+   BUG_ON(skbuff_fclone_cache == NULL);
 }
 
 static int
-- 
2.13.4



Re: [PATCH] net: Reduce skb_warn_bad_offload() noise.

2017-08-07 Thread Tonghao Zhang
[ 1291.596371]  do_softirq_own_stack+0x1c/0x30
[ 1291.596372]  
[ 1291.596374]  do_softirq+0x50/0x60
[ 1291.596375]  __local_bh_enable_ip+0x5a/0x70
[ 1291.596377]  ip_finish_output2+0x15e/0x390
[ 1291.596380]  ip_finish_output+0x136/0x1e0
[ 1291.596382]  ip_output+0x76/0xe0
[ 1291.596384]  ? ip_fragment.constprop.53+0x80/0x80
[ 1291.596386]  ip_local_out+0x35/0x40
[ 1291.596387]  ip_send_skb+0x19/0x40
[ 1291.596390]  udp_send_skb+0x172/0x280
[ 1291.596391]  udp_sendmsg+0x2cf/0xa60
[ 1291.596393]  ? ip_reply_glue_bits+0x50/0x50
[ 1291.596396]  ? sock_has_perm+0x75/0x90
[ 1291.596398]  inet_sendmsg+0x31/0xb0
[ 1291.596399]  sock_sendmsg+0x38/0x50
[ 1291.596401]  sock_write_iter+0x85/0xf0
[ 1291.596403]  __vfs_write+0xe3/0x160
[ 1291.596405]  vfs_write+0xb2/0x1b0
[ 1291.596407]  ? syscall_trace_enter+0x1d0/0x2b0
[ 1291.596408]  SyS_write+0x55/0xc0
[ 1291.596411]  do_syscall_64+0x67/0x150
[ 1291.596412]  entry_SYSCALL64_slow_path+0x25/0x25
[ 1291.596413] RIP: 0033:0x7f8d931cdc60
[ 1291.596414] RSP: 002b:7ffc0790e9c8 EFLAGS: 0246 ORIG_RAX:
0001
[ 1291.596415] RAX: ffda RBX: 2800 RCX: 7f8d931cdc60
[ 1291.596417] RDX: 2800 RSI: 7f8d93bc9000 RDI: 0012
[ 1291.596417] RBP: 7f8d93bc9000 R08:  R09: 003d
[ 1291.596418] R10: 0001 R11: 0246 R12: 0012
[ 1291.596419] R13: 2800 R14: 0001 R15: 7ffc0790ea40
[ 1291.596420] Code: 8d 94 24 d0 00 00 00 48 0f 44 ce 4d 85 e4 44 89
54 24 08 48 0f 44 d6 89 3c 24 48 89 c6 48 c7 c7 80 65 ac 8f 31 c0 e8
a7 bd a5 ff <0f> ff 48 83 c4 18 5b 41 5c 5d c3 55 48 89 f2 31 c0 89 fe
48 c7
[ 1291.596447] ---[ end trace b66534f41f98ceda ]---


On Tue, Aug 8, 2017 at 7:44 AM, Tonghao Zhang <xiangxia.m@gmail.com> wrote:
> That is fine with me. I have tested it. Thanks.
>
> On Mon, Aug 7, 2017 at 12:42 PM, Willem de Bruijn
> <willemdebruijn.ker...@gmail.com> wrote:
>>> The openvswitch kernel module calls the __skb_gso_segment()(and sets
>>> tx_path = false) when passing packets to userspace. The UFO will set
>>> the ip_summed to CHECKSUM_NONE. There are a lot of warn logs. The warn
>>> log is shown as below. I guess we should revert the patch.
>>
>> Indeed, the software UFO code computes the checksum and
>> sets ip_summed to CHECKSUM_NONE, as is correct on the
>> egress path.
>>
>> Commit 6e7bc478c9a0 ("net: skb_needs_check() accepts
>> CHECKSUM_NONE for tx") revised the tx_path case in
>> skb_needs_check to avoid the warning exactly for the UFO case.
>>
>> We cannot just make an exception for CHECKSUM_NONE in the
>> !tx_path case, as the entire statement then becomes false:
>>
>>return skb->ip_summed == CHECKSUM_NONE;
>>
>> Since on egress CHECKSUM_UNNECESSARY is equivalent to
>> CHECKSUM_NONE, it should be fine to update the UFO code
>> to set that, instead:
>>
>> @@ -235,7 +235,7 @@ static struct sk_buff *udp4_ufo_fragment(struct
>> sk_buff *skb,
>> if (uh->check == 0)
>> uh->check = CSUM_MANGLED_0;
>>
>> -   skb->ip_summed = CHECKSUM_NONE;
>> +   skb->ip_summed = CHECKSUM_UNNECESSARY;


Re: [PATCH] net: Reduce skb_warn_bad_offload() noise.

2017-08-07 Thread Tonghao Zhang
That is fine with me. I have tested it. Thanks.

On Mon, Aug 7, 2017 at 12:42 PM, Willem de Bruijn
 wrote:
>> The openvswitch kernel module calls the __skb_gso_segment()(and sets
>> tx_path = false) when passing packets to userspace. The UFO will set
>> the ip_summed to CHECKSUM_NONE. There are a lot of warn logs. The warn
>> log is shown as below. I guess we should revert the patch.
>
> Indeed, the software UFO code computes the checksum and
> sets ip_summed to CHECKSUM_NONE, as is correct on the
> egress path.
>
> Commit 6e7bc478c9a0 ("net: skb_needs_check() accepts
> CHECKSUM_NONE for tx") revised the tx_path case in
> skb_needs_check to avoid the warning exactly for the UFO case.
>
> We cannot just make an exception for CHECKSUM_NONE in the
> !tx_path case, as the entire statement then becomes false:
>
>return skb->ip_summed == CHECKSUM_NONE;
>
> Since on egress CHECKSUM_UNNECESSARY is equivalent to
> CHECKSUM_NONE, it should be fine to update the UFO code
> to set that, instead:
>
> @@ -235,7 +235,7 @@ static struct sk_buff *udp4_ufo_fragment(struct
> sk_buff *skb,
> if (uh->check == 0)
> uh->check = CSUM_MANGLED_0;
>
> -   skb->ip_summed = CHECKSUM_NONE;
> +   skb->ip_summed = CHECKSUM_UNNECESSARY;
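
For reference, a simplified model of the ip_summed states discussed
in this thread. The enum values follow include/linux/skbuff.h, but
the helper is only an illustration of the tx check as revised here,
not the kernel's skb_needs_check():

#include <stdio.h>

/* On egress CHECKSUM_UNNECESSARY is equivalent to CHECKSUM_NONE,
 * so once UFO is changed to set UNNECESSARY, the tx check only
 * needs to accept PARTIAL ("hardware will fill the checksum in")
 * and UNNECESSARY.
 */
enum {
	CHECKSUM_NONE,		/* 0 */
	CHECKSUM_UNNECESSARY,	/* 1 */
	CHECKSUM_COMPLETE,	/* 2 */
	CHECKSUM_PARTIAL	/* 3 */
};

static int needs_check_tx(int ip_summed)
{
	return ip_summed != CHECKSUM_PARTIAL &&
	       ip_summed != CHECKSUM_UNNECESSARY;
}

int main(void)
{
	printf("NONE needs check: %d\n", needs_check_tx(CHECKSUM_NONE));
	printf("UNNECESSARY needs check: %d\n",
	       needs_check_tx(CHECKSUM_UNNECESSARY));
	printf("PARTIAL needs check: %d\n",
	       needs_check_tx(CHECKSUM_PARTIAL));
	return 0;
}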


Re: [PATCH] net: Reduce skb_warn_bad_offload() noise.

2017-08-06 Thread Tonghao Zhang
On Fri, Aug 4, 2017 at 9:29 PM, Eric Dumazet <eric.duma...@gmail.com> wrote:
> On Fri, 2017-08-04 at 06:11 -0700, Tonghao Zhang wrote:
>> This patch will revert the b2504a5dbe "net: reduce
>> skb_warn_bad_offload() noise". The ovs will call the
>> __skb_gso_segment() with tx false. When segmenting UDP with UFO,
>> the __skb_gso_segment raises a warning as below [1], because the
>> ip_summed is CHECKSUM_NONE. While the net-next has removed the
>> UFO support, but the 4.11 and 4.12 kernel don't address that problem.
>>
>> In the kernel, only qdisc_pkt_len_init() (__dev_queue_xmit call it.)
>> uses the SKB_GSO_DODGY to do something. Other places just set it.
>> The warn described in b2504a5dbe is shown [2]. We may know that:
>> 1. the net_device don’t have qdisc.
>> 2. the skb->ip_summed was changed to CHECKSUM_NONE. it maybe
>> changed in skb_checksum_help() when calling validate_xmit_skb().
>> or other place.
>>
>> And we should not revert the  6e7bc478c9 "net: skb_needs_check() accepts
>> CHECKSUM_NONE for tx". The check is necessary.
>
> Why is it necessary ?
I am not familiar with __skb_gso_segment() when tx_path == true. I
should say sorry to you.

>
> If you revert b2504a5dbe, then we also need to revert 6e7bc478c9,
> unless you provide hard facts.
>
The openvswitch kernel module calls the __skb_gso_segment()(and sets
tx_path = false) when passing packets to userspace. The UFO will set
the ip_summed to CHECKSUM_NONE. There are a lot of warn logs. The warn
log is shown as below. I guess we should revert the patch.


>>
>> [1]
>> [321428.168903] WARNING: CPU: 0 PID: 2279 at net/core/dev.c:2562
>> skb_warn_bad_offload+0xc4/0x110
>> [321428.168906] san0: caps=(0x04009fbb58e9, 0x) len=6769
>> data_len=6727 gso_size=1480 gso_type=2 ip_summed=0
>>
>> [321428.168955] CPU: 0 PID: 2279 Comm: ruby-mri 4.11.12-200.fc25.x86_64 #1
>> [321428.168956] Hardware name: Supermicro SYS-1028U-TNRTP+/X10DRU-i+, BIOS 
>> 1.1 07/22/2015
>> [321428.168957] Call Trace:
>> [321428.168962]  dump_stack+0x63/0x86
>> [321428.168965]  __warn+0xcb/0xf0
>> [321428.168966]  warn_slowpath_fmt+0x5a/0x80
>> [321428.168968]  skb_warn_bad_offload+0xc4/0x110
>> [321428.168970]  __skb_gso_segment+0x190/0x1a0
>> [321428.168977]  queue_gso_packets+0x62/0x160 [openvswitch]
>> [321428.168992]  ovs_dp_upcall+0x31/0x60 [openvswitch]
>> [321428.168994]  ovs_dp_process_packet+0x10d/0x130 [openvswitch]
>> [321428.168997]  ovs_vport_receive+0x76/0xd0 [openvswitch]
>> [321428.169013]  internal_dev_xmit+0x28/0x60 [openvswitch]
>> [321428.169014]  dev_hard_start_xmit+0xa3/0x1f0
>> [321428.169016]  __dev_queue_xmit+0x592/0x650
>> [321428.169026]  dev_queue_xmit+0x10/0x20
>>
>> [2]
>> WARNING: CPU: 1 PID: 6768 at net/core/dev.c:2439 
>> skb_warn_bad_offload+0x2af/0x390 net/core/dev.c:2434
>> lo: caps=(0x00a2803b7c69, 0x) len=138 data_len=0 
>> gso_size=15883 gso_type=4 ip_summed=0
>> Kernel panic - not syncing: panic_on_warn set ...
>>
>> CPU: 1 PID: 6768 Comm: syz-executor1 Not tainted 4.9.0 #5
>> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
>> Google 01/01/2011
>> 8801c063ecd8 82346bdf 0001 1100380c7d2e
>> ed00380c7d26 41b58ab3 84b37e38 823468f1
>> 84820740 84f289c0 dc00 8801c063ee20
>> Call Trace:
>
> Why are you adding this trace that was part of the b2504a5dbef3
> changelog ?
I hope we can fix the bug with another solution.

>
>> [] __dump_stack lib/dump_stack.c:15 [inline]
>> [] dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
>> [] panic+0x1fb/0x412 kernel/panic.c:179
>> [] __warn+0x1c4/0x1e0 kernel/panic.c:542
>> [] warn_slowpath_fmt+0xc5/0x100 kernel/panic.c:565
>> [] skb_warn_bad_offload+0x2af/0x390 net/core/dev.c:2434
>> [] __skb_gso_segment+0x482/0x780 net/core/dev.c:2706
>> [] skb_gso_segment include/linux/netdevice.h:3985 [inline]
>> [] validate_xmit_skb+0x5c9/0xc20 net/core/dev.c:2969
>> [] __dev_queue_xmit+0xe6b/0x1e70 net/core/dev.c:3383
>> [] dev_queue_xmit+0x17/0x20 net/core/dev.c:3424
>>
>> Cc: Eric Dumazet <eduma...@google.com>
>> Cc: Willem de Bruijn <will...@google.com>
>> Cc: Pravin B Shelar <pshe...@ovn.org>
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>> ---
>
>
> IMO, this description is too confusing, I do not understand this patch.
sorry


[PATCH] net: Reduce skb_warn_bad_offload() noise.

2017-08-04 Thread Tonghao Zhang
This patch reverts b2504a5dbe ("net: reduce
skb_warn_bad_offload() noise"). OVS calls
__skb_gso_segment() with tx_path == false. When segmenting UDP with
UFO, __skb_gso_segment() raises the warning shown below [1], because
ip_summed is CHECKSUM_NONE. net-next has removed UFO support, but
the 4.11 and 4.12 kernels don't address that problem.

In the kernel, only qdisc_pkt_len_init() (called from
__dev_queue_xmit()) acts on SKB_GSO_DODGY; other places just set it.
The warning described in b2504a5dbe is shown in [2]. From it we can
tell that:
1. the net_device doesn't have a qdisc.
2. skb->ip_summed was changed to CHECKSUM_NONE; it may have been
changed in skb_checksum_help() when called from validate_xmit_skb(),
or somewhere else.

And we should not revert 6e7bc478c9 ("net: skb_needs_check() accepts
CHECKSUM_NONE for tx"); that check is necessary.

[1]
[321428.168903] WARNING: CPU: 0 PID: 2279 at net/core/dev.c:2562
skb_warn_bad_offload+0xc4/0x110
[321428.168906] san0: caps=(0x04009fbb58e9, 0x) len=6769
data_len=6727 gso_size=1480 gso_type=2 ip_summed=0

[321428.168955] CPU: 0 PID: 2279 Comm: ruby-mri 4.11.12-200.fc25.x86_64 #1
[321428.168956] Hardware name: Supermicro SYS-1028U-TNRTP+/X10DRU-i+, BIOS 1.1 
07/22/2015
[321428.168957] Call Trace:
[321428.168962]  dump_stack+0x63/0x86
[321428.168965]  __warn+0xcb/0xf0
[321428.168966]  warn_slowpath_fmt+0x5a/0x80
[321428.168968]  skb_warn_bad_offload+0xc4/0x110
[321428.168970]  __skb_gso_segment+0x190/0x1a0
[321428.168977]  queue_gso_packets+0x62/0x160 [openvswitch]
[321428.168992]  ovs_dp_upcall+0x31/0x60 [openvswitch]
[321428.168994]  ovs_dp_process_packet+0x10d/0x130 [openvswitch]
[321428.168997]  ovs_vport_receive+0x76/0xd0 [openvswitch]
[321428.169013]  internal_dev_xmit+0x28/0x60 [openvswitch]
[321428.169014]  dev_hard_start_xmit+0xa3/0x1f0
[321428.169016]  __dev_queue_xmit+0x592/0x650
[321428.169026]  dev_queue_xmit+0x10/0x20

[2]
WARNING: CPU: 1 PID: 6768 at net/core/dev.c:2439 
skb_warn_bad_offload+0x2af/0x390 net/core/dev.c:2434
lo: caps=(0x00a2803b7c69, 0x) len=138 data_len=0 
gso_size=15883 gso_type=4 ip_summed=0
Kernel panic - not syncing: panic_on_warn set ...

CPU: 1 PID: 6768 Comm: syz-executor1 Not tainted 4.9.0 #5
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
8801c063ecd8 82346bdf 0001 1100380c7d2e
ed00380c7d26 41b58ab3 84b37e38 823468f1
84820740 84f289c0 dc00 8801c063ee20
Call Trace:
[] __dump_stack lib/dump_stack.c:15 [inline]
[] dump_stack+0x2ee/0x3ef lib/dump_stack.c:51
[] panic+0x1fb/0x412 kernel/panic.c:179
[] __warn+0x1c4/0x1e0 kernel/panic.c:542
[] warn_slowpath_fmt+0xc5/0x100 kernel/panic.c:565
[] skb_warn_bad_offload+0x2af/0x390 net/core/dev.c:2434
[] __skb_gso_segment+0x482/0x780 net/core/dev.c:2706
[] skb_gso_segment include/linux/netdevice.h:3985 [inline]
[] validate_xmit_skb+0x5c9/0xc20 net/core/dev.c:2969
[] __dev_queue_xmit+0xe6b/0x1e70 net/core/dev.c:3383
[] dev_queue_xmit+0x17/0x20 net/core/dev.c:3424

Cc: Eric Dumazet <eduma...@google.com>
Cc: Willem de Bruijn <will...@google.com>
Cc: Pravin B Shelar <pshe...@ovn.org>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/core/dev.c | 11 +++
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/net/core/dev.c b/net/core/dev.c
index 1d75499..97e6989 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2753,11 +2753,11 @@ static inline bool skb_needs_check(struct sk_buff *skb, 
bool tx_path)
 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
  netdev_features_t features, bool tx_path)
 {
-   struct sk_buff *segs;
-
if (unlikely(skb_needs_check(skb, tx_path))) {
int err;
 
+   skb_warn_bad_offload(skb);
+
/* We're going to init ->check field in TCP or UDP header */
err = skb_cow_head(skb, 0);
if (err < 0)
@@ -2786,12 +2786,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
skb_reset_mac_header(skb);
skb_reset_mac_len(skb);
 
-   segs = skb_mac_gso_segment(skb, features);
-
-   if (unlikely(skb_needs_check(skb, tx_path)))
-   skb_warn_bad_offload(skb);
-
-   return segs;
+   return skb_mac_gso_segment(skb, features);
 }
 EXPORT_SYMBOL(__skb_gso_segment);
 
-- 
1.8.3.1



[PATCH] ipv4: Introduce ipip_offload_init helper function.

2017-08-02 Thread Tonghao Zhang
Add a helper to make initializing the ipip offload convenient.
Check the return value and print a KERN_CRIT message on failure.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/ipv4/af_inet.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 5ce44fb..c888120 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1763,6 +1763,11 @@ static int __init init_inet_pernet_ops(void)
},
 };
 
+static int __init ipip_offload_init(void)
+{
+   return inet_add_offload(&ipip_offload, IPPROTO_IPIP);
+}
+
 static int __init ipv4_offload_init(void)
 {
/*
@@ -1772,9 +1777,10 @@ static int __init ipv4_offload_init(void)
pr_crit("%s: Cannot add UDP protocol offload\n", __func__);
if (tcpv4_offload_init() < 0)
pr_crit("%s: Cannot add TCP protocol offload\n", __func__);
+   if (ipip_offload_init() < 0)
+   pr_crit("%s: Cannot add IPIP protocol offload\n", __func__);
 
dev_add_offload(&ip_packet_offload);
-   inet_add_offload(&ipip_offload, IPPROTO_IPIP);
return 0;
 }
 
-- 
1.8.3.1



[PATCH 2/3] drivers/net: Compare pointer-typed values to NULL rather than 0.

2017-07-25 Thread Tonghao Zhang
This makes an effort to choose between !x and x == NULL.
1. drivers/net/ethernet/nxp/lpc_eth.c
2. drivers/net/ethernet/amd/declance.c

Generated by: scripts/coccinelle/null/badzero.cocci

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/ethernet/amd/declance.c | 2 +-
 drivers/net/ethernet/nxp/lpc_eth.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/amd/declance.c 
b/drivers/net/ethernet/amd/declance.c
index 82cc813..76e623b 100644
--- a/drivers/net/ethernet/amd/declance.c
+++ b/drivers/net/ethernet/amd/declance.c
@@ -606,7 +606,7 @@ static int lance_rx(struct net_device *dev)
len = (*rds_ptr(rd, mblength, lp->type) & 0xfff) - 4;
skb = netdev_alloc_skb(dev, len + 2);
 
-   if (skb == 0) {
+   if (!skb) {
dev->stats.rx_dropped++;
*rds_ptr(rd, mblength, lp->type) = 0;
*rds_ptr(rd, rmd1, lp->type) =
diff --git a/drivers/net/ethernet/nxp/lpc_eth.c 
b/drivers/net/ethernet/nxp/lpc_eth.c
index 08381ef..0d177d4 100644
--- a/drivers/net/ethernet/nxp/lpc_eth.c
+++ b/drivers/net/ethernet/nxp/lpc_eth.c
@@ -1338,7 +1338,7 @@ static int lpc_eth_drv_probe(struct platform_device *pdev)
/* Get size of DMA buffers/descriptors region */
pldat->dma_buff_size = (ENET_TX_DESC + ENET_RX_DESC) * (ENET_MAXF_SIZE +
sizeof(struct txrx_desc_t) + sizeof(struct rx_status_t));
-   pldat->dma_buff_base_v = 0;
+   pldat->dma_buff_base_v = NULL;
 
if (use_iram_for_net(&pldat->pdev->dev)) {
dma_handle = LPC32XX_IRAM_BASE;
@@ -1350,7 +1350,7 @@ static int lpc_eth_drv_probe(struct platform_device *pdev)
"IRAM not big enough for net buffers, using 
SDRAM instead.\n");
}
 
-   if (pldat->dma_buff_base_v == 0) {
+   if (pldat->dma_buff_base_v == NULL) {
ret = dma_coerce_mask_and_coherent(&pdev->dev, 
DMA_BIT_MASK(32));
if (ret)
goto err_out_free_irq;
-- 
1.8.3.1



[PATCH 1/3] drivers/net: Fix ptr_ret.cocci warnings.

2017-07-25 Thread Tonghao Zhang
We can use PTR_ERR_OR_ZERO rather than if (IS_ERR(...)) + PTR_ERR in:
1. drivers/net/appletalk/ipddp.c
2. drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c

Generated by: scripts/coccinelle/api/ptr_ret.cocci

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/appletalk/ipddp.c| 4 +---
 drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c | 5 +
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/net/appletalk/ipddp.c b/drivers/net/appletalk/ipddp.c
index a306de4..9375cef 100644
--- a/drivers/net/appletalk/ipddp.c
+++ b/drivers/net/appletalk/ipddp.c
@@ -311,9 +311,7 @@ static int ipddp_ioctl(struct net_device *dev, struct ifreq 
*ifr, int cmd)
 static int __init ipddp_init_module(void)
 {
dev_ipddp = ipddp_init();
-if (IS_ERR(dev_ipddp))
-return PTR_ERR(dev_ipddp);
-   return 0;
+   return PTR_ERR_OR_ZERO(dev_ipddp);
 }
 
 static void __exit ipddp_cleanup_module(void)
diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c 
b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
index 1447a83..2d3e5e2 100644
--- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
+++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/debug.c
@@ -78,10 +78,7 @@ int brcmf_debug_attach(struct brcmf_pub *drvr)
return -ENODEV;
 
drvr->dbgfs_dir = debugfs_create_dir(dev_name(dev), root_folder);
-   if (IS_ERR(drvr->dbgfs_dir))
-   return PTR_ERR(drvr->dbgfs_dir);
-
-   return 0;
+   return PTR_ERR_OR_ZERO(drvr->dbgfs_dir);
 }
 
 void brcmf_debug_detach(struct brcmf_pub *drvr)
-- 
1.8.3.1
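
The idiom can be exercised in plain userspace with a pared-down
emulation of the err.h helpers; the kernel's real definitions live
in include/linux/err.h and differ in annotations:

#include <stdio.h>

#define MAX_ERRNO	4095
#define IS_ERR_VALUE(x)	((unsigned long)(x) >= (unsigned long)-MAX_ERRNO)

static inline long IS_ERR(const void *ptr)
{
	return IS_ERR_VALUE((unsigned long)ptr);
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

/* Folds the common "if (IS_ERR(p)) return PTR_ERR(p); return 0;"
 * tail into a single call, as in the patch above.
 */
static inline int PTR_ERR_OR_ZERO(const void *ptr)
{
	return IS_ERR(ptr) ? (int)PTR_ERR(ptr) : 0;
}

int main(void)
{
	int x;
	void *ok = &x;			/* an ordinary pointer */
	void *bad = (void *)-12L;	/* encodes -ENOMEM */

	printf("ok  -> %d\n", PTR_ERR_OR_ZERO(ok));	/* 0 */
	printf("bad -> %d\n", PTR_ERR_OR_ZERO(bad));	/* -12 */
	return 0;
}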



[PATCH 3/3] drivers/net: Unsigned expressions cannot be lesser than zero.

2017-07-25 Thread Tonghao Zhang
The presence of a comparison 'unsigned (<|<=|>|>=) 0' often indicates
a bug, usually the wrong variable type.

qe_muram_alloc() returns an offset into the muram area on success,
and (unsigned long)-ENOMEM, or -ENOSYS if CONFIG_CPM and
CONFIG_QUICC_ENGINE are not configured, on failure. Since the return
value is unsigned, a '< 0' check can never trigger.

Generated by: scripts/coccinelle/tests/unsigned_lesser_than_zero.cocci
Fixes: c19b6d24 ('drivers/net: support hdlc function for QE-UCC')

Cc: Zhao Qiang <qiang.z...@nxp.com>
Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/wan/fsl_ucc_hdlc.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/wan/fsl_ucc_hdlc.c b/drivers/net/wan/fsl_ucc_hdlc.c
index 33df764..155f044 100644
--- a/drivers/net/wan/fsl_ucc_hdlc.c
+++ b/drivers/net/wan/fsl_ucc_hdlc.c
@@ -192,7 +192,8 @@ static int uhdlc_init(struct ucc_hdlc_private *priv)
priv->ucc_pram_offset = qe_muram_alloc(sizeof(struct ucc_hdlc_param),
ALIGNMENT_OF_UCC_HDLC_PRAM);
 
-   if (priv->ucc_pram_offset < 0) {
+   if (priv->ucc_pram_offset == (unsigned long)-ENOMEM ||
+   priv->ucc_pram_offset == (unsigned long)-ENOSYS) {
dev_err(priv->dev, "Can not allocate MURAM for hdlc 
parameter.\n");
ret = -ENOMEM;
goto free_tx_bd;
@@ -228,14 +229,16 @@ static int uhdlc_init(struct ucc_hdlc_private *priv)
 
/* Alloc riptr, tiptr */
riptr = qe_muram_alloc(32, 32);
-   if (riptr < 0) {
+   if (riptr == (unsigned long)-ENOMEM ||
+   riptr == (unsigned long)-ENOSYS) {
dev_err(priv->dev, "Cannot allocate MURAM mem for Receive 
internal temp data pointer\n");
ret = -ENOMEM;
goto free_tx_skbuff;
}
 
tiptr = qe_muram_alloc(32, 32);
-   if (tiptr < 0) {
+   if (tiptr == (unsigned long)-ENOMEM ||
+   tiptr == (unsigned long)-ENOSYS) {
dev_err(priv->dev, "Cannot allocate MURAM mem for Transmit 
internal temp data pointer\n");
ret = -ENOMEM;
goto free_riptr;
-- 
1.8.3.1
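
The underlying issue: qe_muram_alloc() returns an unsigned long, so a
check like "if (riptr < 0)" is always false and its error path is dead
code; the failure encodings have to be compared explicitly. A minimal
illustration of both forms (the values here are only for demonstration,
not the driver's code):

	unsigned long off = (unsigned long)-ENOMEM;	/* error encoded in an unsigned type */

	if (off < 0)					/* always false: off is unsigned */
		pr_err("never reached\n");

	if (off == (unsigned long)-ENOMEM ||		/* correct: compare the encodings */
	    off == (unsigned long)-ENOSYS)
		pr_err("allocation failed\n");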



Re: [PATCH] tun/tap: Add the missed return value check of register_netdevice_notifier

2017-07-24 Thread Tonghao Zhang
One question: will this type of bugfix be applied to net-next?
> On Jul 25, 2017, at 4:45 AM, David Miller <da...@davemloft.net> wrote:
> 
>> 
>> Some code in the tun/tap module did not check the return
>> value of register_netdevice_notifier(). Add the check now.
>> 
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
> 
> Applied.



[PATCH] tun/tap: Add the missed return value check of register_netdevice_notifier

2017-07-20 Thread Tonghao Zhang
Some code in the tun/tap module did not check the return
value of register_netdevice_notifier(). Add the check now.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 drivers/net/tun.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 3d4c245..32ad873 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -2598,8 +2598,16 @@ static int __init tun_init(void)
goto err_misc;
}
 
-	register_netdevice_notifier(&tun_notifier_block);
+	ret = register_netdevice_notifier(&tun_notifier_block);
+	if (ret) {
+		pr_err("Can't register netdevice notifier\n");
+		goto err_notifier;
+	}
+
 	return  0;
+
+err_notifier:
+	misc_deregister(&tun_miscdev);
 err_misc:
 	rtnl_link_unregister(&tun_link_ops);
 err_linkops:
-- 
1.8.3.1
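
The error handling added here follows the usual kernel unwind idiom:
each setup step gets a label, and failure at step N jumps to the label
that tears down steps N-1..1 in reverse order. A minimal sketch of the
shape (names are illustrative, not tun.c's):

	static int __init demo_init(void)
	{
		int ret;

		ret = register_a();		/* e.g. rtnl_link_register() */
		if (ret)
			goto err_a;
		ret = register_b();		/* e.g. misc_register() */
		if (ret)
			goto err_b;
		ret = register_c();		/* e.g. register_netdevice_notifier() */
		if (ret)
			goto err_c;
		return 0;

	err_c:
		unregister_b();			/* undo in reverse order */
	err_b:
		unregister_a();
	err_a:
		return ret;
	}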



[PATCH v2 1/2] openvswitch: Optimize updating for OvS flow_stats.

2017-07-18 Thread Tonghao Zhang
In ovs_flow_stats_update(), the node variable is only used to
allocate the flow_stats struct. Since allocation is not the common
case, it is unnecessary to call numa_node_id() every time. This
patch is not a bugfix, but it may bring a small performance gain.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/openvswitch/flow.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 3f76cb7..89aeb32 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -72,7 +72,6 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int node = numa_node_id();
int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
@@ -108,7 +107,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
  __GFP_THISNODE |
  __GFP_NOWARN |
  __GFP_NOMEMALLOC,
- node);
+ numa_node_id());
if (likely(new_stats)) {
new_stats->used = jiffies;
new_stats->packet_count = 1;
-- 
1.8.3.1
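
In miniature, the change moves the numa_node_id() call from the hot
path into the rare allocation path. A sketch of the before/after shape
(flow_stats_cache is the real cache in flow_table.c; "gfp" here stands
in for the GFP flags the driver actually passes):

	/* before: computed on every stats update, used only when allocating */
	int node = numa_node_id();
	...
	new_stats = kmem_cache_alloc_node(flow_stats_cache, gfp, node);

	/* after: pay for numa_node_id() only when an allocation happens */
	new_stats = kmem_cache_alloc_node(flow_stats_cache, gfp, numa_node_id());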



[PATCH v2 2/2] openvswitch: Optimize operations for OvS flow_stats.

2017-07-18 Thread Tonghao Zhang
When calling flow_free() to free a flow, we call cpumask_next()
many times (over cpu_possible_mask, e.g. 128 CPUs by default). That
takes up noticeable CPU time when flow_free() is called frequently,
e.g. when we put all packets to userspace via upcall and OvS sends
them back via netlink to ovs_packet_cmd_execute() (which calls
flow_free()).

The test topology is shown below. VM01 sends TCP packets to VM02,
and OvS forwards the packets. During the test, we use perf to report
the system performance.

VM01 --- OvS-VM --- VM02

Without this patch, perf-top shows flow_free() at 3.02% CPU
usage:

4.23%  [kernel][k] _raw_spin_unlock_irqrestore
3.62%  [kernel][k] __do_softirq
3.16%  [kernel][k] __memcpy
3.02%  [kernel][k] flow_free
2.42%  libc-2.17.so[.] __memcpy_ssse3_back
2.18%  [kernel][k] copy_user_generic_unrolled
2.17%  [kernel][k] find_next_bit

With this patch applied, flow_free() no longer shows up in the
perf-top list:

4.11%  [kernel][k] _raw_spin_unlock_irqrestore
3.79%  [kernel][k] __do_softirq
3.46%  [kernel][k] __memcpy
2.73%  libc-2.17.so[.] __memcpy_ssse3_back
2.25%  [kernel][k] copy_user_generic_unrolled
1.89%  libc-2.17.so[.] _int_malloc
1.53%  ovs-vswitchd[.] xlate_actions

With this patch, the TCP throughput between the VMs (without
Megaflow Cache + Microflow Cache) rises from 1.18 Gb/sec to
1.30 Gb/sec (roughly a 10% improvement).

This patch adds a cpumask, cpu_used_mask, which stores the CPUs
that the flow has used. We then only check the flow_stats on the
CPUs actually used; it is unnecessary to check all possible CPUs
when getting, clearing, and updating the flow_stats. Adding
cpu_used_mask to struct sw_flow does not increase the cacheline
count.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
Acked-by: Pravin B Shelar <pshe...@ovn.org>
---
 net/openvswitch/flow.c       | 7 ++++---
 net/openvswitch/flow.h       | 2 ++
 net/openvswitch/flow_table.c | 4 +++-
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 89aeb32..cfb652a 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -72,7 +72,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int cpu = smp_processor_id();
+   unsigned int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
stats = rcu_dereference(flow->stats[cpu]);
@@ -117,6 +117,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 
rcu_assign_pointer(flow->stats[cpu],
   new_stats);
+				cpumask_set_cpu(cpu, &flow->cpu_used_mask);
goto unlock;
}
}
@@ -144,7 +145,7 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
memset(ovs_stats, 0, sizeof(*ovs_stats));
 
/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
 		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
 
if (stats) {
@@ -168,7 +169,7 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
int cpu;
 
/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
 
if (stats) {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a9bc1c8..1875bba 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/jiffies.h>
 #include <linux/time.h>
+#include <linux/cpumask.h>
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
@@ -219,6 +220,7 @@ struct sw_flow {
 */
struct sw_flow_key key;
struct sw_flow_id id;
+   struct cpumask cpu_used_mask;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
struct flow_stats __rcu *stats[]; /* One for each CPU.  First one
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index ea7a807..80ea2a7 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -98,6 +98,8 @@ struct sw_flow *ovs_flow_alloc(void)
 
 	RCU_INIT_POINTER(flow->stats[0], stats);
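
The mechanics of the optimization, in miniature: mark each CPU in the
mask the first time it touches the flow, then walk only the marked CPUs
(CPU 0 is always visited because the loop starts there). Names below
are illustrative:

	struct cpumask used;
	int cpu;

	cpumask_clear(&used);				/* at flow allocation */
	cpumask_set_cpu(smp_processor_id(), &used);	/* on first use per CPU */

	/* open-coded so that CPU 0 is always considered, as in the patch */
	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &used)) {
		/* touch per-CPU stats for 'cpu' only */
	}

cpumask_next() returns the next set bit after 'cpu', or a value >=
nr_cpu_ids when none is left, so the loop cost now scales with the CPUs
the flow actually ran on rather than with cpu_possible_mask.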

Re: [ovs-dev] [PATCH 2/2] openvswitch: Optimize operations for OvS flow_stats.

2017-07-17 Thread Tonghao Zhang

> On Jul 18, 2017, at 11:14 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> 
>> 
>> This patch adds a cpumask, cpu_used_mask, which stores the CPUs
>> that the flow has used. We then only check the flow_stats on the
>> CPUs actually used; it is unnecessary to check all possible CPUs
>> when getting, clearing, and updating the flow_stats. Adding
>> cpu_used_mask to struct sw_flow does not increase the cacheline count.
>> 
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
> 
> Patch looks good to me. I have one comment.

Thanks. I will send v2 and add "Acked-by".

[PATCH 2/2] openvswitch: Optimize operations for OvS flow_stats.

2017-07-17 Thread Tonghao Zhang
When calling flow_free() to free a flow, we call cpumask_next()
many times (over cpu_possible_mask, e.g. 128 CPUs by default). That
takes up noticeable CPU time when flow_free() is called frequently,
e.g. when we put all packets to userspace via upcall and OvS sends
them back via netlink to ovs_packet_cmd_execute() (which calls
flow_free()).

The test topology is shown below. VM01 sends TCP packets to VM02,
and OvS forwards the packets. During the test, we use perf to report
the system performance.

VM01 --- OvS-VM --- VM02

Without this patch, perf-top shows flow_free() at 3.02% CPU
usage:

4.23%  [kernel][k] _raw_spin_unlock_irqrestore
3.62%  [kernel][k] __do_softirq
3.16%  [kernel][k] __memcpy
3.02%  [kernel][k] flow_free
2.42%  libc-2.17.so[.] __memcpy_ssse3_back
2.18%  [kernel][k] copy_user_generic_unrolled
2.17%  [kernel][k] find_next_bit

With this patch applied, flow_free() no longer shows up in the
perf-top list:

4.11%  [kernel][k] _raw_spin_unlock_irqrestore
3.79%  [kernel][k] __do_softirq
3.46%  [kernel][k] __memcpy
2.73%  libc-2.17.so[.] __memcpy_ssse3_back
2.25%  [kernel][k] copy_user_generic_unrolled
1.89%  libc-2.17.so[.] _int_malloc
1.53%  ovs-vswitchd[.] xlate_actions

With this patch, the TCP throughput between the VMs (without
Megaflow Cache + Microflow Cache) rises from 1.18 Gb/sec to
1.30 Gb/sec (roughly a 10% improvement).

This patch adds a cpumask, cpu_used_mask, which stores the CPUs
that the flow has used. We then only check the flow_stats on the
CPUs actually used; it is unnecessary to check all possible CPUs
when getting, clearing, and updating the flow_stats. Adding
cpu_used_mask to struct sw_flow does not increase the cacheline
count.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/openvswitch/flow.c       | 7 ++++---
 net/openvswitch/flow.h       | 2 ++
 net/openvswitch/flow_table.c | 5 ++++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 89aeb32..cfb652a 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -72,7 +72,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int cpu = smp_processor_id();
+   unsigned int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
stats = rcu_dereference(flow->stats[cpu]);
@@ -117,6 +117,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 
rcu_assign_pointer(flow->stats[cpu],
   new_stats);
+				cpumask_set_cpu(cpu, &flow->cpu_used_mask);
goto unlock;
}
}
@@ -144,7 +145,7 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
memset(ovs_stats, 0, sizeof(*ovs_stats));
 
/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
 		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
 
if (stats) {
@@ -168,7 +169,7 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
int cpu;
 
/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
 
if (stats) {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a9bc1c8..1875bba 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/jiffies.h>
 #include <linux/time.h>
+#include <linux/cpumask.h>
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
@@ -219,6 +220,7 @@ struct sw_flow {
 */
struct sw_flow_key key;
struct sw_flow_id id;
+   struct cpumask cpu_used_mask;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
struct flow_stats __rcu *stats[]; /* One for each CPU.  First one
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index ea7a807..324a246 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -98,6 +98,9 @@ struct sw_flow *ovs_flow_alloc(void)
 
RCU_INIT_POINTER(flow->stats[0], stats);
 
+	cpumask_clear(&flow->cpu_used_mask);

[PATCH 1/2] openvswitch: Optimize updating for OvS flow_stats.

2017-07-17 Thread Tonghao Zhang
In ovs_flow_stats_update(), the node variable is only used to
allocate the flow_stats struct. Since allocation is not the common
case, it is unnecessary to call numa_node_id() every time. This
patch is not a bugfix, but it may bring a small performance gain.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/openvswitch/flow.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 3f76cb7..89aeb32 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -72,7 +72,6 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int node = numa_node_id();
int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
@@ -108,7 +107,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
  __GFP_THISNODE |
  __GFP_NOWARN |
  __GFP_NOMEMALLOC,
- node);
+ numa_node_id());
if (likely(new_stats)) {
new_stats->used = jiffies;
new_stats->packet_count = 1;
-- 
1.8.3.1



[PATCH 1/2] openvswitch: Optimize updating for OvS flow_stats.

2017-07-10 Thread Tonghao Zhang
In ovs_flow_stats_update(), the node variable is only used to
allocate the flow_stats struct. Since allocation is not the common
case, it is unnecessary to call numa_node_id() every time. This
patch is not a bugfix, but it may bring a small performance gain.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/openvswitch/flow.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 3f76cb7..89aeb32 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -72,7 +72,6 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int node = numa_node_id();
int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
@@ -108,7 +107,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
  __GFP_THISNODE |
  __GFP_NOWARN |
  __GFP_NOMEMALLOC,
- node);
+ numa_node_id());
if (likely(new_stats)) {
new_stats->used = jiffies;
new_stats->packet_count = 1;
-- 
1.8.3.1



[PATCH 2/2] openvswitch: Optimize operations for OvS flow_stats.

2017-07-10 Thread Tonghao Zhang
When calling flow_free() to free a flow, we call cpumask_next()
many times (over cpu_possible_mask, e.g. 128 CPUs by default). That
takes up noticeable CPU time when flow_free() is called frequently,
e.g. when we put all packets to userspace via upcall and OvS sends
them back via netlink to ovs_packet_cmd_execute() (which calls
flow_free()).

The test topology is shown below. VM01 sends TCP packets to VM02,
and OvS forwards the packets. During the test, we use perf to report
the system performance.

VM01 --- OvS-VM --- VM02

Without this patch, perf-top shows flow_free() at 3.02% CPU
usage:

4.23%  [kernel][k] _raw_spin_unlock_irqrestore
3.62%  [kernel][k] __do_softirq
3.16%  [kernel][k] __memcpy
3.02%  [kernel][k] flow_free
2.42%  libc-2.17.so[.] __memcpy_ssse3_back
2.18%  [kernel][k] copy_user_generic_unrolled
2.17%  [kernel][k] find_next_bit

With this patch applied, flow_free() no longer shows up in the
perf-top list:

4.11%  [kernel][k] _raw_spin_unlock_irqrestore
3.79%  [kernel][k] __do_softirq
3.46%  [kernel][k] __memcpy
2.73%  libc-2.17.so[.] __memcpy_ssse3_back
2.25%  [kernel][k] copy_user_generic_unrolled
1.89%  libc-2.17.so[.] _int_malloc
1.53%  ovs-vswitchd[.] xlate_actions

With this patch, the TCP throughput between the VMs (without
Megaflow Cache + Microflow Cache) rises from 1.18 Gb/sec to
1.30 Gb/sec (roughly a 10% improvement).

This patch adds a cpumask, cpu_used_mask, which stores the CPUs
that the flow has used. We then only check the flow_stats on the
CPUs actually used; it is unnecessary to check all possible CPUs
when getting, clearing, and updating the flow_stats. Adding
cpu_used_mask to struct sw_flow does not increase the cacheline
count.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/openvswitch/flow.c       | 7 ++++---
 net/openvswitch/flow.h       | 2 ++
 net/openvswitch/flow_table.c | 5 ++++-
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index 89aeb32..cfb652a 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -72,7 +72,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
			   const struct sk_buff *skb)
 {
struct flow_stats *stats;
-   int cpu = smp_processor_id();
+   unsigned int cpu = smp_processor_id();
int len = skb->len + (skb_vlan_tag_present(skb) ? VLAN_HLEN : 0);
 
stats = rcu_dereference(flow->stats[cpu]);
@@ -117,6 +117,7 @@ void ovs_flow_stats_update(struct sw_flow *flow, __be16 tcp_flags,
 
rcu_assign_pointer(flow->stats[cpu],
   new_stats);
+				cpumask_set_cpu(cpu, &flow->cpu_used_mask);
goto unlock;
}
}
@@ -144,7 +145,7 @@ void ovs_flow_stats_get(const struct sw_flow *flow,
memset(ovs_stats, 0, sizeof(*ovs_stats));
 
/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
 		struct flow_stats *stats = rcu_dereference_ovsl(flow->stats[cpu]);
 
if (stats) {
@@ -168,7 +169,7 @@ void ovs_flow_stats_clear(struct sw_flow *flow)
int cpu;
 
/* We open code this to make sure cpu 0 is always considered */
-	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, cpu_possible_mask)) {
+	for (cpu = 0; cpu < nr_cpu_ids; cpu = cpumask_next(cpu, &flow->cpu_used_mask)) {
struct flow_stats *stats = ovsl_dereference(flow->stats[cpu]);
 
if (stats) {
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index a9bc1c8..1875bba 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -31,6 +31,7 @@
 #include <linux/in6.h>
 #include <linux/jiffies.h>
 #include <linux/time.h>
+#include <linux/cpumask.h>
 #include <net/inet_ecn.h>
 #include <net/ip_tunnels.h>
 #include <net/dst_metadata.h>
@@ -219,6 +220,7 @@ struct sw_flow {
 */
struct sw_flow_key key;
struct sw_flow_id id;
+   struct cpumask cpu_used_mask;
struct sw_flow_mask *mask;
struct sw_flow_actions __rcu *sf_acts;
struct flow_stats __rcu *stats[]; /* One for each CPU.  First one
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index ea7a807..324a246 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -98,6 +98,9 @@ struct sw_flow *ovs_flow_alloc(void)
 
RCU_INIT_POINTER(flow->stats[0], stats);
 
+	cpumask_clear(&flow->cpu_used_mask);

Re: [PATCH v3] datapath: Avoid using stack larger than 1024.

2017-06-29 Thread Tonghao Zhang
Thanks for your work. I have sent v3 against net-next and added "Acked-by".
If it is applied, I will backport it to d...@openvswitch.org.

On Fri, Jun 30, 2017 at 1:45 AM, Pravin Shelar <pshe...@ovn.org> wrote:
> On Wed, Jun 28, 2017 at 6:38 PM, Tonghao Zhang <xiangxia.m@gmail.com> 
> wrote:
>> When compiling OvS-master on 4.4.0-81 kernel,
>> there is a warning:
>>
>> CC [M]  /root/ovs/datapath/linux/datapath.o
>> /root/ovs/datapath/linux/datapath.c: In function
>> 'ovs_flow_cmd_set':
>> /root/ovs/datapath/linux/datapath.c:1221:1: warning:
>> the frame size of 1040 bytes is larger than 1024 bytes
>> [-Wframe-larger-than=]
>>
>> This patch factors out match-init and action-copy to avoid the
>> "Wframe-larger-than=1024" warning. Because mask is only
>> used to get actions, we add a new function to save some
>> stack space.
>>
>> Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
>> ---
> Looks good, you have to submit patch against net-next for upstream OVS
> module first.


[PATCH v3] datapath: Avoid using stack larger than 1024.

2017-06-28 Thread Tonghao Zhang
When compiling OvS-master on 4.4.0-81 kernel,
there is a warning:

CC [M]  /root/ovs/datapath/linux/datapath.o
/root/ovs/datapath/linux/datapath.c: In function
'ovs_flow_cmd_set':
/root/ovs/datapath/linux/datapath.c:1221:1: warning:
the frame size of 1040 bytes is larger than 1024 bytes
[-Wframe-larger-than=]

This patch factors out match-init and action-copy to avoid the
"Wframe-larger-than=1024" warning. Because mask is only
used to get actions, we add a new function to save some
stack space.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 datapath/datapath.c | 81 ++---
 1 file changed, 58 insertions(+), 23 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index c85029c..fb9f114 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -1100,6 +1100,58 @@ static struct sw_flow_actions *get_flow_actions(struct net *net,
return acts;
 }
 
+/* Factor out match-init and action-copy to avoid
+ * "Wframe-larger-than=1024" warning. Because mask is only
+ * used to get actions, we add a new function to save some
+ * stack space.
+ *
+ * If there are no key and action attrs, we return 0
+ * directly. In that case, the caller will not use the
+ * match, as before. If there is an action attr, we try to get
+ * the actions and save them to *acts. Before returning from
+ * the function, we reset the match->mask pointer, because
+ * we must not return a match object with a dangling reference
+ * to the mask.
+ */
+static int ovs_nla_init_match_and_action(struct net *net,
+struct sw_flow_match *match,
+struct sw_flow_key *key,
+struct nlattr **a,
+struct sw_flow_actions **acts,
+bool log)
+{
+   struct sw_flow_mask mask;
+   int error = 0;
+
+   if (a[OVS_FLOW_ATTR_KEY]) {
+		ovs_match_init(match, key, true, &mask);
+   error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY],
+ a[OVS_FLOW_ATTR_MASK], log);
+   if (error)
+   goto error;
+   }
+
+   if (a[OVS_FLOW_ATTR_ACTIONS]) {
+   if (!a[OVS_FLOW_ATTR_KEY]) {
+			OVS_NLERR(log,
+				  "Flow key attribute not present in set flow.");
+   return -EINVAL;
+   }
+
+		*acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key,
+					 &mask, log);
+   if (IS_ERR(*acts)) {
+   error = PTR_ERR(*acts);
+   goto error;
+   }
+   }
+
+   /* On success, error is 0. */
+error:
+   match->mask = NULL;
+   return error;
+}
+
 static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 {
struct net *net = sock_net(skb->sk);
@@ -1107,7 +1159,6 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
-   struct sw_flow_mask mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
@@ -1119,34 +1170,18 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
bool ufid_present;
 
 	ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
-	if (a[OVS_FLOW_ATTR_KEY]) {
-		ovs_match_init(&match, &key, true, &mask);
-		error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
-					  a[OVS_FLOW_ATTR_MASK], log);
-	} else if (!ufid_present) {
+	if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) {
 		OVS_NLERR(log,
 			  "Flow set message rejected, Key attribute missing.");
-		error = -EINVAL;
+		return -EINVAL;
 	}
+
+	error = ovs_nla_init_match_and_action(net, &match, &key, a,
+					      &acts, log);
if (error)
goto error;
 
-	/* Validate actions. */
-	if (a[OVS_FLOW_ATTR_ACTIONS]) {
-		if (!a[OVS_FLOW_ATTR_KEY]) {
-			OVS_NLERR(log,
-				  "Flow key attribute not present in set flow.");
-			error = -EINVAL;
-			goto error;
-		}
-
-		acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
-					&mask, log);
-		if (IS_ERR(acts)) {
-			error = PTR_ERR(acts);
-			goto error;
-		}
-
+	if (acts) {
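
The technique is general: a large local occupies its function's frame
for the whole call, but moving it into a separate non-inlined helper
confines it to the helper's frame, which is live only for the duration
of that call. Schematically (hypothetical names, not the datapath
code):

	struct big { char buf[900]; };

	/* noinline so the compiler cannot merge the frames again */
	static noinline int use_big(void)
	{
		struct big b = {};	/* 900 bytes, on use_big()'s frame only */

		return b.buf[0];
	}

	int caller(void)
	{
		return use_big();	/* caller's own frame stays small */
	}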
 

[PATCH v2] datapath: Avoid using stack larger than 1024.

2017-06-27 Thread Tonghao Zhang
When compiling OvS-master on 4.4.0-81 kernel,
there is a warning:

CC [M]  /root/ovs/datapath/linux/datapath.o
/root/ovs/datapath/linux/datapath.c: In function
‘ovs_flow_cmd_set’:
/root/ovs/datapath/linux/datapath.c:1221:1: warning:
the frame size of 1040 bytes is larger than 1024 bytes
[-Wframe-larger-than=]

This patch factors out match-init and action-copy to avoid the
"Wframe-larger-than=1024" warning. Because mask is only
used to get actions, we add a new function to save some
stack space.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 datapath/datapath.c | 73 -
 1 file changed, 50 insertions(+), 23 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index c85029c..fdbe314 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -1100,6 +1100,50 @@ static struct sw_flow_actions *get_flow_actions(struct net *net,
return acts;
 }
 
+/* Factor out match-init and action-copy to avoid
+ * "Wframe-larger-than=1024" warning. Because mask is only
+ * used to get actions, we add a new function to save some
+ * stack space.
+ *
+ * If there are no key and action attrs, we return 0
+ * directly. In that case, the caller will not use the
+ * match, as before. If there is an action attr, we try to get
+ * the actions and save them to *acts.
+ */
+static int ovs_nla_init_match_and_action(struct net *net,
+struct sw_flow_match *match,
+struct sw_flow_key *key,
+struct nlattr **a,
+struct sw_flow_actions **acts,
+bool log)
+{
+   struct sw_flow_mask mask;
+   int error = 0;
+
+   if (a[OVS_FLOW_ATTR_KEY]) {
+		ovs_match_init(match, key, true, &mask);
+   error = ovs_nla_get_match(net, match, a[OVS_FLOW_ATTR_KEY],
+ a[OVS_FLOW_ATTR_MASK], log);
+   if (error)
+   return error;
+   }
+
+   if (a[OVS_FLOW_ATTR_ACTIONS]) {
+   if (!a[OVS_FLOW_ATTR_KEY]) {
+			OVS_NLERR(log,
+				  "Flow key attribute not present in set flow.");
+   return -EINVAL;
+   }
+
+		*acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], key,
+					 &mask, log);
+   if (IS_ERR(*acts))
+   return PTR_ERR(*acts);
+   }
+
+   return 0;
+}
+
 static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 {
struct net *net = sock_net(skb->sk);
@@ -1107,7 +1151,6 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
-   struct sw_flow_mask mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
@@ -1119,34 +1162,18 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
bool ufid_present;
 
 	ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
-	if (a[OVS_FLOW_ATTR_KEY]) {
-		ovs_match_init(&match, &key, true, &mask);
-		error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
-					  a[OVS_FLOW_ATTR_MASK], log);
-	} else if (!ufid_present) {
+	if (!a[OVS_FLOW_ATTR_KEY] && !ufid_present) {
 		OVS_NLERR(log,
 			  "Flow set message rejected, Key attribute missing.");
-		error = -EINVAL;
+		return -EINVAL;
 	}
+
+	error = ovs_nla_init_match_and_action(net, &match, &key, a,
+					      &acts, log);
if (error)
goto error;
 
-	/* Validate actions. */
-	if (a[OVS_FLOW_ATTR_ACTIONS]) {
-		if (!a[OVS_FLOW_ATTR_KEY]) {
-			OVS_NLERR(log,
-				  "Flow key attribute not present in set flow.");
-			error = -EINVAL;
-			goto error;
-		}
-
-		acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
-					&mask, log);
-		if (IS_ERR(acts)) {
-			error = PTR_ERR(acts);
-			goto error;
-		}
-
+	if (acts) {
/* Can allocate before locking if have acts. */
 		reply = ovs_flow_cmd_alloc_info(acts, &sfid, info, false,
 						ufid_flags);
-- 
1.8.3.1



[PATCH] datapath: Avoid using stack larger than 1024.

2017-06-27 Thread Tonghao Zhang
When compiling OvS-master on 4.4.0-81 kernel,
there is a warning:

CC [M]  /root/ovs/datapath/linux/datapath.o
/root/ovs/datapath/linux/datapath.c: In function
‘ovs_flow_cmd_set’:
/root/ovs/datapath/linux/datapath.c:1221:1: warning:
the frame size of 1040 bytes is larger than 1024 bytes
[-Wframe-larger-than=]

This patch uses kmalloc() to allocate memory for the sw_flow_mask
and avoids using the stack.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 net/openvswitch/datapath.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index c85029c..da8cd68 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -1107,7 +1107,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
-   struct sw_flow_mask mask;
+   struct sw_flow_mask *mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
@@ -1120,7 +1120,11 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
 	if (a[OVS_FLOW_ATTR_KEY]) {
-		ovs_match_init(&match, &key, true, &mask);
+		mask = kmalloc(sizeof(struct sw_flow_mask), GFP_KERNEL);
+		if (!mask)
+			return -ENOMEM;
+
+		ovs_match_init(&match, &key, true, mask);
 		error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
 					  a[OVS_FLOW_ATTR_MASK], log);
} else if (!ufid_present) {
@@ -1141,7 +1145,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
}
 
 		acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
-					&mask, log);
+					mask, log);
if (IS_ERR(acts)) {
error = PTR_ERR(acts);
goto error;
@@ -1216,6 +1220,7 @@ err_unlock_ovs:
 err_kfree_acts:
ovs_nla_free_flow_actions(acts);
 error:
+   kfree(mask);
return error;
 }
 
-- 
1.8.3.1
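
A note on the pattern used here: when a large on-stack object is moved
to the heap, initializing the pointer to NULL keeps a single unwind
label safe, because kfree(NULL) is a no-op. A minimal sketch of that
shape (hypothetical names, not this exact patch):

	static int demo_set(bool have_key)
	{
		struct sw_flow_mask *mask = NULL;	/* NULL so the kfree() below is always safe */
		int error = 0;

		if (have_key) {
			mask = kmalloc(sizeof(*mask), GFP_KERNEL);
			if (!mask)
				return -ENOMEM;
		}

		/* ... use mask, setting 'error' on failure ... */

		kfree(mask);				/* no-op when mask was never allocated */
		return error;
	}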



[PATCH] datapath: Avoid using stack larger than 1024.

2017-06-27 Thread Tonghao Zhang
When compiling OvS-master on 4.4.0-81 kernel,
there is a warning:

CC [M]  /root/ovs/datapath/linux/datapath.o
/root/ovs/datapath/linux/datapath.c: In function
‘ovs_flow_cmd_set’:
/root/ovs/datapath/linux/datapath.c:1221:1: warning:
the frame size of 1040 bytes is larger than 1024 bytes
[-Wframe-larger-than=]

This patch uses kmalloc() to allocate memory for the sw_flow_mask
and avoids using the stack.

Signed-off-by: Tonghao Zhang <xiangxia.m@gmail.com>
---
 datapath/datapath.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/datapath/datapath.c b/datapath/datapath.c
index c85029c..da8cd68 100644
--- a/datapath/datapath.c
+++ b/datapath/datapath.c
@@ -1107,7 +1107,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct ovs_header *ovs_header = info->userhdr;
struct sw_flow_key key;
struct sw_flow *flow;
-   struct sw_flow_mask mask;
+   struct sw_flow_mask *mask;
struct sk_buff *reply = NULL;
struct datapath *dp;
struct sw_flow_actions *old_acts = NULL, *acts = NULL;
@@ -1120,7 +1120,11 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
 
 	ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
 	if (a[OVS_FLOW_ATTR_KEY]) {
-		ovs_match_init(&match, &key, true, &mask);
+		mask = kmalloc(sizeof(struct sw_flow_mask), GFP_KERNEL);
+		if (!mask)
+			return -ENOMEM;
+
+		ovs_match_init(&match, &key, true, mask);
 		error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
 					  a[OVS_FLOW_ATTR_MASK], log);
} else if (!ufid_present) {
@@ -1141,7 +1145,7 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
}
 
 		acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
-					&mask, log);
+					mask, log);
if (IS_ERR(acts)) {
error = PTR_ERR(acts);
goto error;
@@ -1216,6 +1220,7 @@ err_unlock_ovs:
 err_kfree_acts:
ovs_nla_free_flow_actions(acts);
 error:
+   kfree(mask);
return error;
 }
 
-- 
1.8.3.1


