Re: [PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-12 Thread Julian Anastasov

Hello,

On Thu, 12 Apr 2018, Stephen Suryaputra wrote:

> Thanks for the feedbacks. Please see the detail below:
> 
> On Wed, Apr 11, 2018 at 3:37 PM, Julian Anastasov  wrote:
> [snip]
> >> - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
> >> + __IP_INC_STATS(net, skb_dst(skb)->dev, IPSTATS_MIB_INHDRERRORS);
> >
> > May be skb->dev if we want to account it to the
> > input device.
> >
> Yes. I'm about to make change it but see the next one.
> 
> [snip]
> >> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c 
> >> b/net/netfilter/ipvs/ip_vs_xmit.c
> >> index 4527921..32bd3af 100644
> >> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> >> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> >> @@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs 
> >> *ipvs,
> >>   {
> >>   if (ip_hdr(skb)->ttl <= 1) {
> >>   /* Tell the sender its packet died... */
> >> - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
> >> + __IP_INC_STATS(net, skb_dst(skb)->dev, 
> >> IPSTATS_MIB_INHDRERRORS);
> >
> > At this point, skb_dst(skb) can be:
> >
> > - input route at LOCAL_IN => dst->dev is "lo", skb->dev = input_device
> > - output route at LOCAL_OUT => dst->dev is output_device, skb->dev = NULL
> >
> > We should see this error on LOCAL_IN but better to be
> > safe: use 'skb->dev ? : skb_dst(skb)->dev' instead of just
> > 'skb_dst(skb)->dev'.
> >
> This follows v6 implementation in the same function:
> 
> #ifdef CONFIG_IP_VS_IPV6
> if (skb_af == AF_INET6) {
> struct dst_entry *dst = skb_dst(skb);
> 
> /* check and decrement ttl */
> if (ipv6_hdr(skb)->hop_limit <= 1) {
> /* Force OUTPUT device used as source address */

It looks like IPVS copied it from ip6_forward() but in
IPVS context it has its reason: we want ICMP to exit with
saddr=Virtual_IP. And we are at LOCAL_IN where there is no
output device like in ip6_forward(FORWARD) to use its source
address.

So, IPVS is special (both input and output path) and needs:

IPv4: skb->dev ? : skb_dst(skb)->dev
IPv6 needs fix for IPVS stats in decrement_ttl:

idev = skb->dev ? __in6_dev_get(skb->dev) : ip6_dst_idev(dst);
...
__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

Otherwise, stats will go to "lo" if ip6_dst_idev
is used for local input route.

So, for accounting on input IPv4 path skb->dev should be
used, while for IPv6 some sites may prefer to feed icmpv6_send()
with output dst->dev as device containing the source address (skb->dev).
But this is unrelated to the stats.

> skb->dev = dst->dev;
> icmpv6_send(skb, ICMPV6_TIME_EXCEED,
> ICMPV6_EXC_HOPLIMIT, 0);
> __IP6_INC_STATS(net, ip6_dst_idev(dst),
> IPSTATS_MIB_INHDRERRORS);
> 
> return false;
> }
> 
> /* don't propagate ttl change to cloned packets */
> if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
> return false;
> 
> ipv6_hdr(skb)->hop_limit--;
> } else
> #endif
> 
> [snip]
> >
> > The patch probably has other errors, for example,
> > using rt->dst.dev (lo) when rt->dst.error != 0 in ip_error,
> > may be 'dev' should be used instead...
> 
> Same also here. Examples are ip6_forward and ip6_pkt_drop.
> 
> I think it's better be counted in the input device for them also. Thoughts?

I think so. ipv6_rcv() works with idev = __in6_dev_get(skb->dev)
but I don't know IPv6 well and whether ip6_dst_idev(skb_dst(skb))
is correct usage for input path. It should be correct for output
path, though.

Regards

--
Julian Anastasov 


Re: [PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-12 Thread Stephen Suryaputra
Thanks for the feedbacks. Please see the detail below:

On Wed, Apr 11, 2018 at 3:37 PM, Julian Anastasov  wrote:
[snip]
>> - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
>> + __IP_INC_STATS(net, skb_dst(skb)->dev, IPSTATS_MIB_INHDRERRORS);
>
> May be skb->dev if we want to account it to the
> input device.
>
Yes. I'm about to make change it but see the next one.

[snip]
>> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c 
>> b/net/netfilter/ipvs/ip_vs_xmit.c
>> index 4527921..32bd3af 100644
>> --- a/net/netfilter/ipvs/ip_vs_xmit.c
>> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
>> @@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
>>   {
>>   if (ip_hdr(skb)->ttl <= 1) {
>>   /* Tell the sender its packet died... */
>> - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
>> + __IP_INC_STATS(net, skb_dst(skb)->dev, 
>> IPSTATS_MIB_INHDRERRORS);
>
> At this point, skb_dst(skb) can be:
>
> - input route at LOCAL_IN => dst->dev is "lo", skb->dev = input_device
> - output route at LOCAL_OUT => dst->dev is output_device, skb->dev = NULL
>
> We should see this error on LOCAL_IN but better to be
> safe: use 'skb->dev ? : skb_dst(skb)->dev' instead of just
> 'skb_dst(skb)->dev'.
>
This follows v6 implementation in the same function:

#ifdef CONFIG_IP_VS_IPV6
if (skb_af == AF_INET6) {
struct dst_entry *dst = skb_dst(skb);

/* check and decrement ttl */
if (ipv6_hdr(skb)->hop_limit <= 1) {
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
__IP6_INC_STATS(net, ip6_dst_idev(dst),
IPSTATS_MIB_INHDRERRORS);

return false;
}

/* don't propagate ttl change to cloned packets */
if (!skb_make_writable(skb, sizeof(struct ipv6hdr)))
return false;

ipv6_hdr(skb)->hop_limit--;
} else
#endif

[snip]
>
> The patch probably has other errors, for example,
> using rt->dst.dev (lo) when rt->dst.error != 0 in ip_error,
> may be 'dev' should be used instead...

Same also here. Examples are ip6_forward and ip6_pkt_drop.

I think it's better be counted in the input device for them also. Thoughts?

Regards,
Stephen.


Re: [PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-12 Thread kbuild test robot
Hi Stephen,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Stephen-Suryaputra/Per-interface-IPv4-stats-CONFIG_IP_IFSTATS_TABLE/20180412-181719
reproduce:
# apt-get install sparse
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

   net/ipv4/proc.c:414:28: sparse: Variable length array is used.
>> net/ipv4/proc.c:499:43: sparse: incorrect type in argument 1 (different 
>> address spaces) @@expected void [noderef] *mib @@got vvoid 
>> [noderef] *mib @@
   net/ipv4/proc.c:499:43:expected void [noderef] *mib
   net/ipv4/proc.c:499:43:got void [noderef] **pcpumib
>> net/ipv4/proc.c:532:34: sparse: cast removes address space of expression
   net/ipv4/proc.c:534:34: sparse: cast removes address space of expression

vim +499 net/ipv4/proc.c

   411  
   412  static int snmp_seq_show_tcp_udp(struct seq_file *seq, void *v)
   413  {
 > 414  unsigned long buff[TCPUDP_MIB_MAX];
   415  struct net *net = seq->private;
   416  int i;
   417  
   418  memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
   419  
   420  seq_puts(seq, "\nTcp:");
   421  for (i = 0; snmp4_tcp_list[i].name; i++)
   422  seq_printf(seq, " %s", snmp4_tcp_list[i].name);
   423  
   424  seq_puts(seq, "\nTcp:");
   425  snmp_get_cpu_field_batch(buff, snmp4_tcp_list,
   426   net->mib.tcp_statistics);
   427  for (i = 0; snmp4_tcp_list[i].name; i++) {
   428  /* MaxConn field is signed, RFC 2012 */
   429  if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN)
   430  seq_printf(seq, " %ld", buff[i]);
   431  else
   432  seq_printf(seq, " %lu", buff[i]);
   433  }
   434  
   435  memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
   436  
   437  snmp_get_cpu_field_batch(buff, snmp4_udp_list,
   438   net->mib.udp_statistics);
   439  seq_puts(seq, "\nUdp:");
   440  for (i = 0; snmp4_udp_list[i].name; i++)
   441  seq_printf(seq, " %s", snmp4_udp_list[i].name);
   442  seq_puts(seq, "\nUdp:");
   443  for (i = 0; snmp4_udp_list[i].name; i++)
   444  seq_printf(seq, " %lu", buff[i]);
   445  
   446  memset(buff, 0, TCPUDP_MIB_MAX * sizeof(unsigned long));
   447  
   448  /* the UDP and UDP-Lite MIBs are the same */
   449  seq_puts(seq, "\nUdpLite:");
   450  snmp_get_cpu_field_batch(buff, snmp4_udp_list,
   451   net->mib.udplite_statistics);
   452  for (i = 0; snmp4_udp_list[i].name; i++)
   453  seq_printf(seq, " %s", snmp4_udp_list[i].name);
   454  seq_puts(seq, "\nUdpLite:");
   455  for (i = 0; snmp4_udp_list[i].name; i++)
   456  seq_printf(seq, " %lu", buff[i]);
   457  
   458  seq_putc(seq, '\n');
   459  return 0;
   460  }
   461  
   462  static int snmp_seq_show(struct seq_file *seq, void *v)
   463  {
   464  snmp_seq_show_ipstats(seq, v);
   465  
   466  icmp_put(seq);  /* RFC 2011 compatibility */
   467  icmpmsg_put(seq);
   468  
   469  snmp_seq_show_tcp_udp(seq, v);
   470  
   471  return 0;
   472  }
   473  
   474  static int snmp_seq_open(struct inode *inode, struct file *file)
   475  {
   476  return single_open_net(inode, file, snmp_seq_show);
   477  }
   478  
   479  static const struct file_operations snmp_seq_fops = {
   480  .open= snmp_seq_open,
   481  .read= seq_read,
   482  .llseek  = seq_lseek,
   483  .release = single_release_net,
   484  };
   485  
   486  
   487  #ifdef CONFIG_IP_IFSTATS_TABLE
   488  static void snmp_seq_show_item(struct seq_file *seq, void __percpu 
**pcpumib,
   489 atomic_long_t *smib,
   490 const struct snmp_mib *itemlist,
   491 char *prefix)
   492  {
   493  char name[32];
   494  int i;
   495  unsigned long val;
   496  
   497  for (i = 0; itemlist[i].name; i++) {
   498  val = pcpumib ?
 > 499  snmp_fold_field64(pcpumib, itemlist[i].entry,
   500offsetof(struct ipstats_mib, 
syncp)) :
   501  atomic_long_read(smib + itemlist[i].entry);
   502  snprintf(name, sizeof(name), "%s%s",
   503   prefix, itemlist[i].name);
   504  seq_printf(seq, "%-32s\t%lu\n", name, val);
   505  }
   506  }
   507  
   508  static void 

Re: [PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-11 Thread Julian Anastasov

Hello,

On Tue, 10 Apr 2018, Stephen Suryaputra wrote:

> This is enhanced from the proposed patch by Igor Maravic in 2011 to
> support per interface IPv4 stats. The enhancement is mainly adding a
> kernel configuration option CONFIG_IP_IFSTATS_TABLE.
> 
> Signed-off-by: Stephen Suryaputra 
> ---

...

> diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
> index b54b948..bb9be11 100644
> --- a/net/ipv4/ip_forward.c
> +++ b/net/ipv4/ip_forward.c
> @@ -66,8 +66,8 @@ static int ip_forward_finish(struct net *net, struct sock 
> *sk, struct sk_buff *s
>  {
>   struct ip_options *opt  = &(IPCB(skb)->opt);
>  
> - __IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
> - __IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
> + __IP_INC_STATS(net, skb_dst(skb)->dev, IPSTATS_MIB_OUTFORWDATAGRAMS);
> + __IP_ADD_STATS(net, skb_dst(skb)->dev, IPSTATS_MIB_OUTOCTETS, skb->len);
>  
>   if (unlikely(opt->optlen))
>   ip_forward_options(skb);
> @@ -121,7 +121,7 @@ int ip_forward(struct sk_buff *skb)
>   IPCB(skb)->flags |= IPSKB_FORWARDED;
>   mtu = ip_dst_mtu_maybe_forward(>dst, true);
>   if (ip_exceeds_mtu(skb, mtu)) {
> - IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
> + IP_INC_STATS(net, rt->dst.dev, IPSTATS_MIB_FRAGFAILS);
>   icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
> htonl(mtu));
>   goto drop;
> @@ -158,7 +158,7 @@ int ip_forward(struct sk_buff *skb)
>  
>  too_many_hops:
>   /* Tell the sender its packet died... */
> - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
> + __IP_INC_STATS(net, skb_dst(skb)->dev, IPSTATS_MIB_INHDRERRORS);

May be skb->dev if we want to account it to the
input device.

>   icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
>  drop:
>   kfree_skb(skb);

...

> diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
> index 4527921..32bd3af 100644
> --- a/net/netfilter/ipvs/ip_vs_xmit.c
> +++ b/net/netfilter/ipvs/ip_vs_xmit.c
> @@ -286,7 +286,7 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
>   {
>   if (ip_hdr(skb)->ttl <= 1) {
>   /* Tell the sender its packet died... */
> - __IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
> + __IP_INC_STATS(net, skb_dst(skb)->dev, 
> IPSTATS_MIB_INHDRERRORS);

At this point, skb_dst(skb) can be:

- input route at LOCAL_IN => dst->dev is "lo", skb->dev = input_device
- output route at LOCAL_OUT => dst->dev is output_device, skb->dev = NULL

We should see this error on LOCAL_IN but better to be
safe: use 'skb->dev ? : skb_dst(skb)->dev' instead of just
'skb_dst(skb)->dev'.

>   icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0);
>   return false;
>   }

The patch probably has other errors, for example,
using rt->dst.dev (lo) when rt->dst.error != 0 in ip_error,
may be 'dev' should be used instead...

Regards

--
Julian Anastasov 


Re: [PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-11 Thread Stephen Suryaputra
I read the thread and understand some of the concerns, but these are
useful for network equipments. That's why in this patch, I made it as
configuration option.
If you have feedbacks, I'll appreciate them.

I can resend when net-next is opened again.

Thanks.

On Wed, Apr 11, 2018 at 12:14 PM, Stephen Hemminger
 wrote:
> On Tue, 10 Apr 2018 22:55:35 -0400
> Stephen Suryaputra  wrote:
>
>> This is enhanced from the proposed patch by Igor Maravic in 2011 to
>> support per interface IPv4 stats. The enhancement is mainly adding a
>> kernel configuration option CONFIG_IP_IFSTATS_TABLE.
>>
>> Signed-off-by: Stephen Suryaputra 
>
>
> Net-next is closed. http://vger.kernel.org/~davem/net-next.html
>
> Also, these kind of statistics have a negative performance impact.


Re: [PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-11 Thread Stephen Hemminger
On Tue, 10 Apr 2018 22:55:35 -0400
Stephen Suryaputra  wrote:

> This is enhanced from the proposed patch by Igor Maravic in 2011 to
> support per interface IPv4 stats. The enhancement is mainly adding a
> kernel configuration option CONFIG_IP_IFSTATS_TABLE.
> 
> Signed-off-by: Stephen Suryaputra 


Net-next is closed. http://vger.kernel.org/~davem/net-next.html

Also, these kind of statistics have a negative performance impact.


[PATCH net-next] Per interface IPv4 stats (CONFIG_IP_IFSTATS_TABLE)

2018-04-11 Thread Stephen Suryaputra
This is enhanced from the proposed patch by Igor Maravic in 2011 to
support per interface IPv4 stats. The enhancement is mainly adding a
kernel configuration option CONFIG_IP_IFSTATS_TABLE.

Signed-off-by: Stephen Suryaputra 
---
 drivers/net/vrf.c   |   2 +-
 include/linux/inetdevice.h  |  22 +++
 include/net/icmp.h  |  44 --
 include/net/ip.h|  72 +--
 include/net/ipv6.h  |  62 
 include/net/netns/mib.h |   3 +
 include/net/snmp.h  |  95 ++
 net/bridge/br_netfilter_hooks.c |  10 ++--
 net/dccp/ipv4.c |   4 +-
 net/ipv4/Kconfig|   8 +++
 net/ipv4/af_inet.c  |   7 ++-
 net/ipv4/datagram.c |   2 +-
 net/ipv4/devinet.c  |  85 ++-
 net/ipv4/icmp.c |  32 +-
 net/ipv4/inet_connection_sock.c |   8 ++-
 net/ipv4/ip_forward.c   |   8 +--
 net/ipv4/ip_fragment.c  |  20 ---
 net/ipv4/ip_input.c |  29 -
 net/ipv4/ip_output.c|  40 -
 net/ipv4/ipmr.c |   6 +-
 net/ipv4/ping.c |   9 ++-
 net/ipv4/proc.c | 126 
 net/ipv4/raw.c  |   4 +-
 net/ipv4/route.c|   6 +-
 net/ipv4/tcp_ipv4.c |   4 +-
 net/ipv4/udp.c  |   4 +-
 net/l2tp/l2tp_ip.c  |   4 +-
 net/l2tp/l2tp_ip6.c |   2 +-
 net/mpls/af_mpls.c  |   2 +-
 net/netfilter/ipvs/ip_vs_xmit.c |   2 +-
 net/sctp/input.c|   2 +-
 net/sctp/output.c   |   2 +-
 32 files changed, 570 insertions(+), 156 deletions(-)

diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
index 0a2b180..509c2ca 100644
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -592,7 +592,7 @@ static int vrf_output(struct net *net, struct sock *sk, 
struct sk_buff *skb)
 {
struct net_device *dev = skb_dst(skb)->dev;
 
-   IP_UPD_PO_STATS(net, IPSTATS_MIB_OUT, skb->len);
+   IP_UPD_PO_STATS(net, dev, IPSTATS_MIB_OUT, skb->len);
 
skb->dev = dev;
skb->protocol = htons(ETH_P_IP);
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index e16fe7d..3d120cb 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -22,6 +22,15 @@ struct ipv4_devconf {
 
 #define MC_HASH_SZ_LOG 9
 
+#ifdef CONFIG_IP_IFSTATS_TABLE
+struct ipv4_devstat {
+   struct proc_dir_entry *proc_dir_entry;
+   DEFINE_SNMP_STAT(struct ipstats_mib, ip);
+   DEFINE_SNMP_STAT_ATOMIC(struct icmp_mib_device, icmpdev);
+   DEFINE_SNMP_STAT_ATOMIC(struct icmpmsg_mib_device, icmpmsgdev);
+};
+#endif
+
 struct in_device {
struct net_device   *dev;
refcount_t  refcnt;
@@ -45,6 +54,9 @@ struct in_device {
 
struct neigh_parms  *arp_parms;
struct ipv4_devconf cnf;
+#ifdef CONFIG_IP_IFSTATS_TABLE
+   struct ipv4_devstat stats;
+#endif
struct rcu_head rcu_head;
 };
 
@@ -216,6 +228,16 @@ static inline struct in_device *__in_dev_get_rcu(const 
struct net_device *dev)
return rcu_dereference(dev->ip_ptr);
 }
 
+#ifdef CONFIG_IP_IFSTATS_TABLE
+static inline struct in_device *__in_dev_get_rcu_safely(const struct 
net_device *dev)
+{
+   if (likely(dev))
+   return rcu_dereference(dev->ip_ptr);
+   else
+   return NULL;
+}
+#endif
+
 static inline struct in_device *in_dev_get(const struct net_device *dev)
 {
struct in_device *in_dev;
diff --git a/include/net/icmp.h b/include/net/icmp.h
index 3ef2743..fdfbc0f 100644
--- a/include/net/icmp.h
+++ b/include/net/icmp.h
@@ -29,10 +29,44 @@ struct icmp_err {
 };
 
 extern const struct icmp_err icmp_err_convert[];
-#define ICMP_INC_STATS(net, field) 
SNMP_INC_STATS((net)->mib.icmp_statistics, field)
-#define __ICMP_INC_STATS(net, field)   
__SNMP_INC_STATS((net)->mib.icmp_statistics, field)
-#define ICMPMSGOUT_INC_STATS(net, field)   
SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field+256)
-#define ICMPMSGIN_INC_STATS(net, field)
SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field)
+#ifdef CONFIG_IP_IFSTATS_TABLE
+#define ICMP_INC_STATS(net, dev, field)\
+   ({  \
+   rcu_read_lock();\
+   _DEVINCATOMIC(net, icmp, struct in_device,  \
+   __in_dev_get_rcu_safely(dev), field);   \
+   rcu_read_unlock();  \
+   })
+
+#define __ICMP_INC_STATS(net, dev, field)  \
+   ({  \
+   rcu_read_lock();\
+   ___DEVINCATOMIC(net, icmp, struct in_device,\
+   __in_dev_get_rcu_safely(dev), field);   \
+   rcu_read_unlock();  \
+   })
+
+#define