[PATCH net] selftests: set sysctl bc_forwarding properly in router_broadcast.sh

2019-06-02 Thread Xin Long
The sysctl setting bc_forwarding for $rp2 is needed when doing
ping_test_from h2, otherwise the broadcast packets received on $rp2
won't be forwarded. This patch adds this setting for $rp2.

Also, as ping_test_from only does grep "$from", which could match some
unexpected output, some test cases don't really work, like:

  # ping_test_from $h2 198.51.200.255 198.51.200.2
PING 198.51.200.255 from 198.51.100.2 veth3: 56(84) bytes of data.
64 bytes from 198.51.100.1: icmp_seq=1 ttl=64 time=0.336 ms

When doing grep $from (198.51.200.2), the output above could still match.
So change to grep "bytes from $from" instead.

Fixes: 40f98b9af943 ("selftests: add a selftest for directed broadcast forwarding")
Signed-off-by: Xin Long 
---
 tools/testing/selftests/net/forwarding/router_broadcast.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/forwarding/router_broadcast.sh 
b/tools/testing/selftests/net/forwarding/router_broadcast.sh
index 9a678ec..4eac0a0 100755
--- a/tools/testing/selftests/net/forwarding/router_broadcast.sh
+++ b/tools/testing/selftests/net/forwarding/router_broadcast.sh
@@ -145,16 +145,19 @@ bc_forwarding_disable()
 {
sysctl_set net.ipv4.conf.all.bc_forwarding 0
sysctl_set net.ipv4.conf.$rp1.bc_forwarding 0
+   sysctl_set net.ipv4.conf.$rp2.bc_forwarding 0
 }
 
 bc_forwarding_enable()
 {
sysctl_set net.ipv4.conf.all.bc_forwarding 1
sysctl_set net.ipv4.conf.$rp1.bc_forwarding 1
+   sysctl_set net.ipv4.conf.$rp2.bc_forwarding 1
 }
 
 bc_forwarding_restore()
 {
+   sysctl_restore net.ipv4.conf.$rp2.bc_forwarding
sysctl_restore net.ipv4.conf.$rp1.bc_forwarding
sysctl_restore net.ipv4.conf.all.bc_forwarding
 }
@@ -171,7 +174,7 @@ ping_test_from()
log_info "ping $dip, expected reply from $from"
ip vrf exec $(master_name_get $oif) \
$PING -I $oif $dip -c 10 -i 0.1 -w $PING_TIMEOUT -b 2>&1 \
-   | grep $from &> /dev/null
+   | grep "bytes from $from" > /dev/null
check_err_fail $fail $?
 }
 
-- 
2.1.0



[PATCH net] ipv4: not do cache for local delivery if bc_forwarding is enabled

2019-06-02 Thread Xin Long
With the topo:

  h1 ---| rp1           |
        | route     rp3 |--- h3 (192.168.200.1)
  h2 ---| rp2           |

If rp1's bc_forwarding is set while rp2's is not, then after doing
"ping 192.168.200.255" on h1 followed by "ping 192.168.200.255" on h2,
the packets from h2 can still be forwarded.

This issue was caused by the input route cache. The route should be
cached only for either bc forwarding or local delivery. Otherwise,
local delivery can end up using a route cached for bc forwarding on
another interface.

This patch fixes it by not caching the route for local delivery if
all.bc_forwarding is enabled.

Note that we don't fix it by checking the route cache's local flag after
rt_cache_valid() in "local_input:" and "ip_mkroute_input", as the common
route code shouldn't be touched for bc_forwarding.

Fixes: 5cbf777cfdf6 ("route: add support for directed broadcast forwarding")
Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 net/ipv4/route.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11ddc27..91bf75b 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1985,7 +1985,7 @@ static int ip_route_input_slow(struct sk_buff *skb, 
__be32 daddr, __be32 saddr,
u32 itag = 0;
struct rtable   *rth;
struct flowi4   fl4;
-   bool do_cache;
+   bool do_cache = true;
 
/* IP on this device is disabled. */
 
@@ -2062,6 +2062,9 @@ static int ip_route_input_slow(struct sk_buff *skb, 
__be32 daddr, __be32 saddr,
if (res->type == RTN_BROADCAST) {
if (IN_DEV_BFORWARD(in_dev))
goto make_route;
+   /* not do cache if bc_forwarding is enabled */
+   if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
+   do_cache = false;
goto brd_input;
}
 
@@ -2099,18 +2102,15 @@ out:return err;
RT_CACHE_STAT_INC(in_brd);
 
 local_input:
-   do_cache = false;
-   if (res->fi) {
-   if (!itag) {
-   struct fib_nh_common *nhc = FIB_RES_NHC(*res);
+   do_cache &= res->fi && !itag;
+   if (do_cache) {
+   struct fib_nh_common *nhc = FIB_RES_NHC(*res);
 
-   rth = rcu_dereference(nhc->nhc_rth_input);
-   if (rt_cache_valid(rth)) {
-   skb_dst_set_noref(skb, &rth->dst);
-   err = 0;
-   goto out;
-   }
-   do_cache = true;
+   rth = rcu_dereference(nhc->nhc_rth_input);
+   if (rt_cache_valid(rth)) {
+   skb_dst_set_noref(skb, &rth->dst);
+   err = 0;
+   goto out;
}
}
 
-- 
2.1.0



[PATCH net] ipv6: fix the check before getting the cookie in rt6_get_cookie

2019-06-02 Thread Xin Long
In Jianlin's testing, netperf was broken with 'Connection reset by peer',
as the cookie check failed in rt6_check() and ip6_dst_check() always
returned NULL.

It's caused by commit 93531c674315 ("net/ipv6: separate handling of FIB
entries from dst based routes"), where the cookie can only be obtained
when 'c1' (see below) holds, for setting dst_cookie, whereas rt6_check()
is called when 'c1' does not hold, for checking dst_cookie, as we can
see in ip6_dst_check().

Since in ip6_dst_check() both rt6_dst_from_check() ('c1') and rt6_check()
(!'c1') will check the 'from' cookie, this patch removes the 'c1' check
in rt6_get_cookie(), so that dst_cookie can always be set properly.

c1:
  (rt->rt6i_flags & RTF_PCPU || unlikely(!list_empty(&rt->rt6i_uncached)))

Fixes: 93531c674315 ("net/ipv6: separate handling of FIB entries from dst based routes")
Reported-by: Jianlin Shi 
Signed-off-by: Xin Long 
---
 include/net/ip6_fib.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 525f701..d6d936c 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -263,8 +263,7 @@ static inline u32 rt6_get_cookie(const struct rt6_info *rt)
rcu_read_lock();
 
from = rcu_dereference(rt->from);
-   if (from && (rt->rt6i_flags & RTF_PCPU ||
-   unlikely(!list_empty(&rt->rt6i_uncached))))
+   if (from)
fib6_get_cookie_safe(from, &cookie);
 
rcu_read_unlock();
-- 
2.1.0



Re: [PATCH V2] Fix memory leak in sctp_process_init

2019-06-04 Thread Xin Long
On Tue, Jun 4, 2019 at 4:34 AM Neil Horman  wrote:
>
> syzbot found the following leak in sctp_process_init
> BUG: memory leak
> unreferenced object 0x88810ef68400 (size 1024):
>   comm "syz-executor273", pid 7046, jiffies 4294945598 (age 28.770s)
>   hex dump (first 32 bytes):
> 1d de 28 8d de 0b 1b e3 b5 c2 f9 68 fd 1a 97 25  ..(h...%
> 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
>   backtrace:
> [] kmemleak_alloc_recursive include/linux/kmemleak.h:55
> [inline]
> [] slab_post_alloc_hook mm/slab.h:439 [inline]
> [] slab_alloc mm/slab.c:3326 [inline]
> [] __do_kmalloc mm/slab.c:3658 [inline]
> [] __kmalloc_track_caller+0x15d/0x2c0 mm/slab.c:3675
> [<9e6245e6>] kmemdup+0x27/0x60 mm/util.c:119
> [] kmemdup include/linux/string.h:432 [inline]
> [] sctp_process_init+0xa7e/0xc20
> net/sctp/sm_make_chunk.c:2437
> [] sctp_cmd_process_init net/sctp/sm_sideeffect.c:682
> [inline]
> [] sctp_cmd_interpreter net/sctp/sm_sideeffect.c:1384
> [inline]
> [] sctp_side_effects net/sctp/sm_sideeffect.c:1194
> [inline]
> [] sctp_do_sm+0xbdc/0x1d60 net/sctp/sm_sideeffect.c:1165
> [<44e11f96>] sctp_assoc_bh_rcv+0x13c/0x200
> net/sctp/associola.c:1074
> [] sctp_inq_push+0x7f/0xb0 net/sctp/inqueue.c:95
> [<726aa954>] sctp_backlog_rcv+0x5e/0x2a0 net/sctp/input.c:354
> [] sk_backlog_rcv include/net/sock.h:950 [inline]
> [] __release_sock+0xab/0x110 net/core/sock.c:2418
> [] release_sock+0x37/0xd0 net/core/sock.c:2934
> [<963cc9ae>] sctp_sendmsg+0x2c0/0x990 net/sctp/socket.c:2122
> [] inet_sendmsg+0x64/0x120 net/ipv4/af_inet.c:802
> [] sock_sendmsg_nosec net/socket.c:652 [inline]
> [] sock_sendmsg+0x54/0x70 net/socket.c:671
> [<274c57ab>] ___sys_sendmsg+0x393/0x3c0 net/socket.c:2292
> [<8252aedb>] __sys_sendmsg+0x80/0xf0 net/socket.c:2330
> [] __do_sys_sendmsg net/socket.c:2339 [inline]
> [] __se_sys_sendmsg net/socket.c:2337 [inline]
> [] __x64_sys_sendmsg+0x23/0x30 net/socket.c:2337
> [] do_syscall_64+0x76/0x1a0 arch/x86/entry/common.c:3
>
> The problem was that the peer.cookie value points to an skb allocated
> area on the first pass through this function, at which point it is
> overwritten with a heap allocated value, but in certain cases, where a
> COOKIE_ECHO chunk is included in the packet, a second pass through
> sctp_process_init is made, where the cookie value is re-allocated,
> leaking the first allocation.
This is not going to happen, as after processing the INIT, the temp asoc
will be deleted on the server side. Besides, from the reproducer:

  https://syzkaller.appspot.com/x/repro.syz?x=10e32f8ca0

a Packet(INIT|COOKIE_ECHO) can't be generated here.

The call trace says the leak happened when processing INIT_ACK on the
client side, as Marcelo noticed.  Then it can be triggered by:

1. sctp_sf_do_5_1C_ack() -> SCTP_CMD_PEER_INIT -> sctp_process_init():
   where it does "goto clean_up" after sctp_process_param(), but the
   'clean_up' path doesn't free any of the memdups done by
   sctp_process_param(). The client's T1 timer then retransmits the INIT,
   it later gets an INIT_ACK again from the peer, and goes back into
   sctp_process_init() to memdup() once more.

2. sctp_sf_cookie_echoed_err() -> sctp_sf_do_5_2_6_stale():
   where the asoc state goes from COOKIE_ECHOED back to COOKIE_WAIT,
   T1 starts retransmitting the INIT, and later an INIT_ACK arrives
   again and reaches sctp_process_init() and memdup().

As in either case above, the asoc has never reached the ESTABLISHED state,
so asoc->peer.cookie may not be freed, and this patch won't cover it.
But yes, it's nice to have this patch, just not as the fix for this memleak.

I tracked the code; this memleak was triggered by case 2, so I think
you also need to add something like:

@@ -881,6 +893,18 @@ static void sctp_cmd_new_state(struct sctp_cmd_seq *cmds,
asoc->rto_initial;
asoc->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] =
asoc->rto_initial;
+
+   if (asoc->peer.cookie) {
+   kfree(asoc->peer.cookie);
+   kfree(asoc->peer.peer_random);
+   kfree(asoc->peer.peer_chunks);
+   kfree(asoc->peer.peer_hmacs);
+
+   asoc->peer.cookie = NULL;
+   asoc->peer.peer_random = NULL;
+   asoc->peer.peer_chunks = NULL;
+   asoc->peer.peer_hmacs = NULL;
+   }
}

and the same thing may

Re: [PATCH net] ipv6: fix the check before getting the cookie in rt6_get_cookie

2019-06-06 Thread Xin Long
On Thu, Jun 6, 2019 at 11:35 PM David Ahern  wrote:
>
> On 6/2/19 5:10 AM, Xin Long wrote:
> > In Jianlin's testing, netperf was broken with 'Connection reset by peer',
> > as the cookie check failed in rt6_check() and ip6_dst_check() always
> > returned NULL.
>
> Any particular test or setup that is causing the reset? I do not see
> that problem in general.
>
By running the script below, netperf ends up with:

Connection reset by peer
netperf: remote error 104


--

#!/bin/bash

ip netns add client
ip netns add server
ip netns add route

ip link add veth0_c type veth peer name veth0_cr
ip link add veth0_s type veth peer name veth0_sr
ip link set veth0_c netns client
ip link set veth0_s netns server
ip link set veth0_cr netns route
ip link set veth0_sr netns route

ip netns exec client ip link set lo up
ip netns exec client ip link set veth0_c up

ip netns exec server ip link set lo up
ip netns exec server ip link set veth0_s up

ip netns exec route ip link set lo up
ip netns exec route ip link set veth0_cr up
ip netns exec route ip link set veth0_sr up

ip netns exec client ip addr add 2000::1/64 dev veth0_c
ip netns exec client ip addr add 10.10.0.1/24 dev veth0_c
ip netns exec route ip addr add 2000::a/64 dev veth0_cr
ip netns exec route ip addr add 10.10.0.254/24 dev veth0_cr

ip netns exec server ip addr add 2001::1/64 dev veth0_s
ip netns exec server ip addr add 10.10.1.1/24 dev veth0_s
ip netns exec route ip addr add 2001::a/64 dev veth0_sr
ip netns exec route ip addr add 10.10.1.254/24 dev veth0_sr

ip netns exec client ip route add default via 10.10.0.254
ip netns exec client ip -6 route add default via 2000::a

ip netns exec server ip route add default via 10.10.1.254
ip netns exec server ip -6 route add default via 2001::a

ip netns exec route sysctl -w net.ipv4.conf.all.forwarding=1
ip netns exec route sysctl -w net.ipv6.conf.all.forwarding=1

ip netns exec client ping 10.10.1.1 -c 2
ip netns exec client ping6 2001::1 -c 2

ip netns exec route ip link set veth0_sr mtu 1400
ip netns exec server ip link set veth0_s mtu 1400

ip netns exec server netserver -d

ip netns exec client ping6 2001::1 -c 1 -s 1400 -M do
ip netns exec client ip route get 2001::1

ip netns exec client ip link set veth0_c mtu 1300
ip netns exec client ip route get 2001::1

ip netns exec client ip link set veth0_c mtu 1600
ip netns exec client ip route get 2001::1

ip netns exec client ping6 2001::1 -c 1 -s 1400 -M do
ip netns exec client ip route get 2001::1

ip netns exec client netperf -6 -H 2001::1 -t SCTP_STREAM -l 30 -- -m 16k &
#ip netns exec client netperf -6 -H 2001::1 -t TCP_STREAM -l 30 -- -m 16k &
#ip netns exec client netperf -6 -H 2001::1 -t UDP_STREAM -l 30 -- -R 1 &
pid=$!

for i in `seq 1 5`
do
ip netns exec client ip link set veth0_c mtu 1300
sleep 1
ip netns exec client ip link set veth0_c mtu 1600
sleep 1
done

wait $pid


Re: [PATCH V2] Fix memory leak in sctp_process_init

2019-06-08 Thread Xin Long
On Fri, Jun 7, 2019 at 8:48 PM Marcelo Ricardo Leitner
 wrote:
>
> On Fri, Jun 07, 2019 at 06:56:39AM -0400, Neil Horman wrote:
> > On Thu, Jun 06, 2019 at 12:47:55PM -0300, Marcelo Ricardo Leitner wrote:
> > > On Wed, Jun 05, 2019 at 07:20:10AM -0400, Neil Horman wrote:
> > > > On Wed, Jun 05, 2019 at 04:16:24AM +0800, Xin Long wrote:
> > > > > On Tue, Jun 4, 2019 at 4:34 AM Neil Horman  
> > > > > wrote:
> > > > > >
> > > > > > syzbot found the following leak in sctp_process_init
> > > > > > BUG: memory leak
> > > > > > unreferenced object 0x88810ef68400 (size 1024):
> > > > > >   comm "syz-executor273", pid 7046, jiffies 4294945598 (age 28.770s)
> > > > > >   hex dump (first 32 bytes):
> > > > > > 1d de 28 8d de 0b 1b e3 b5 c2 f9 68 fd 1a 97 25  
> > > > > > ..(h...%
> > > > > > 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> > > > > > 
> > > > > >   backtrace:
> > > > > > [<a02cebbd>] kmemleak_alloc_recursive 
> > > > > > include/linux/kmemleak.h:55
> > > > > > [inline]
> > > > > > [<a02cebbd>] slab_post_alloc_hook mm/slab.h:439 [inline]
> > > > > > [<a02cebbd>] slab_alloc mm/slab.c:3326 [inline]
> > > > > > [<a02cebbd>] __do_kmalloc mm/slab.c:3658 [inline]
> > > > > > [<a02cebbd>] __kmalloc_track_caller+0x15d/0x2c0 
> > > > > > mm/slab.c:3675
> > > > > > [<9e6245e6>] kmemdup+0x27/0x60 mm/util.c:119
> > > > > > [<dfdc5d2d>] kmemdup include/linux/string.h:432 [inline]
> > > > > > [<dfdc5d2d>] sctp_process_init+0xa7e/0xc20
> > > > > > net/sctp/sm_make_chunk.c:2437
> > > > > > [<b58b62f8>] sctp_cmd_process_init 
> > > > > > net/sctp/sm_sideeffect.c:682
> > > > > > [inline]
> > > > > > [<b58b62f8>] sctp_cmd_interpreter 
> > > > > > net/sctp/sm_sideeffect.c:1384
> > > > > > [inline]
> > > > > > [<b58b62f8>] sctp_side_effects 
> > > > > > net/sctp/sm_sideeffect.c:1194
> > > > > > [inline]
> > > > > > [<b58b62f8>] sctp_do_sm+0xbdc/0x1d60 
> > > > > > net/sctp/sm_sideeffect.c:1165
> > > > > > [<44e11f96>] sctp_assoc_bh_rcv+0x13c/0x200
> > > > > > net/sctp/associola.c:1074
> > > > > > [<ec43804d>] sctp_inq_push+0x7f/0xb0 
> > > > > > net/sctp/inqueue.c:95
> > > > > > [<726aa954>] sctp_backlog_rcv+0x5e/0x2a0 
> > > > > > net/sctp/input.c:354
> > > > > > [<d9e249a8>] sk_backlog_rcv include/net/sock.h:950 
> > > > > > [inline]
> > > > > > [<d9e249a8>] __release_sock+0xab/0x110 
> > > > > > net/core/sock.c:2418
> > > > > > [<acae44fa>] release_sock+0x37/0xd0 net/core/sock.c:2934
> > > > > > [<963cc9ae>] sctp_sendmsg+0x2c0/0x990 
> > > > > > net/sctp/socket.c:2122
> > > > > > [<a7fc7565>] inet_sendmsg+0x64/0x120 
> > > > > > net/ipv4/af_inet.c:802
> > > > > > [<b732cbd3>] sock_sendmsg_nosec net/socket.c:652 
> > > > > > [inline]
> > > > > > [<b732cbd3>] sock_sendmsg+0x54/0x70 net/socket.c:671
> > > > > > [<274c57ab>] ___sys_sendmsg+0x393/0x3c0 
> > > > > > net/socket.c:2292
> > > > > > [<8252aedb>] __sys_sendmsg+0x80/0xf0 net/socket.c:2330
> > > > > > [<f7bf23d1>] __do_sys_sendmsg net/socket.c:2339 [inline]
> > > > > > [<f7bf23d1>] __se_sys_sendmsg net/socket.c:2337 [inline]
> > > > > > [<f7bf23d1>] __x64_sys_sendmsg+0x23/0x30 
> > > > > > net/socket.c:2337
> > > > > > [<a8b4131f>] do_syscall_64+0x76/0x1a0 
> > > > > > arch/x86/entry/common.c:3
> > > > > >
> > > > > > The problem was 

Re: [PATCH] Free cookie before we memdup a new one

2019-06-10 Thread Xin Long
On Tue, Jun 11, 2019 at 12:35 AM Neil Horman  wrote:
>
> Based on comments from Xin, even after fixes for our recent syzbot
> report of cookie memory leaks, its possible to get a resend of an INIT
> chunk which would lead to us leaking cookie memory.
>
> To ensure that we don't leak cookie memory, free any previously
> allocated cookie first.
Hi, Neil,

I think we should also do the same thing for the other 2 kmemdups in
sctp_process_param() (rough sketch below):
asoc->peer.peer_random
asoc->peer.peer_hmacs
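
Not a submitted patch, just a sketch of the idea (mirroring the cookie
hunk quoted below; the auth_enable checks and surrounding code are
omitted, and kfree(NULL) is a no-op so no NULL check is strictly needed):

case SCTP_PARAM_RANDOM:
	...
	/* drop any copy left from a previously processed INIT/INIT_ACK */
	kfree(asoc->peer.peer_random);
	asoc->peer.peer_random = kmemdup(param.p,
					 ntohs(param.p->length), gfp);
	if (!asoc->peer.peer_random)
		retval = 0;
	break;

case SCTP_PARAM_HMAC_ALGO:
	...
	/* same pattern for the HMAC list */
	kfree(asoc->peer.peer_hmacs);
	asoc->peer.peer_hmacs = kmemdup(param.p,
					ntohs(param.p->length), gfp);
	if (!asoc->peer.peer_hmacs)
		retval = 0;
	break;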


>
> Signed-off-by: Neil Horman 
> CC: Marcelo Ricardo Leitner 
> CC: Xin Long 
> CC: "David S. Miller" 
> CC: netdev@vger.kernel.org
> ---
>  net/sctp/sm_make_chunk.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index f17908f5c4f3..21f7faf032e5 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -2583,6 +2583,8 @@ static int sctp_process_param(struct sctp_association 
> *asoc,
> case SCTP_PARAM_STATE_COOKIE:
> asoc->peer.cookie_len =
> ntohs(param.p->length) - sizeof(struct sctp_paramhdr);
> +   if (asoc->peer.cookie)
> +   kfree(asoc->peer.cookie);
> asoc->peer.cookie = kmemdup(param.cookie->body, 
> asoc->peer.cookie_len, gfp);
> if (!asoc->peer.cookie)
> retval = 0;
> --
> 2.20.1
>


[PATCH net-next 0/4] sctp: clean up __sctp_connect function

2019-07-22 Thread Xin Long
This patchset is to factor out some common code from
sctp_sendmsg_new_asoc() and __sctp_connect() into 2
new functions.

Xin Long (4):
  sctp: check addr_size with sa_family_t size in
__sctp_setsockopt_connectx
  sctp: clean up __sctp_connect
  sctp: factor out sctp_connect_new_asoc
  sctp: factor out sctp_connect_add_peer

 net/sctp/socket.c | 377 +-
 1 file changed, 147 insertions(+), 230 deletions(-)

-- 
2.1.0



[PATCH net-next 3/4] sctp: factor out sctp_connect_new_asoc

2019-07-22 Thread Xin Long
This function, factored out from sctp_sendmsg_new_asoc() and
__sctp_connect(), creates the asoc and adds a peer with the
1st addr.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 160 ++
 1 file changed, 76 insertions(+), 84 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 49837e9..420abdb 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1044,6 +1044,73 @@ static int sctp_setsockopt_bindx(struct sock *sk,
return err;
 }
 
+static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
+const union sctp_addr *daddr,
+const struct sctp_initmsg *init,
+struct sctp_transport **tp)
+{
+   struct sctp_association *asoc;
+   struct sock *sk = ep->base.sk;
+   struct net *net = sock_net(sk);
+   enum sctp_scope scope;
+   int err;
+
+   if (sctp_endpoint_is_peeled_off(ep, daddr))
+   return -EADDRNOTAVAIL;
+
+   if (!ep->base.bind_addr.port) {
+   if (sctp_autobind(sk))
+   return -EAGAIN;
+   } else {
+   if (ep->base.bind_addr.port < inet_prot_sock(net) &&
+   !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+   return -EACCES;
+   }
+
+   scope = sctp_scope(daddr);
+   asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+   if (!asoc)
+   return -ENOMEM;
+
+   err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+   if (err < 0)
+   goto free;
+
+   *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+   if (!*tp) {
+   err = -ENOMEM;
+   goto free;
+   }
+
+   if (!init)
+   return 0;
+
+   if (init->sinit_num_ostreams) {
+   __u16 outcnt = init->sinit_num_ostreams;
+
+   asoc->c.sinit_num_ostreams = outcnt;
+   /* outcnt has been changed, need to re-init stream */
+   err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL);
+   if (err)
+   goto free;
+   }
+
+   if (init->sinit_max_instreams)
+   asoc->c.sinit_max_instreams = init->sinit_max_instreams;
+
+   if (init->sinit_max_attempts)
+   asoc->max_init_attempts = init->sinit_max_attempts;
+
+   if (init->sinit_max_init_timeo)
+   asoc->max_init_timeo =
+   msecs_to_jiffies(init->sinit_max_init_timeo);
+
+   return 0;
+free:
+   sctp_association_free(asoc);
+   return err;
+}
+
 /* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
  *
  * Common routine for handling connect() and sctp_connectx().
@@ -1056,11 +1123,9 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
struct sctp_transport *transport;
-   struct net *net = sock_net(sk);
int addrcnt, walk_size, err;
void *addr_buf = kaddrs;
union sctp_addr *daddr;
-   enum sctp_scope scope;
struct sctp_af *af;
long timeo;
 
@@ -1082,32 +1147,10 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
 : -EALREADY;
 
-   if (sctp_endpoint_is_peeled_off(ep, daddr))
-   return -EADDRNOTAVAIL;
-
-   if (!ep->base.bind_addr.port) {
-   if (sctp_autobind(sk))
-   return -EAGAIN;
-   } else {
-   if (ep->base.bind_addr.port < inet_prot_sock(net) &&
-   !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
-   return -EACCES;
-   }
-
-   scope = sctp_scope(daddr);
-   asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
-   if (!asoc)
-   return -ENOMEM;
-
-   err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
-   if (err < 0)
-   goto out_free;
-
-   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
-   if (!transport) {
-   err = -ENOMEM;
-   goto out_free;
-   }
+   err = sctp_connect_new_asoc(ep, daddr, NULL, &transport);
+   if (err)
+   return err;
+   asoc = transport->asoc;
 
addr_buf += af->sockaddr_len;
walk_size = af->sockaddr_len;
@@ -1162,7 +1205,7 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
goto out_free;
}
 
-   err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
+   err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL);
if

[PATCH net-next 2/4] sctp: clean up __sctp_connect

2019-07-22 Thread Xin Long
__sctp_connect is doing quite similar things to sctp_sendmsg_new_asoc.
To factor out common functions, this patch cleans up their code
to make them look more similar:

  1. create the asoc and add a peer with the 1st addr.
  2. add peers with the other addrs into this asoc one by one.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 211 +++---
 1 file changed, 75 insertions(+), 136 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 5f92e4a..49837e9 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1049,154 +1049,108 @@ static int sctp_setsockopt_bindx(struct sock *sk,
  * Common routine for handling connect() and sctp_connectx().
  * Connect will come in with just a single address.
  */
-static int __sctp_connect(struct sock *sk,
- struct sockaddr *kaddrs,
- int addrs_size, int flags,
- sctp_assoc_t *assoc_id)
+static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
+ int addrs_size, int flags, sctp_assoc_t *assoc_id)
 {
-   struct net *net = sock_net(sk);
-   struct sctp_sock *sp;
-   struct sctp_endpoint *ep;
-   struct sctp_association *asoc = NULL;
-   struct sctp_association *asoc2;
+   struct sctp_association *old, *asoc;
+   struct sctp_sock *sp = sctp_sk(sk);
+   struct sctp_endpoint *ep = sp->ep;
struct sctp_transport *transport;
-   union sctp_addr to;
+   struct net *net = sock_net(sk);
+   int addrcnt, walk_size, err;
+   void *addr_buf = kaddrs;
+   union sctp_addr *daddr;
enum sctp_scope scope;
+   struct sctp_af *af;
long timeo;
-   int err = 0;
-   int addrcnt = 0;
-   int walk_size = 0;
-   union sctp_addr *sa_addr = NULL;
-   void *addr_buf;
-   unsigned short port;
 
-   sp = sctp_sk(sk);
-   ep = sp->ep;
-
-   /* connect() cannot be done on a socket that is already in ESTABLISHED
-* state - UDP-style peeled off socket or a TCP-style socket that
-* is already connected.
-* It cannot be done even on a TCP-style listening socket.
-*/
if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) ||
-   (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
-   err = -EISCONN;
-   goto out_free;
+   (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)))
+   return -EISCONN;
+
+   daddr = addr_buf;
+   af = sctp_get_af_specific(daddr->sa.sa_family);
+   if (!af || af->sockaddr_len > addrs_size)
+   return -EINVAL;
+
+   err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
+   if (err)
+   return err;
+
+   asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
+   if (asoc)
+   return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
+: -EALREADY;
+
+   if (sctp_endpoint_is_peeled_off(ep, daddr))
+   return -EADDRNOTAVAIL;
+
+   if (!ep->base.bind_addr.port) {
+   if (sctp_autobind(sk))
+   return -EAGAIN;
+   } else {
+   if (ep->base.bind_addr.port < inet_prot_sock(net) &&
+   !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+   return -EACCES;
}
 
-   /* Walk through the addrs buffer and count the number of addresses. */
-   addr_buf = kaddrs;
-   while (walk_size < addrs_size) {
-   struct sctp_af *af;
+   scope = sctp_scope(daddr);
+   asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+   if (!asoc)
+   return -ENOMEM;
 
-   if (walk_size + sizeof(sa_family_t) > addrs_size) {
-   err = -EINVAL;
-   goto out_free;
-   }
+   err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+   if (err < 0)
+   goto out_free;
 
-   sa_addr = addr_buf;
-   af = sctp_get_af_specific(sa_addr->sa.sa_family);
+   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+   if (!transport) {
+   err = -ENOMEM;
+   goto out_free;
+   }
 
-   /* If the address family is not supported or if this address
-* causes the address buffer to overflow return EINVAL.
-*/
-   if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
-   err = -EINVAL;
+   addr_buf += af->sockaddr_len;
+   walk_size = af->sockaddr_len;
+   addrcnt = 1;
+   while (walk_size < addrs_size) {
+   err = -EINVAL;
+   if (walk_size + sizeof(sa_family_t) > addrs_size)
  

[PATCH net-next 1/4] sctp: check addr_size with sa_family_t size in __sctp_setsockopt_connectx

2019-07-22 Thread Xin Long
Now __sctp_connect() is called by __sctp_setsockopt_connectx() and
sctp_inet_connect(); the latter already checks addr_size against the
size of sa_family_t.

In the next patch, which cleans up __sctp_connect(), we will remove
the addr_size check against the size of sa_family_t from __sctp_connect()
for the 1st address.

So before doing that, __sctp_setsockopt_connectx() should do
this check first, as sctp_inet_connect() does.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aa80cda..5f92e4a 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1311,7 +1311,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
 __func__, sk, addrs, addrs_size);
 
-   if (unlikely(addrs_size <= 0))
+   if (unlikely(addrs_size < sizeof(sa_family_t)))
return -EINVAL;
 
kaddrs = memdup_user(addrs, addrs_size);
-- 
2.1.0



[PATCH net-next 4/4] sctp: factor out sctp_connect_add_peer

2019-07-22 Thread Xin Long
This function, factored out from sctp_sendmsg_new_asoc() and
__sctp_connect(), adds a peer with each further addr into the
asoc after the asoc has been created with the 1st addr.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 76 +++
 1 file changed, 31 insertions(+), 45 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 420abdb..6584c19 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -,6 +,33 @@ static int sctp_connect_new_asoc(struct sctp_endpoint 
*ep,
return err;
 }
 
+static int sctp_connect_add_peer(struct sctp_association *asoc,
+union sctp_addr *daddr, int addr_len)
+{
+   struct sctp_endpoint *ep = asoc->ep;
+   struct sctp_association *old;
+   struct sctp_transport *t;
+   int err;
+
+   err = sctp_verify_addr(ep->base.sk, daddr, addr_len);
+   if (err)
+   return err;
+
+   old = sctp_endpoint_lookup_assoc(ep, daddr, &t);
+   if (old && old != asoc)
+   return old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
+   : -EALREADY;
+
+   if (sctp_endpoint_is_peeled_off(ep, daddr))
+   return -EADDRNOTAVAIL;
+
+   t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+   if (!t)
+   return -ENOMEM;
+
+   return 0;
+}
+
 /* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
  *
  * Common routine for handling connect() and sctp_connectx().
@@ -1119,10 +1146,10 @@ static int sctp_connect_new_asoc(struct sctp_endpoint 
*ep,
 static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
  int addrs_size, int flags, sctp_assoc_t *assoc_id)
 {
-   struct sctp_association *old, *asoc;
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
struct sctp_transport *transport;
+   struct sctp_association *asoc;
int addrcnt, walk_size, err;
void *addr_buf = kaddrs;
union sctp_addr *daddr;
@@ -1168,29 +1195,10 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
if (asoc->peer.port != ntohs(daddr->v4.sin_port))
goto out_free;
 
-   err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
+   err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len);
if (err)
goto out_free;
 
-   old = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
-   if (old && old != asoc) {
-   err = old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
-  : -EALREADY;
-   goto out_free;
-   }
-
-   if (sctp_endpoint_is_peeled_off(ep, daddr)) {
-   err = -EADDRNOTAVAIL;
-   goto out_free;
-   }
-
-   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL,
-   SCTP_UNKNOWN);
-   if (!transport) {
-   err = -ENOMEM;
-   goto out_free;
-   }
-
addr_buf  += af->sockaddr_len;
walk_size += af->sockaddr_len;
addrcnt++;
@@ -1684,8 +1692,6 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 
sflags,
 
/* sendv addr list parse */
for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
-   struct sctp_transport *transport;
-   struct sctp_association *old;
union sctp_addr _daddr;
int dlen;
 
@@ -1719,30 +1725,10 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 
sflags,
daddr->v6.sin6_port = htons(asoc->peer.port);
memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
}
-   err = sctp_verify_addr(sk, daddr, sizeof(*daddr));
-   if (err)
-   goto free;
-
-   old = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
-   if (old && old != asoc) {
-   if (old->state >= SCTP_STATE_ESTABLISHED)
-   err = -EISCONN;
-   else
-   err = -EALREADY;
-   goto free;
-   }
 
-   if (sctp_endpoint_is_peeled_off(ep, daddr)) {
-   err = -EADDRNOTAVAIL;
-   goto free;
-   }
-
-   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL,
-   SCTP_UNKNOWN);
-   if (!transport) {
-   err = -ENOMEM;

Re: [PATCH net-next 1/4] sctp: check addr_size with sa_family_t size in __sctp_setsockopt_connectx

2019-07-24 Thread Xin Long
On Tue, Jul 23, 2019 at 11:25 PM Neil Horman  wrote:
>
> On Tue, Jul 23, 2019 at 01:37:57AM +0800, Xin Long wrote:
> > Now __sctp_connect() is called by __sctp_setsockopt_connectx() and
> > sctp_inet_connect(), the latter has done addr_size check with size
> > of sa_family_t.
> >
> > In the next patch to clean up __sctp_connect(), we will remove
> > addr_size check with size of sa_family_t from __sctp_connect()
> > for the 1st address.
> >
> > So before doing that, __sctp_setsockopt_connectx() should do
> > this check first, as sctp_inet_connect() does.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  net/sctp/socket.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > index aa80cda..5f92e4a 100644
> > --- a/net/sctp/socket.c
> > +++ b/net/sctp/socket.c
> > @@ -1311,7 +1311,7 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
> >   pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
> >__func__, sk, addrs, addrs_size);
> >
> > - if (unlikely(addrs_size <= 0))
> > + if (unlikely(addrs_size < sizeof(sa_family_t)))
> I don't think this is what you want to check for here.  sa_family_t is
> an unsigned short, and addrs_size is the number of bytes in the addrs
> array.  The addrs array should be at least the size of one struct
> sockaddr (16 bytes iirc), and, if larger, should be a multiple of
> sizeof(struct sockaddr)
sizeof(struct sockaddr) is not the right value to check either.

The proper check will be done later in __sctp_connect():

af = sctp_get_af_specific(daddr->sa.sa_family);
if (!af || af->sockaddr_len > addrs_size)
return -EINVAL;

So the check 'addrs_size < sizeof(sa_family_t)' in this patch is
just to make sure daddr->sa.sa_family is accessible. The same
check is also done in sctp_inet_connect().
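
(For illustration only, a userspace sketch assuming the lksctp-tools
sctp_connectx() wrapper, with made-up port/addresses: it shows how a
packed buffer of mixed-size addresses reaches this code, and why
sizeof(struct sockaddr) is not a usable unit for addrs_size either,
since a sockaddr_in is 16 bytes while a sockaddr_in6 is 28.)

#include <string.h>
#include <netinet/in.h>
#include <netinet/sctp.h>	/* lksctp-tools: sctp_connectx() */

static int connect_two(int sd)
{
	struct sockaddr_in v4 = { .sin_family = AF_INET,
				  .sin_port = htons(5000) };
	struct sockaddr_in6 v6 = { .sin6_family = AF_INET6,
				   .sin6_port = htons(5000),
				   .sin6_addr = IN6ADDR_LOOPBACK_INIT };
	char buf[sizeof(v4) + sizeof(v6)];	/* 16 + 28 bytes */

	v4.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	/* addresses are packed back to back, each with its own length */
	memcpy(buf, &v4, sizeof(v4));
	memcpy(buf + sizeof(v4), &v6, sizeof(v6));

	/* the library passes the packed buffer and its total byte length
	 * (addrs_size on the kernel side) down via the connectx setsockopt
	 */
	return sctp_connectx(sd, (struct sockaddr *)buf, 2, NULL);
}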

>
> Neil
>
> >   return -EINVAL;
> >
> >   kaddrs = memdup_user(addrs, addrs_size);
> > --
> > 2.1.0
> >
> >


Re: [PATCH] net: sctp: fix memory leak in sctp_send_reset_streams

2019-07-24 Thread Xin Long
On Sun, Jun 2, 2019 at 9:36 PM Xin Long  wrote:
>
> On Sun, Jun 2, 2019 at 6:52 PM Neil Horman  wrote:
> >
> > On Sun, Jun 02, 2019 at 11:44:29AM +0800, Hillf Danton wrote:
> > >
> > > syzbot found the following crash on:
> > >
> > > HEAD commit:036e3431 Merge 
> > > git://git.kernel.org/pub/scm/linux/kernel/g..
> > > git tree:   upstream
> > > console output: https://syzkaller.appspot.com/x/log.txt?x=153cff12a0
> > > kernel config:  https://syzkaller.appspot.com/x/.config?x=8f0f63a62bb5b13c
> > > dashboard link: 
> > > https://syzkaller.appspot.com/bug?extid=6ad9c3bd0a218a2ab41d
> > > compiler:   gcc (GCC) 9.0.0 20181231 (experimental)
> > > syz repro:  https://syzkaller.appspot.com/x/repro.syz?x=12561c86a0
> > > C reproducer:   https://syzkaller.appspot.com/x/repro.c?x=15b76fd8a0
> > >
> > > executing program
> > > executing program
> > > executing program
> > > executing program
> > > executing program
> > > BUG: memory leak
> > > unreferenced object 0x888123894820 (size 32):
> > >   comm "syz-executor045", pid 7267, jiffies 4294943559 (age 13.660s)
> > >   hex dump (first 32 bytes):
> > > 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> > > 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00  
> > >   backtrace:
> > > [<c7e71c69>] kmemleak_alloc_recursive
> > > include/linux/kmemleak.h:55 [inline]
> > > [<c7e71c69>] slab_post_alloc_hook mm/slab.h:439 [inline]
> > > [<c7e71c69>] slab_alloc mm/slab.c:3326 [inline]
> > > [<c7e71c69>] __do_kmalloc mm/slab.c:3658 [inline]
> > > [<c7e71c69>] __kmalloc+0x161/0x2c0 mm/slab.c:3669
> > > [<3250ed8e>] kmalloc_array include/linux/slab.h:670 [inline]
> > > [<3250ed8e>] kcalloc include/linux/slab.h:681 [inline]
> > > [<3250ed8e>] sctp_send_reset_streams+0x1ab/0x5a0 
> > > net/sctp/stream.c:302
> > > [<cd899c6e>] sctp_setsockopt_reset_streams 
> > > net/sctp/socket.c:4314 [inline]
> > > [<cd899c6e>] sctp_setsockopt net/sctp/socket.c:4765 [inline]
> > > [<cd899c6e>] sctp_setsockopt+0xc23/0x2bf0 
> > > net/sctp/socket.c:4608
> > > [<ff3a21a2>] sock_common_setsockopt+0x38/0x50 
> > > net/core/sock.c:3130
> > > [<9eb87ae7>] __sys_setsockopt+0x98/0x120 net/socket.c:2078
> > > [<e0ede6ca>] __do_sys_setsockopt net/socket.c:2089 [inline]
> > > [<e0ede6ca>] __se_sys_setsockopt net/socket.c:2086 [inline]
> > > [<e0ede6ca>] __x64_sys_setsockopt+0x26/0x30 net/socket.c:2086
> > > [<c61155f5>] do_syscall_64+0x76/0x1a0 
> > > arch/x86/entry/common.c:301
> > > [<e540958c>] entry_SYSCALL_64_after_hwframe+0x44/0xa9
> > >
> > >
> > > It was introduced in commit d570a59c5b5f ("sctp: only allow the out stream
> > > reset when the stream outq is empty"), in order to check stream outqs
> > > before sending SCTP_STRRESET_IN_PROGRESS back to the peer of the stream.
> > > EAGAIN is returned, however, without the nstr_list slab being released,
> > > if any outq is found to be non-empty.
> > >
> > > Freeing the slab in question before bailing out fixes it.
> > >
> > > Fixes: d570a59c5b5f ("sctp: only allow the out stream reset when the stream outq is empty")
> > > Reported-by: syzbot 
> > > 
> > > Reported-by: Marcelo Ricardo Leitner 
> > > Tested-by: Marcelo Ricardo Leitner 
> > > Cc: Xin Long 
> > > Cc: Neil Horman 
> > > Cc: Vlad Yasevich 
> > > Cc: Eric Dumazet 
> > > Signed-off-by: Hillf Danton 
> > > ---
> > > net/sctp/stream.c | 1 +
> > > 1 file changed, 1 insertion(+)
> > >
> > > diff --git a/net/sctp/stream.c b/net/sctp/stream.c
> > > index 93ed078..d3e2f03 100644
> > > --- a/net/sctp/stream.c
> > > +++ b/net/sctp/stream.c
> > > @@ -310,6 +310,7 @@ int sctp_send_reset_streams(struct sctp_association 
> > > *asoc,
> > >
> > >   if (out && !sctp_stream_outq_is_empty(stream, str_nums, nstr_list)) 
> > > {
> > >   retval = -EAGAIN;
> > > + kfree(nstr_list);
> > >   goto out;
> > >   }
> > >
> > > --
> > >
> > >
> > Acked-by: Neil Horman 
> Reviewed-by: Xin Long 
This fix hasn't been applied yet; please resend it with:
to = network dev 
cc = da...@davemloft.net
to = linux-s...@vger.kernel.org
cc = Neil Horman 
cc = Marcelo Ricardo Leitner 


Re: [PATCH net-next 1/4] sctp: check addr_size with sa_family_t size in __sctp_setsockopt_connectx

2019-07-26 Thread Xin Long
On Thu, Jul 25, 2019 at 4:44 AM Neil Horman  wrote:
>
> On Wed, Jul 24, 2019 at 04:12:43PM -0300, Marcelo Ricardo Leitner wrote:
> > On Wed, Jul 24, 2019 at 04:05:43PM -0300, Marcelo Ricardo Leitner wrote:
> > > On Wed, Jul 24, 2019 at 02:44:56PM -0400, Neil Horman wrote:
> > > > On Wed, Jul 24, 2019 at 09:49:07AM -0300, Marcelo Ricardo Leitner wrote:
> > > > > On Wed, Jul 24, 2019 at 09:36:50AM -0300, Marcelo Ricardo Leitner 
> > > > > wrote:
> > > > > > On Wed, Jul 24, 2019 at 07:22:35AM -0400, Neil Horman wrote:
> > > > > > > On Wed, Jul 24, 2019 at 03:21:12PM +0800, Xin Long wrote:
> > > > > > > > On Tue, Jul 23, 2019 at 11:25 PM Neil Horman 
> > > > > > > >  wrote:
> > > > > > > > >
> > > > > > > > > On Tue, Jul 23, 2019 at 01:37:57AM +0800, Xin Long wrote:
> > > > > > > > > > Now __sctp_connect() is called by 
> > > > > > > > > > __sctp_setsockopt_connectx() and
> > > > > > > > > > sctp_inet_connect(), the latter has done addr_size check 
> > > > > > > > > > with size
> > > > > > > > > > of sa_family_t.
> > > > > > > > > >
> > > > > > > > > > In the next patch to clean up __sctp_connect(), we will 
> > > > > > > > > > remove
> > > > > > > > > > addr_size check with size of sa_family_t from 
> > > > > > > > > > __sctp_connect()
> > > > > > > > > > for the 1st address.
> > > > > > > > > >
> > > > > > > > > > So before doing that, __sctp_setsockopt_connectx() should do
> > > > > > > > > > this check first, as sctp_inet_connect() does.
> > > > > > > > > >
> > > > > > > > > > Signed-off-by: Xin Long 
> > > > > > > > > > ---
> > > > > > > > > >  net/sctp/socket.c | 2 +-
> > > > > > > > > >  1 file changed, 1 insertion(+), 1 deletion(-)
> > > > > > > > > >
> > > > > > > > > > diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> > > > > > > > > > index aa80cda..5f92e4a 100644
> > > > > > > > > > --- a/net/sctp/socket.c
> > > > > > > > > > +++ b/net/sctp/socket.c
> > > > > > > > > > @@ -1311,7 +1311,7 @@ static int 
> > > > > > > > > > __sctp_setsockopt_connectx(struct sock *sk,
> > > > > > > > > >   pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
> > > > > > > > > >__func__, sk, addrs, addrs_size);
> > > > > > > > > >
> > > > > > > > > > - if (unlikely(addrs_size <= 0))
> > > > > > > > > > + if (unlikely(addrs_size < sizeof(sa_family_t)))
> > > > > > > > > I don't think this is what you want to check for here.  
> > > > > > > > > sa_family_t is
> > > > > > > > > an unsigned short, and addrs_size is the number of bytes in 
> > > > > > > > > the addrs
> > > > > > > > > array.  The addrs array should be at least the size of one 
> > > > > > > > > struct
> > > > > > > > > sockaddr (16 bytes iirc), and, if larger, should be a 
> > > > > > > > > multiple of
> > > > > > > > > sizeof(struct sockaddr)
> > > > > > > > sizeof(struct sockaddr) is not the right value to check either.
> > > > > > > >
> > > > > > > > The proper check will be done later in __sctp_connect():
> > > > > > > >
> > > > > > > > af = sctp_get_af_specific(daddr->sa.sa_family);
> > > > > > > > if (!af || af->sockaddr_len > addrs_size)
> > > > > > > > return -EINVAL;
> > > > > > > >
> > > > > > > > So the check 'addrs_size < sizeof(sa_family_t)' in this patch is
> > > > > > > > just to make sure daddr->sa.sa_family is accessible. the same
> > > > 

[PATCHv2 net-next 1/5] sctp: only copy the available addr data in sctp_transport_init

2019-07-30 Thread Xin Long
'addr' passed to sctp_transport_init is not always a whole union
sctp_addr in size, as in the path:

  sctp_sendmsg() ->
  sctp_sendmsg_new_asoc() ->
  sctp_assoc_add_peer() ->
  sctp_transport_new() -> sctp_transport_init()

In the next patches, we will also pass only the length of the available
address data to sctp_assoc_add_peer().

So sctp_transport_init() should copy only the available data from addr
to peer->ipaddr, instead of doing 'peer->ipaddr = *addr', which may
cause a slab-out-of-bounds access.

Signed-off-by: Xin Long 
---
 net/sctp/transport.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index e2f8e36..7235a60 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -43,8 +43,8 @@ static struct sctp_transport *sctp_transport_init(struct net 
*net,
  gfp_t gfp)
 {
/* Copy in the address.  */
-   peer->ipaddr = *addr;
peer->af_specific = sctp_get_af_specific(addr->sa.sa_family);
+   memcpy(&peer->ipaddr, addr, peer->af_specific->sockaddr_len);
memset(&peer->saddr, 0, sizeof(union sctp_addr));
 
peer->sack_generation = 0;
-- 
2.1.0



[PATCHv2 net-next 0/5] sctp: clean up __sctp_connect function

2019-07-30 Thread Xin Long
This patchset is to factor out some common code from
sctp_sendmsg_new_asoc() and __sctp_connect() into 2
new functions.

v1->v2:
  - add the patch 1/5 to avoid a slab-out-of-bounds warning.
  - add some code comment for the check change in patch 2/5.
  - remove unused 'addrcnt' as Marcelo noticed in patch 3/5.

Xin Long (5):
  sctp: only copy the available addr data in sctp_transport_init
  sctp: check addr_size with sa_family_t size in
__sctp_setsockopt_connectx
  sctp: clean up __sctp_connect
  sctp: factor out sctp_connect_new_asoc
  sctp: factor out sctp_connect_add_peer

 net/sctp/socket.c| 376 ---
 net/sctp/transport.c |   2 +-
 2 files changed, 147 insertions(+), 231 deletions(-)

-- 
2.1.0



[PATCHv2 net-next 2/5] sctp: check addr_size with sa_family_t size in __sctp_setsockopt_connectx

2019-07-30 Thread Xin Long
Now __sctp_connect() is called by __sctp_setsockopt_connectx() and
sctp_inet_connect(); the latter already checks addr_size against the
size of sa_family_t.

In the next patch, which cleans up __sctp_connect(), we will remove
the addr_size check against the size of sa_family_t from __sctp_connect()
for the 1st address.

So before doing that, __sctp_setsockopt_connectx() should do
this check first, as sctp_inet_connect() does.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index aa80cda..e9c5b39 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1311,7 +1311,8 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
 __func__, sk, addrs, addrs_size);
 
-   if (unlikely(addrs_size <= 0))
+   /* make sure the 1st addr's sa_family is accessible later */
+   if (unlikely(addrs_size < sizeof(sa_family_t)))
return -EINVAL;
 
kaddrs = memdup_user(addrs, addrs_size);
-- 
2.1.0



[PATCHv2 net-next 3/5] sctp: clean up __sctp_connect

2019-07-30 Thread Xin Long
__sctp_connect is doing quite similar things to sctp_sendmsg_new_asoc.
To factor out common functions, this patch cleans up their code
to make them look more similar:

  1. create the asoc and add a peer with the 1st addr.
  2. add peers with the other addrs into this asoc one by one.

While at it, also remove the unused 'addrcnt'.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 209 +++---
 1 file changed, 73 insertions(+), 136 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index e9c5b39..b9804e5 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1049,153 +1049,105 @@ static int sctp_setsockopt_bindx(struct sock *sk,
  * Common routine for handling connect() and sctp_connectx().
  * Connect will come in with just a single address.
  */
-static int __sctp_connect(struct sock *sk,
- struct sockaddr *kaddrs,
- int addrs_size, int flags,
- sctp_assoc_t *assoc_id)
+static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
+ int addrs_size, int flags, sctp_assoc_t *assoc_id)
 {
-   struct net *net = sock_net(sk);
-   struct sctp_sock *sp;
-   struct sctp_endpoint *ep;
-   struct sctp_association *asoc = NULL;
-   struct sctp_association *asoc2;
+   struct sctp_association *old, *asoc;
+   struct sctp_sock *sp = sctp_sk(sk);
+   struct sctp_endpoint *ep = sp->ep;
struct sctp_transport *transport;
-   union sctp_addr to;
+   struct net *net = sock_net(sk);
+   void *addr_buf = kaddrs;
+   union sctp_addr *daddr;
enum sctp_scope scope;
+   struct sctp_af *af;
+   int walk_size, err;
long timeo;
-   int err = 0;
-   int addrcnt = 0;
-   int walk_size = 0;
-   union sctp_addr *sa_addr = NULL;
-   void *addr_buf;
-   unsigned short port;
 
-   sp = sctp_sk(sk);
-   ep = sp->ep;
-
-   /* connect() cannot be done on a socket that is already in ESTABLISHED
-* state - UDP-style peeled off socket or a TCP-style socket that
-* is already connected.
-* It cannot be done even on a TCP-style listening socket.
-*/
if (sctp_sstate(sk, ESTABLISHED) || sctp_sstate(sk, CLOSING) ||
-   (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) {
-   err = -EISCONN;
-   goto out_free;
+   (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)))
+   return -EISCONN;
+
+   daddr = addr_buf;
+   af = sctp_get_af_specific(daddr->sa.sa_family);
+   if (!af || af->sockaddr_len > addrs_size)
+   return -EINVAL;
+
+   err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
+   if (err)
+   return err;
+
+   asoc = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
+   if (asoc)
+   return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
+: -EALREADY;
+
+   if (sctp_endpoint_is_peeled_off(ep, daddr))
+   return -EADDRNOTAVAIL;
+
+   if (!ep->base.bind_addr.port) {
+   if (sctp_autobind(sk))
+   return -EAGAIN;
+   } else {
+   if (ep->base.bind_addr.port < inet_prot_sock(net) &&
+   !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+   return -EACCES;
}
 
-   /* Walk through the addrs buffer and count the number of addresses. */
-   addr_buf = kaddrs;
-   while (walk_size < addrs_size) {
-   struct sctp_af *af;
+   scope = sctp_scope(daddr);
+   asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+   if (!asoc)
+   return -ENOMEM;
 
-   if (walk_size + sizeof(sa_family_t) > addrs_size) {
-   err = -EINVAL;
-   goto out_free;
-   }
+   err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+   if (err < 0)
+   goto out_free;
 
-   sa_addr = addr_buf;
-   af = sctp_get_af_specific(sa_addr->sa.sa_family);
+   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+   if (!transport) {
+   err = -ENOMEM;
+   goto out_free;
+   }
 
-   /* If the address family is not supported or if this address
-* causes the address buffer to overflow return EINVAL.
-*/
-   if (!af || (walk_size + af->sockaddr_len) > addrs_size) {
-   err = -EINVAL;
+   addr_buf += af->sockaddr_len;
+   walk_size = af->sockaddr_len;
+   while (walk_size < addrs_size) {
+   err = -EINVAL;
+   if (walk_size + sizeof(sa_family_t) > addrs_s

[PATCHv2 net-next 5/5] sctp: factor out sctp_connect_add_peer

2019-07-30 Thread Xin Long
This function, factored out from sctp_sendmsg_new_asoc() and
__sctp_connect(), adds a peer with each further addr into the
asoc after the asoc has been created with the 1st addr.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 76 +++
 1 file changed, 31 insertions(+), 45 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 6f77853..2f7e88c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -,6 +,33 @@ static int sctp_connect_new_asoc(struct sctp_endpoint 
*ep,
return err;
 }
 
+static int sctp_connect_add_peer(struct sctp_association *asoc,
+union sctp_addr *daddr, int addr_len)
+{
+   struct sctp_endpoint *ep = asoc->ep;
+   struct sctp_association *old;
+   struct sctp_transport *t;
+   int err;
+
+   err = sctp_verify_addr(ep->base.sk, daddr, addr_len);
+   if (err)
+   return err;
+
+   old = sctp_endpoint_lookup_assoc(ep, daddr, &t);
+   if (old && old != asoc)
+   return old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
+   : -EALREADY;
+
+   if (sctp_endpoint_is_peeled_off(ep, daddr))
+   return -EADDRNOTAVAIL;
+
+   t = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+   if (!t)
+   return -ENOMEM;
+
+   return 0;
+}
+
 /* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
  *
  * Common routine for handling connect() and sctp_connectx().
@@ -1119,10 +1146,10 @@ static int sctp_connect_new_asoc(struct sctp_endpoint 
*ep,
 static int __sctp_connect(struct sock *sk, struct sockaddr *kaddrs,
  int addrs_size, int flags, sctp_assoc_t *assoc_id)
 {
-   struct sctp_association *old, *asoc;
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
struct sctp_transport *transport;
+   struct sctp_association *asoc;
void *addr_buf = kaddrs;
union sctp_addr *daddr;
struct sctp_af *af;
@@ -1167,29 +1194,10 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
if (asoc->peer.port != ntohs(daddr->v4.sin_port))
goto out_free;
 
-   err = sctp_verify_addr(sk, daddr, af->sockaddr_len);
+   err = sctp_connect_add_peer(asoc, daddr, af->sockaddr_len);
if (err)
goto out_free;
 
-   old = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
-   if (old && old != asoc) {
-   err = old->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
-  : -EALREADY;
-   goto out_free;
-   }
-
-   if (sctp_endpoint_is_peeled_off(ep, daddr)) {
-   err = -EADDRNOTAVAIL;
-   goto out_free;
-   }
-
-   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL,
-   SCTP_UNKNOWN);
-   if (!transport) {
-   err = -ENOMEM;
-   goto out_free;
-   }
-
addr_buf  += af->sockaddr_len;
walk_size += af->sockaddr_len;
}
@@ -1683,8 +1691,6 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 
sflags,
 
/* sendv addr list parse */
for_each_cmsghdr(cmsg, cmsgs->addrs_msg) {
-   struct sctp_transport *transport;
-   struct sctp_association *old;
union sctp_addr _daddr;
int dlen;
 
@@ -1718,30 +1724,10 @@ static int sctp_sendmsg_new_asoc(struct sock *sk, __u16 
sflags,
daddr->v6.sin6_port = htons(asoc->peer.port);
memcpy(&daddr->v6.sin6_addr, CMSG_DATA(cmsg), dlen);
}
-   err = sctp_verify_addr(sk, daddr, sizeof(*daddr));
-   if (err)
-   goto free;
-
-   old = sctp_endpoint_lookup_assoc(ep, daddr, &transport);
-   if (old && old != asoc) {
-   if (old->state >= SCTP_STATE_ESTABLISHED)
-   err = -EISCONN;
-   else
-   err = -EALREADY;
-   goto free;
-   }
 
-   if (sctp_endpoint_is_peeled_off(ep, daddr)) {
-   err = -EADDRNOTAVAIL;
-   goto free;
-   }
-
-   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL,
-   SCTP_UNKNOWN);
-   if (!transport) {
-   err = -ENOMEM;
+   err = sctp_connect_

[PATCHv2 net-next 4/5] sctp: factor out sctp_connect_new_asoc

2019-07-30 Thread Xin Long
This function, factored out from sctp_sendmsg_new_asoc() and
__sctp_connect(), creates the asoc and adds a peer with the
1st addr.

Signed-off-by: Xin Long 
---
 net/sctp/socket.c | 160 ++
 1 file changed, 76 insertions(+), 84 deletions(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index b9804e5..6f77853 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -1044,6 +1044,73 @@ static int sctp_setsockopt_bindx(struct sock *sk,
return err;
 }
 
+static int sctp_connect_new_asoc(struct sctp_endpoint *ep,
+const union sctp_addr *daddr,
+const struct sctp_initmsg *init,
+struct sctp_transport **tp)
+{
+   struct sctp_association *asoc;
+   struct sock *sk = ep->base.sk;
+   struct net *net = sock_net(sk);
+   enum sctp_scope scope;
+   int err;
+
+   if (sctp_endpoint_is_peeled_off(ep, daddr))
+   return -EADDRNOTAVAIL;
+
+   if (!ep->base.bind_addr.port) {
+   if (sctp_autobind(sk))
+   return -EAGAIN;
+   } else {
+   if (ep->base.bind_addr.port < inet_prot_sock(net) &&
+   !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
+   return -EACCES;
+   }
+
+   scope = sctp_scope(daddr);
+   asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
+   if (!asoc)
+   return -ENOMEM;
+
+   err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
+   if (err < 0)
+   goto free;
+
+   *tp = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
+   if (!*tp) {
+   err = -ENOMEM;
+   goto free;
+   }
+
+   if (!init)
+   return 0;
+
+   if (init->sinit_num_ostreams) {
+   __u16 outcnt = init->sinit_num_ostreams;
+
+   asoc->c.sinit_num_ostreams = outcnt;
+   /* outcnt has been changed, need to re-init stream */
+   err = sctp_stream_init(&asoc->stream, outcnt, 0, GFP_KERNEL);
+   if (err)
+   goto free;
+   }
+
+   if (init->sinit_max_instreams)
+   asoc->c.sinit_max_instreams = init->sinit_max_instreams;
+
+   if (init->sinit_max_attempts)
+   asoc->max_init_attempts = init->sinit_max_attempts;
+
+   if (init->sinit_max_init_timeo)
+   asoc->max_init_timeo =
+   msecs_to_jiffies(init->sinit_max_init_timeo);
+
+   return 0;
+free:
+   sctp_association_free(asoc);
+   return err;
+}
+
 /* __sctp_connect(struct sock* sk, struct sockaddr *kaddrs, int addrs_size)
  *
  * Common routine for handling connect() and sctp_connectx().
@@ -1056,10 +1123,8 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
struct sctp_transport *transport;
-   struct net *net = sock_net(sk);
void *addr_buf = kaddrs;
union sctp_addr *daddr;
-   enum sctp_scope scope;
struct sctp_af *af;
int walk_size, err;
long timeo;
@@ -1082,32 +1147,10 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
return asoc->state >= SCTP_STATE_ESTABLISHED ? -EISCONN
 : -EALREADY;
 
-   if (sctp_endpoint_is_peeled_off(ep, daddr))
-   return -EADDRNOTAVAIL;
-
-   if (!ep->base.bind_addr.port) {
-   if (sctp_autobind(sk))
-   return -EAGAIN;
-   } else {
-   if (ep->base.bind_addr.port < inet_prot_sock(net) &&
-   !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
-   return -EACCES;
-   }
-
-   scope = sctp_scope(daddr);
-   asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL);
-   if (!asoc)
-   return -ENOMEM;
-
-   err = sctp_assoc_set_bind_addr_from_ep(asoc, scope, GFP_KERNEL);
-   if (err < 0)
-   goto out_free;
-
-   transport = sctp_assoc_add_peer(asoc, daddr, GFP_KERNEL, SCTP_UNKNOWN);
-   if (!transport) {
-   err = -ENOMEM;
-   goto out_free;
-   }
+   err = sctp_connect_new_asoc(ep, daddr, NULL, &transport);
+   if (err)
+   return err;
+   asoc = transport->asoc;
 
addr_buf += af->sockaddr_len;
walk_size = af->sockaddr_len;
@@ -1160,7 +1203,7 @@ static int __sctp_connect(struct sock *sk, struct 
sockaddr *kaddrs,
goto out_free;
}
 
-   err = sctp_primitive_ASSOCIATE(net, asoc, NULL);
+   err = sctp_primitive_ASSOCIATE(sock_net(sk), asoc, NULL);
if (err &

Re: [net 1/1] tipc: fix unitilized skb list crash

2019-07-30 Thread Xin Long
> --- a/net/tipc/socket.c
> +++ b/net/tipc/socket.c
> @@ -485,9 +485,8 @@ static int tipc_sk_create(struct net *net, struct socket
> *sock,
>   tsk_set_unreturnable(tsk, true);
>   if (sock->type == SOCK_DGRAM)
>   tsk_set_unreliable(tsk, true);
> - __skb_queue_head_init(&tsk->mc_method.deferredq);
>   }
> -
> + __skb_queue_head_init(&tsk->mc_method.deferredq);
>   trace_tipc_sk_create(sk, NULL, TIPC_DUMP_NONE, " ");
>   return 0;
>  }
> --
> 2.1.4
> 
> 
Reviewed-by: Xin Long 


[PATCH net] sctp: fix the transport error_count check

2019-08-12 Thread Xin Long
As the annotation says in sctp_do_8_2_transport_strike():

  "If the transport error count is greater than the pf_retrans
   threshold, and less than pathmaxrtx ..."

It should be transport->error_count that is checked against pathmaxrxt,
not asoc->pf_retrans.

Fixes: 5aa93bcf66f4 ("sctp: Implement quick failover draft from tsvwg")
Signed-off-by: Xin Long 
---
 net/sctp/sm_sideeffect.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index a554d6d..1cf5bb5 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -546,7 +546,7 @@ static void sctp_do_8_2_transport_strike(struct 
sctp_cmd_seq *commands,
 */
if (net->sctp.pf_enable &&
   (transport->state == SCTP_ACTIVE) &&
-  (asoc->pf_retrans < transport->pathmaxrxt) &&
+  (transport->error_count < transport->pathmaxrxt) &&
   (transport->error_count > asoc->pf_retrans)) {
 
sctp_assoc_control_transport(asoc, transport,
-- 
2.1.0



Re: [Patch net] cls_flower: call nla_ok() before nla_next()

2021-01-12 Thread Xin Long
On Wed, Jan 13, 2021 at 1:43 AM Cong Wang  wrote:
>
> On Tue, Jan 12, 2021 at 3:52 AM Xin Long  wrote:
> >
> > On Tue, Jan 12, 2021 at 10:56 AM Cong Wang  wrote:
> > >
> > > From: Cong Wang 
> > >
> > > fl_set_enc_opt() simply checks if there are still bytes left to parse,
> > > but this is not sufficient as syzbot seems to be able to generate
> > > malformatted netlink messages. nla_ok() is more strict so should be
> > > used to validate the next nlattr here.
> > >
> > > And nla_validate_nested_deprecated() has a less strict check too, it is
> > > probably too late to switch to the strict version, but we can just
> > > call nla_ok() too after it.
> > >
> > > Reported-and-tested-by: 
> > > syzbot+2624e3778b18fc497...@syzkaller.appspotmail.com
> > > Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options")
> > > Fixes: 79b1011cb33d ("net: sched: allow flower to match erspan options")
> > > Cc: Pieter Jansen van Vuuren 
> > > Cc: Jamal Hadi Salim 
> > > Cc: Xin Long 
> > > Cc: Jiri Pirko 
> > > Signed-off-by: Cong Wang 
> > > ---
> > >  net/sched/cls_flower.c | 8 +---
> > >  1 file changed, 5 insertions(+), 3 deletions(-)
> > >
> > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> > > index 1319986693fc..e265c443536e 100644
> > > --- a/net/sched/cls_flower.c
> > > +++ b/net/sched/cls_flower.c
> > > @@ -1272,6 +1272,8 @@ static int fl_set_enc_opt(struct nlattr **tb, 
> > > struct fl_flow_key *key,
> > >
> > > nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
> > > msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
> > > +   if (!nla_ok(nla_opt_msk, msk_depth))
> > > +   return -EINVAL;
> > > }
> > I think it's better to also add  NL_SET_ERR_MSG(extack, );
> > for this error return, like all the other places in this function.
>
> I think the ext message is primarily for end users, who usually cannot
> generate malformed netlink messages.
>
> On the other hand, the nla_validate_nested_deprecated() right above
> the quoted code does not set ext message either.
nla_validate_nested_deprecated(..., extack) already does it internally
when it returns an error, no?
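
For reference, a minimal sketch (kernel context, illustrative names only,
not the cls_flower code) of how nla_ok()/nla_next() are meant to be paired
when walking nested attributes by hand: validate the current attribute
against the remaining length before touching it, then advance.

#include <net/netlink.h>

/* Hand-rolled walk over a nested attribute blob of 'remaining' bytes. */
static int example_walk_opts(struct nlattr *attr, int remaining)
{
	while (remaining > 0) {
		/* Reject truncated or malformed attributes before parsing. */
		if (!nla_ok(attr, remaining))
			return -EINVAL;
		/* ... handle nla_type(attr) / nla_data(attr) here ... */
		attr = nla_next(attr, &remaining);
	}
	return 0;
}

This is the same pairing nla_for_each_attr() uses internally.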


Re: [PATCHv2 net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2021-01-13 Thread Xin Long
On Wed, Jan 13, 2021 at 10:11 AM Alexander Duyck
 wrote:
>
> On Mon, Jan 11, 2021 at 9:14 PM Xin Long  wrote:
> >
> > On Tue, Jan 12, 2021 at 12:48 AM Alexander Duyck
> >  wrote:
> > >
> > > On Mon, Jan 11, 2021 at 5:22 AM Xin Long  wrote:
> > > >
> > > > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > > > by removing CRC flag from the dev features in gre_gso_segment() for
> > > > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > > > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> > > >
> > > > It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
> > > > after that commit, so it would do checksum with gso_make_checksum()
> > > > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > > > gre csum for sctp over gre tunnels") can be reverted now.
> > > >
> > > > Note that the current HWs like igb NIC can only handle the SCTP CRC
> > > > when it's in the outer packet, not in the inner packet like in this
> > > > case, so here it removes CRC flag from the dev features even when
> > > > need_csum is false.
> > >
> > > So the limitation in igb is not the hardware but the driver
> > > configuration. When I had coded things up I put in a limitation on the
> > > igb_tx_csum code that it would have to validate that the protocol we
> > > are requesting an SCTP CRC offload since it is a different calculation
> > > than a 1's complement checksum. Since igb doesn't support tunnels we
> > > limited that check to the outer headers.
> > Ah.. I see, thanks.
> > >
> > > We could probably enable this for tunnels as long as the tunnel isn't
> > > requesting an outer checksum offload from the driver.
> > I think in igb_tx_csum(), checking skb->csum_not_inet would be enough
> > to validate that it is an SCTP request:
> > -   if (((first->protocol == htons(ETH_P_IP)) &&
> > -(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
> > -   ((first->protocol == htons(ETH_P_IPV6)) &&
> > -igb_ipv6_csum_is_sctp(skb))) {
> > +   if (skb->csum_not_inet) {
> > type_tucmd = E1000_ADVTXD_TUCMD_L4T_SCTP;
> > break;
> > }
> >
>
> So if I may ask. Why go with something like csum_not_inet instead of
> specifying something like crc32_csum? I'm just wondering if there are
> any other non-1's complement checksums that we are dealing with?
I don't think there is, here is the thread of that patch:

https://lore.kernel.org/netdev/CALx6S36rem=OuN_At6qYA=se5cpuym1n2r8efoaszvo8b8t...@mail.gmail.com/

I'm working on the GRE checksum offload, and trying to change
csum_not_inet:1 to csum_type:2; with the change below, no bit hole occurs:

-   __u8csum_not_inet:1;
-   __u8dst_pending_confirm:1;
+   __u8csum_type:2;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8ndisc_nodetype:2;
 #endif
+   __u8dst_pending_confirm:1;

and in skb_csum_hwoffload_help():
 int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
 {
-   if (unlikely(skb->csum_not_inet))
-   return !!(features & NETIF_F_SCTP_CRC) ? 0 :
-   skb_crc32c_csum_help(skb);
+   if (likely(!skb->csum_type))
+   return !!(features & NETIF_F_CSUM_MASK) ? 0 :
skb_checksum_help(skb);
+
+   if (skb->csum_type == CSUM_T_GENERIC) {
+   return !!(features & NETIF_F_HW_CSUM) ? 0 :
skb_checksum_help(skb);
+   } else if (skb->csum_type == CSUM_T_SCTP_CRC) {
+   return !!(features & NETIF_F_SCTP_CRC) ? 0 :
skb_crc32c_csum_help(skb);
+   } else {
+   pr_warn("Wrong csum type: %d\n", skb->csum_type);
+   return 1;
+   }

then the driver fix will be:
case offsetof(struct sctphdr, checksum):
/* validate that this is actually an SCTP request */
-   if (((first->protocol == htons(ETH_P_IP)) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   ((first->protocol == htons(ETH_P_IPV6)) &&
-igb_ipv6_csum_is_sctp(skb))) {
+   if (skb->csum_type == CSUM_T_SCTP_CRC) {
type_tucmd = E1000_ADVTXD_TUCMD_L4T_SCTP;
break;
}

then the gre

[PATCHv2 net-next 0/2] net: enable udp v6 sockets receiving v4 packets with UDP GRO

2021-01-14 Thread Xin Long
Currently, a udp v6 socket cannot process v4 packets with UDP GRO, as
udp_encap_needed_key is not increased when udp_tunnel_encap_enable()
is called for a v6 socket. This patchset is to increase it and remove
the unnecessary code in bareudp.

v1->v2:
  - see Patch 1/2.

Xin Long (2):
  udp: call udp_encap_enable for v6 sockets when enabling encap
  Revert "bareudp: Fixed bareudp receive handling"

 drivers/net/bareudp.c| 6 --
 include/net/udp.h| 1 +
 include/net/udp_tunnel.h | 3 +--
 net/ipv4/udp.c   | 6 ++
 net/ipv6/udp.c   | 4 +++-
 5 files changed, 11 insertions(+), 9 deletions(-)

-- 
2.1.0



[PATCHv2 net-next 1/2] udp: call udp_encap_enable for v6 sockets when enabling encap

2021-01-14 Thread Xin Long
When enabling encap for an ipv6 socket without increasing
udp_encap_needed_key, UDP GRO won't work for v4-mapped v6 address packets as
sk will be NULL in udp4_gro_receive().

This patch is to enable it by increasing udp_encap_needed_key for
v6 sockets in udp_tunnel_encap_enable(), and correspondingly
decreasing udp_encap_needed_key in udpv6_destroy_sock().

v1->v2:
  - add udp_encap_disable() and export it.

Signed-off-by: Xin Long 
---
 include/net/udp.h| 1 +
 include/net/udp_tunnel.h | 3 +--
 net/ipv4/udp.c   | 6 ++
 net/ipv6/udp.c   | 4 +++-
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 877832b..1e7b6cd 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -467,6 +467,7 @@ void udp_init(void);
 
 DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void);
+void udp_encap_disable(void);
 #if IS_ENABLED(CONFIG_IPV6)
 DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void);
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 282d10e..afc7ce7 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -181,9 +181,8 @@ static inline void udp_tunnel_encap_enable(struct socket 
*sock)
 #if IS_ENABLED(CONFIG_IPV6)
if (sock->sk->sk_family == PF_INET6)
ipv6_stub->udpv6_encap_enable();
-   else
 #endif
-   udp_encap_enable();
+   udp_encap_enable();
 }
 
 #define UDP_TUNNEL_NIC_MAX_TABLES  4
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7103b0a..28bfe60 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -596,6 +596,12 @@ void udp_encap_enable(void)
 }
 EXPORT_SYMBOL(udp_encap_enable);
 
+void udp_encap_disable(void)
+{
+   static_branch_dec(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_disable);
+
 /* Handler for tunnels with arbitrary destination ports: no socket lookup, go
  * through error handlers in encapsulations looking for a match.
  */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b9f3dfd..d754292 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1608,8 +1608,10 @@ void udpv6_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
-   if (up->encap_enabled)
+   if (up->encap_enabled) {
static_branch_dec(&udpv6_encap_needed_key);
+   udp_encap_disable();
+   }
}
 
inet6_destroy_sock(sk);
-- 
2.1.0



[PATCHv2 net-next 2/2] Revert "bareudp: Fixed bareudp receive handling"

2021-01-14 Thread Xin Long
As udp_encap_enable() is already called in udp_tunnel_encap_enable()
since the last patch, we don't need it any more. So remove it by
reverting commit 81f954a44567567c7d74a97b1db78fb43afc253d.
---
 drivers/net/bareudp.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 0965d13..57dfaf4 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -240,12 +240,6 @@ static int bareudp_socket_create(struct bareudp_dev 
*bareudp, __be16 port)
tunnel_cfg.encap_destroy = NULL;
setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg);
 
-   /* As the setup_udp_tunnel_sock does not call udp_encap_enable if the
-* socket type is v6 an explicit call to udp_encap_enable is needed.
-*/
-   if (sock->sk->sk_family == AF_INET6)
-   udp_encap_enable();
-
rcu_assign_pointer(bareudp->sock, sock);
return 0;
 }
-- 
2.1.0



[PATCHv2 net-next 0/2] net: fix the features flag in sctp_gso_segment

2021-01-15 Thread Xin Long
Patch 1/2 is to improve the code in skb_segment(), and it is needed
by Patch 2/2.

v1->v2:
  - see Patch 1/2.

Xin Long (2):
  net: move the hsize check to the else block in skb_segment
  udp: remove CRC flag from dev features in __skb_udp_tunnel_segment

 net/core/skbuff.c  | 11 ++-
 net/ipv4/udp_offload.c |  4 ++--
 2 files changed, 8 insertions(+), 7 deletions(-)

-- 
2.1.0



[PATCHv2 net-next 2/2] udp: remove CRC flag from dev features in __skb_udp_tunnel_segment

2021-01-15 Thread Xin Long
Signed-off-by: Xin Long 
---
 net/ipv4/udp_offload.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ff39e94..1168d18 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -68,8 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
  (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
 
features &= skb->dev->hw_enc_features;
-   /* CRC checksum can't be handled by HW when it's a UDP tunneling 
packet. */
-   features &= ~NETIF_F_SCTP_CRC;
+   if (need_csum)
+   features &= ~NETIF_F_SCTP_CRC;
 
/* The only checksum offload we care about from here on out is the
 * outer one so strip the existing checksum feature flags and
-- 
2.1.0



[PATCHv2 net-next 1/2] net: move the hsize check to the else block in skb_segment

2021-01-15 Thread Xin Long
After commit 89319d3801d1 ("net: Add frag_list support to skb_segment"),
it goes to process frag_list when !hsize in skb_segment(). However, when
using skb frag_list, sg normally should not be set. In this case, hsize
will be set to len right before the !hsize check, so it won't go to the
frag_list processing code.

So the right thing to do is move the hsize check to the else block, so
that it won't affect the !hsize check for frag_list processing.

v1->v2:
  - change to do "hsize <= 0" check instead of "!hsize", and also move
"hsize < 0" into else block, to save some cycles, as Alex suggested.

Signed-off-by: Xin Long 
---
 net/core/skbuff.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6039069..e835193 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3894,12 +3894,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
}
 
hsize = skb_headlen(head_skb) - offset;
-   if (hsize < 0)
-   hsize = 0;
-   if (hsize > len || !sg)
-   hsize = len;
 
-   if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+   if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
 
@@ -3942,6 +3938,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
+   if (hsize > len || !sg)
+   hsize = len;
+   else if (hsize < 0)
+   hsize = 0;
+
nskb = __alloc_skb(hsize + doffset + headroom,
   GFP_ATOMIC, 
skb_alloc_rx_flag(head_skb),
   NUMA_NO_NODE);
-- 
2.1.0



Re: [PATCHv2 net-next 0/2] net: fix the features flag in sctp_gso_segment

2021-01-15 Thread Xin Long
Please drop this patchset, the second one is incorrect.
I will post v3, thanks.

On Fri, Jan 15, 2021 at 4:21 PM Xin Long  wrote:
>
> Patch 1/2 is to improve the code in skb_segment(), and it is needed
> by Patch 2/2.
>
> v1->v2:
>   - see Patch 1/2.
>
> Xin Long (2):
>   net: move the hsize check to the else block in skb_segment
>   udp: remove CRC flag from dev features in __skb_udp_tunnel_segment
>
>  net/core/skbuff.c  | 11 ++-
>  net/ipv4/udp_offload.c |  4 ++--
>  2 files changed, 8 insertions(+), 7 deletions(-)
>
> --
> 2.1.0
>


[PATCHv3 net-next 1/2] net: move the hsize check to the else block in skb_segment

2021-01-15 Thread Xin Long
After commit 89319d3801d1 ("net: Add frag_list support to skb_segment"),
it goes to process frag_list when !hsize in skb_segment(). However, when
using skb frag_list, sg normally should not be set. In this case, hsize
will be set to len right before the !hsize check, so it won't go to the
frag_list processing code.

So the right thing to do is move the hsize check to the else block, so
that it won't affect the !hsize check for frag_list processing.

v1->v2:
  - change to do "hsize <= 0" check instead of "!hsize", and also move
"hsize < 0" into else block, to save some cycles, as Alex suggested.

Signed-off-by: Xin Long 
---
 net/core/skbuff.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 6039069..e835193 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3894,12 +3894,8 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
}
 
hsize = skb_headlen(head_skb) - offset;
-   if (hsize < 0)
-   hsize = 0;
-   if (hsize > len || !sg)
-   hsize = len;
 
-   if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
+   if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
BUG_ON(skb_headlen(list_skb) > len);
 
@@ -3942,6 +3938,11 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
+   if (hsize > len || !sg)
+   hsize = len;
+   else if (hsize < 0)
+   hsize = 0;
+
nskb = __alloc_skb(hsize + doffset + headroom,
   GFP_ATOMIC, 
skb_alloc_rx_flag(head_skb),
   NUMA_NO_NODE);
-- 
2.1.0



[PATCHv3 net-next 2/2] sctp: remove the NETIF_F_SG flag before calling skb_segment

2021-01-15 Thread Xin Long
It makes more sense to clear NETIF_F_SG instead of setting it when
calling skb_segment() in sctp_gso_segment(), since SCTP GSO is
using head_skb's fraglist, of which all frags are linear skbs.

This will make SCTP GSO code more understandable.

Suggested-by: Alexander Duyck 
Signed-off-by: Xin Long 
---
 net/sctp/offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index ce281a9..eb874e3 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
goto out;
}
 
-   segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG);
+   segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG);
if (IS_ERR(segs))
goto out;
 
-- 
2.1.0



[PATCHv3 net-next 0/2] net: fix the features flag in sctp_gso_segment

2021-01-15 Thread Xin Long
Patch 1/2 is to improve the code in skb_segment(), and it is needed
by Patch 2/2.

v1->v2:
  - see Patch 1/2.
v2->v3:
  - change Patch 2/2 to the right patch.

Xin Long (2):
  net: move the hsize check to the else block in skb_segment
  sctp: remove the NETIF_F_SG flag before calling skb_segment

 net/core/skbuff.c  | 11 ++-
 net/sctp/offload.c |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

-- 
2.1.0



[PATCH net-next 1/3] vxlan: add NETIF_F_FRAGLIST flag for dev features

2021-01-15 Thread Xin Long
Some protocols' HW GSO, like SCTP's, requires fraglist support in the
device. Without NETIF_F_FRAGLIST set in the dev features of vxlan, it
would have to do SW GSO before the packets enter the driver, even
when the vxlan dev and lower dev (like veth) both have the feature
of NETIF_F_GSO_SCTP.

So this patch is to add it for vxlan.

Signed-off-by: Xin Long 
---
 drivers/net/vxlan.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index b936443..3929e43 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -3283,12 +3283,13 @@ static void vxlan_setup(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &vxlan_type);
 
dev->features   |= NETIF_F_LLTX;
-   dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM;
+   dev->features   |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
dev->features   |= NETIF_F_RXCSUM;
dev->features   |= NETIF_F_GSO_SOFTWARE;
 
dev->vlan_features = dev->features;
-   dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+   dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
+   dev->hw_features |= NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
netif_keep_dst(dev);
dev->priv_flags |= IFF_NO_QUEUE;
-- 
2.1.0



[PATCH net-next 0/3] net: make udp tunnel devices support fraglist

2021-01-15 Thread Xin Long
Like the GRE device, UDP tunnel devices should also support fraglist, so
that HW GSO for protocols (like SCTP) that require NETIF_F_FRAGLIST
in the dev can work, especially when the lower device supports both
NETIF_F_GSO_UDP_TUNNEL and NETIF_F_GSO_SCTP.

Xin Long (3):
  vxlan: add NETIF_F_FRAGLIST flag for dev features
  geneve: add NETIF_F_FRAGLIST flag for dev features
  bareudp: add NETIF_F_FRAGLIST flag for dev features

 drivers/net/bareudp.c | 5 +++--
 drivers/net/geneve.c  | 5 +++--
 drivers/net/vxlan.c   | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

-- 
2.1.0



[PATCH net-next 2/3] geneve: add NETIF_F_FRAGLIST flag for dev features

2021-01-15 Thread Xin Long
Some protocols' HW GSO, like SCTP's, requires fraglist support in the
device. Without NETIF_F_FRAGLIST set in the dev features of geneve, it
would have to do SW GSO before the packets enter the driver, even
when the geneve dev and lower dev (like veth) both have the feature
of NETIF_F_GSO_SCTP.

So this patch is to add it for geneve.

Signed-off-by: Xin Long 
---
 drivers/net/geneve.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 6aa775d..4ac0373 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1197,11 +1197,12 @@ static void geneve_setup(struct net_device *dev)
SET_NETDEV_DEVTYPE(dev, &geneve_type);
 
dev->features|= NETIF_F_LLTX;
-   dev->features|= NETIF_F_SG | NETIF_F_HW_CSUM;
+   dev->features|= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
dev->features|= NETIF_F_RXCSUM;
dev->features|= NETIF_F_GSO_SOFTWARE;
 
-   dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+   dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
+   dev->hw_features |= NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 
/* MTU range: 68 - (something less than 65535) */
-- 
2.1.0



[PATCH net-next 3/3] bareudp: add NETIF_F_FRAGLIST flag for dev features

2021-01-15 Thread Xin Long
Like vxlan and geneve, bareudp also needs this dev feature
to support some protocol's HW GSO.

Signed-off-by: Xin Long 
---
 drivers/net/bareudp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 57dfaf4..7511bca 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -526,11 +526,12 @@ static void bareudp_setup(struct net_device *dev)
dev->netdev_ops = &bareudp_netdev_ops;
dev->needs_free_netdev = true;
SET_NETDEV_DEVTYPE(dev, &bareudp_type);
-   dev->features|= NETIF_F_SG | NETIF_F_HW_CSUM;
+   dev->features|= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
dev->features|= NETIF_F_RXCSUM;
dev->features|= NETIF_F_LLTX;
dev->features|= NETIF_F_GSO_SOFTWARE;
-   dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
+   dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST;
+   dev->hw_features |= NETIF_F_RXCSUM;
dev->hw_features |= NETIF_F_GSO_SOFTWARE;
dev->hard_header_len = 0;
dev->addr_len = 0;
-- 
2.1.0



[PATCHv3 net-next 0/2] net: enable udp v6 sockets receiving v4 packets with UDP GRO

2021-01-15 Thread Xin Long
Currently, a udp v6 socket cannot process v4 packets with UDP GRO, as
udp_encap_needed_key is not increased when udp_tunnel_encap_enable()
is called for a v6 socket. This patchset is to increase it and remove
the unnecessary code in bareudp.

v1->v2:
  - see Patch 1/2.
v2->v3:
  - see Patch 2/2.

Xin Long (2):
  udp: call udp_encap_enable for v6 sockets when enabling encap
  Revert "bareudp: Fixed bareudp receive handling"

 drivers/net/bareudp.c| 6 --
 include/net/udp.h| 1 +
 include/net/udp_tunnel.h | 3 +--
 net/ipv4/udp.c   | 6 ++
 net/ipv6/udp.c   | 4 +++-
 5 files changed, 11 insertions(+), 9 deletions(-)

-- 
2.1.0



[PATCHv3 net-next 1/2] udp: call udp_encap_enable for v6 sockets when enabling encap

2021-01-15 Thread Xin Long
When enabling encap for an ipv6 socket without increasing
udp_encap_needed_key, UDP GRO won't work for v4-mapped v6 address packets as
sk will be NULL in udp4_gro_receive().

This patch is to enable it by increasing udp_encap_needed_key for
v6 sockets in udp_tunnel_encap_enable(), and correspondingly
decreasing udp_encap_needed_key in udpv6_destroy_sock().
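
For context, a minimal sketch (illustration only with made-up names, not
the udp code itself) of the static-key pairing this relies on: each enable
bumps a refcount on the key, each disable drops it, and the fast path only
takes the encap branch while the count is non-zero. The point here is that
a v6 encap socket must bump the v4 key too, since the v4 GRO path only
checks that one before doing the socket lookup.

#include <linux/jump_label.h>

DEFINE_STATIC_KEY_FALSE(example_encap_needed_key);

static void example_encap_enable(void)
{
	static_branch_inc(&example_encap_needed_key);
}

static void example_encap_disable(void)
{
	static_branch_dec(&example_encap_needed_key);
}

static bool example_rx_needs_encap_path(void)
{
	/* Cheap check on the hot path; patched out while the count is 0. */
	return static_branch_unlikely(&example_encap_needed_key);
}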

v1->v2:
  - add udp_encap_disable() and export it.

Signed-off-by: Xin Long 
---
 include/net/udp.h| 1 +
 include/net/udp_tunnel.h | 3 +--
 net/ipv4/udp.c   | 6 ++
 net/ipv6/udp.c   | 4 +++-
 4 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/include/net/udp.h b/include/net/udp.h
index 877832b..1e7b6cd 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -467,6 +467,7 @@ void udp_init(void);
 
 DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
 void udp_encap_enable(void);
+void udp_encap_disable(void);
 #if IS_ENABLED(CONFIG_IPV6)
 DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
 void udpv6_encap_enable(void);
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 282d10e..afc7ce7 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -181,9 +181,8 @@ static inline void udp_tunnel_encap_enable(struct socket 
*sock)
 #if IS_ENABLED(CONFIG_IPV6)
if (sock->sk->sk_family == PF_INET6)
ipv6_stub->udpv6_encap_enable();
-   else
 #endif
-   udp_encap_enable();
+   udp_encap_enable();
 }
 
 #define UDP_TUNNEL_NIC_MAX_TABLES  4
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 7103b0a..28bfe60 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -596,6 +596,12 @@ void udp_encap_enable(void)
 }
 EXPORT_SYMBOL(udp_encap_enable);
 
+void udp_encap_disable(void)
+{
+   static_branch_dec(&udp_encap_needed_key);
+}
+EXPORT_SYMBOL(udp_encap_disable);
+
 /* Handler for tunnels with arbitrary destination ports: no socket lookup, go
  * through error handlers in encapsulations looking for a match.
  */
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b9f3dfd..d754292 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1608,8 +1608,10 @@ void udpv6_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
-   if (up->encap_enabled)
+   if (up->encap_enabled) {
static_branch_dec(&udpv6_encap_needed_key);
+   udp_encap_disable();
+   }
}
 
inet6_destroy_sock(sk);
-- 
2.1.0



[PATCHv3 net-next 2/2] Revert "bareudp: Fixed bareudp receive handling"

2021-01-15 Thread Xin Long
As udp_encap_enable() is already called in udp_tunnel_encap_enable()
since the last patch, we don't need it any more. So remove it by
reverting commit 81f954a44567567c7d74a97b1db78fb43afc253d.

v1->v2:
 - no change.
v2->v3:
 - add the missing signoff.

Signed-off-by: Xin Long 
---
 drivers/net/bareudp.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 0965d13..57dfaf4 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -240,12 +240,6 @@ static int bareudp_socket_create(struct bareudp_dev 
*bareudp, __be16 port)
tunnel_cfg.encap_destroy = NULL;
setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg);
 
-   /* As the setup_udp_tunnel_sock does not call udp_encap_enable if the
-* socket type is v6 an explicit call to udp_encap_enable is needed.
-*/
-   if (sock->sk->sk_family == AF_INET6)
-   udp_encap_enable();
-
rcu_assign_pointer(bareudp->sock, sock);
return 0;
 }
-- 
2.1.0



[PATCHv3 net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2021-01-15 Thread Xin Long
This patch is to let it always do CRC checksum in sctp_gso_segment()
by removing CRC flag from the dev features in gre_gso_segment() for
SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.

It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
after that commit, so it would do checksum with gso_make_checksum()
in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
gre csum for sctp over gre tunnels") can be reverted now.

Note that when need_csum is false, we can still leave CRC checksum
of SCTP to HW by not clearing this CRC flag if it's supported, as
Jakub and Alex noticed.

v1->v2:
  - improve the changelog.
  - fix "rev xmas tree" in varibles declaration.
v2->v3:
  - remove CRC flag from dev features only when need_csum is true.

Signed-off-by: Xin Long 
---
 net/ipv4/gre_offload.c | 15 ---
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e0a2465..10bc49b 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,10 +15,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
   netdev_features_t features)
 {
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-   bool need_csum, need_recompute_csum, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
+   bool need_csum, gso_partial;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
 
@@ -41,10 +41,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb->protocol = skb->inner_protocol;
 
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
-   need_recompute_csum = skb->csum_not_inet;
skb->encap_hdr_csum = need_csum;
 
features &= skb->dev->hw_enc_features;
+   if (need_csum)
+   features &= ~NETIF_F_SCTP_CRC;
 
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
@@ -99,15 +100,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
 
*(pcsum + 1) = 0;
-   if (need_recompute_csum && !skb_is_gso(skb)) {
-   __wsum csum;
-
-   csum = skb_checksum(skb, gre_offset,
-   skb->len - gre_offset, 0);
-   *pcsum = csum_fold(csum);
-   } else {
-   *pcsum = gso_make_checksum(skb, 0);
-   }
+   *pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
 out:
return segs;
-- 
2.1.0



[PATCH net-next] udp: not remove the CRC flag from dev features when need_csum is false

2021-01-15 Thread Xin Long
In __skb_udp_tunnel_segment(), when it's an SCTP over VxLAN/GENEVE
packet and need_csum is false, which means the outer udp checksum
doesn't need to be computed, csum_start and csum_offset can be
used by the inner SCTP CRC CSUM for SCTP HW CRC offload.

So this patch is to not remove the CRC flag from dev features when
need_csum is false.

Signed-off-by: Xin Long 
---
 net/ipv4/udp_offload.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index ff39e94..1168d18 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -68,8 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
  (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
 
features &= skb->dev->hw_enc_features;
-   /* CRC checksum can't be handled by HW when it's a UDP tunneling 
packet. */
-   features &= ~NETIF_F_SCTP_CRC;
+   if (need_csum)
+   features &= ~NETIF_F_SCTP_CRC;
 
/* The only checksum offload we care about from here on out is the
 * outer one so strip the existing checksum feature flags and
-- 
2.1.0



[PATCH net-next 0/6] net: support SCTP CRC csum offload for tunneling packets in some drivers

2021-01-15 Thread Xin Long
This patchset introduces the inline function skb_csum_is_sctp(), and uses it
to validate that a packet wants SCTP CRC csum offload, to make SCTP CRC csum
offload for tunneling packets supported in some HW drivers.

Xin Long (6):
  net: add inline function skb_csum_is_sctp
  net: igb: use skb_csum_is_sctp instead of protocol check
  net: igbvf: use skb_csum_is_sctp instead of protocol check
  net: igc: use skb_csum_is_sctp instead of protocol check
  net: ixgbe: use skb_csum_is_sctp instead of protocol check
  net: ixgbevf: use skb_csum_is_sctp instead of protocol check

 drivers/net/ethernet/intel/igb/igb_main.c | 14 +-
 drivers/net/ethernet/intel/igbvf/netdev.c | 14 +-
 drivers/net/ethernet/intel/igc/igc_main.c | 14 +-
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 +-
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 14 +-
 drivers/net/ethernet/pensando/ionic/ionic_txrx.c  |  2 +-
 include/linux/skbuff.h|  5 +
 net/core/dev.c|  2 +-
 8 files changed, 12 insertions(+), 67 deletions(-)

-- 
2.1.0



[PATCH net-next 1/6] net: add inline function skb_csum_is_sctp

2021-01-15 Thread Xin Long
This patch is to define an inline function skb_csum_is_sctp(), and
also replace all the places that check whether it's an SCTP CSUM skb.
This function would be used later in many networking drivers in
the following patches.

Suggested-by: Alexander Duyck 
Signed-off-by: Xin Long 
---
 drivers/net/ethernet/pensando/ionic/ionic_txrx.c | 2 +-
 include/linux/skbuff.h   | 5 +
 net/core/dev.c   | 2 +-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c 
b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
index ac4cd5d..162a1ff 100644
--- a/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
+++ b/drivers/net/ethernet/pensando/ionic/ionic_txrx.c
@@ -979,7 +979,7 @@ static int ionic_tx_calc_csum(struct ionic_queue *q, struct 
sk_buff *skb)
stats->vlan_inserted++;
}
 
-   if (skb->csum_not_inet)
+   if (skb_csum_is_sctp(skb))
stats->crc32_csum++;
else
stats->csum++;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c9568cf..46f901a 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -4621,6 +4621,11 @@ static inline void skb_reset_redirect(struct sk_buff 
*skb)
 #endif
 }
 
+static inline bool skb_csum_is_sctp(struct sk_buff *skb)
+{
+   return skb->csum_not_inet;
+}
+
 static inline void skb_set_kcov_handle(struct sk_buff *skb,
   const u64 kcov_handle)
 {
diff --git a/net/core/dev.c b/net/core/dev.c
index 0a31d4e..bbd306f 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3617,7 +3617,7 @@ static struct sk_buff *validate_xmit_vlan(struct sk_buff 
*skb,
 int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
 {
-   if (unlikely(skb->csum_not_inet))
+   if (unlikely(skb_csum_is_sctp(skb)))
return !!(features & NETIF_F_SCTP_CRC) ? 0 :
skb_crc32c_csum_help(skb);
 
-- 
2.1.0



[PATCH net-next 3/6] net: igbvf: use skb_csum_is_sctp instead of protocol check

2021-01-15 Thread Xin Long
Using skb_csum_is_sctp() is an easier way to validate that it's an SCTP CRC
checksum offload packet, and it also makes igbvf support SCTP
CRC checksum offload for UDP and GRE encapped packets, just as it
does in the igb driver.

Signed-off-by: Xin Long 
---
 drivers/net/ethernet/intel/igbvf/netdev.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/igbvf/netdev.c 
b/drivers/net/ethernet/intel/igbvf/netdev.c
index 30fdea2..fb3fbcb 100644
--- a/drivers/net/ethernet/intel/igbvf/netdev.c
+++ b/drivers/net/ethernet/intel/igbvf/netdev.c
@@ -2072,15 +2072,6 @@ static int igbvf_tso(struct igbvf_ring *tx_ring,
return 1;
 }
 
-static inline bool igbvf_ipv6_csum_is_sctp(struct sk_buff *skb)
-{
-   unsigned int offset = 0;
-
-   ipv6_find_hdr(skb, &offset, IPPROTO_SCTP, NULL, NULL);
-
-   return offset == skb_checksum_start_offset(skb);
-}
-
 static bool igbvf_tx_csum(struct igbvf_ring *tx_ring, struct sk_buff *skb,
  u32 tx_flags, __be16 protocol)
 {
@@ -2102,10 +2093,7 @@ static bool igbvf_tx_csum(struct igbvf_ring *tx_ring, 
struct sk_buff *skb,
break;
case offsetof(struct sctphdr, checksum):
/* validate that this is actually an SCTP request */
-   if (((protocol == htons(ETH_P_IP)) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   ((protocol == htons(ETH_P_IPV6)) &&
-igbvf_ipv6_csum_is_sctp(skb))) {
+   if (skb_csum_is_sctp(skb)) {
type_tucmd = E1000_ADVTXD_TUCMD_L4T_SCTP;
break;
}
-- 
2.1.0



[PATCH net-next 4/6] net: igc: use skb_csum_is_sctp instead of protocol check

2021-01-15 Thread Xin Long
Using skb_csum_is_sctp() is an easier way to validate that it's an SCTP CRC
checksum offload packet, and it also makes igc support SCTP
CRC checksum offload for UDP and GRE encapped packets, just as it
does in the igb driver.

Signed-off-by: Xin Long 
---
 drivers/net/ethernet/intel/igc/igc_main.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/igc/igc_main.c 
b/drivers/net/ethernet/intel/igc/igc_main.c
index afd6a62..43aec42 100644
--- a/drivers/net/ethernet/intel/igc/igc_main.c
+++ b/drivers/net/ethernet/intel/igc/igc_main.c
@@ -949,15 +949,6 @@ static void igc_tx_ctxtdesc(struct igc_ring *tx_ring,
}
 }
 
-static inline bool igc_ipv6_csum_is_sctp(struct sk_buff *skb)
-{
-   unsigned int offset = 0;
-
-   ipv6_find_hdr(skb, &offset, IPPROTO_SCTP, NULL, NULL);
-
-   return offset == skb_checksum_start_offset(skb);
-}
-
 static void igc_tx_csum(struct igc_ring *tx_ring, struct igc_tx_buffer *first)
 {
struct sk_buff *skb = first->skb;
@@ -980,10 +971,7 @@ static void igc_tx_csum(struct igc_ring *tx_ring, struct 
igc_tx_buffer *first)
break;
case offsetof(struct sctphdr, checksum):
/* validate that this is actually an SCTP request */
-   if ((first->protocol == htons(ETH_P_IP) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   (first->protocol == htons(ETH_P_IPV6) &&
-igc_ipv6_csum_is_sctp(skb))) {
+   if (skb_csum_is_sctp(skb)) {
type_tucmd = IGC_ADVTXD_TUCMD_L4T_SCTP;
break;
}
-- 
2.1.0



[PATCH net-next 6/6] net: ixgbevf: use skb_csum_is_sctp instead of protocol check

2021-01-15 Thread Xin Long
Using skb_csum_is_sctp() is an easier way to validate that it's an SCTP CRC
checksum offload packet, and it also makes ixgbevf support SCTP
CRC checksum offload for UDP and GRE encapped packets, just as it
does in the igb driver.

Signed-off-by: Xin Long 
---
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c 
b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
index 4061cd7..cd9d79f 100644
--- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
+++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c
@@ -3844,15 +3844,6 @@ static int ixgbevf_tso(struct ixgbevf_ring *tx_ring,
return 1;
 }
 
-static inline bool ixgbevf_ipv6_csum_is_sctp(struct sk_buff *skb)
-{
-   unsigned int offset = 0;
-
-   ipv6_find_hdr(skb, &offset, IPPROTO_SCTP, NULL, NULL);
-
-   return offset == skb_checksum_start_offset(skb);
-}
-
 static void ixgbevf_tx_csum(struct ixgbevf_ring *tx_ring,
struct ixgbevf_tx_buffer *first,
struct ixgbevf_ipsec_tx_data *itd)
@@ -3873,10 +3864,7 @@ static void ixgbevf_tx_csum(struct ixgbevf_ring *tx_ring,
break;
case offsetof(struct sctphdr, checksum):
/* validate that this is actually an SCTP request */
-   if (((first->protocol == htons(ETH_P_IP)) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   ((first->protocol == htons(ETH_P_IPV6)) &&
-ixgbevf_ipv6_csum_is_sctp(skb))) {
+   if (skb_csum_is_sctp(skb)) {
type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_SCTP;
break;
}
-- 
2.1.0



[PATCH net-next 2/6] net: igb: use skb_csum_is_sctp instead of protocol check

2021-01-15 Thread Xin Long
Using skb_csum_is_sctp() is an easier way to validate that it's an SCTP
CRC checksum offload packet, and there is no need to parse the
packet to check its proto field, especially when it's a UDP or
GRE encapped packet.

So this patch also makes igb support SCTP CRC checksum offload
for UDP and GRE encapped packets.

Signed-off-by: Xin Long 
---
 drivers/net/ethernet/intel/igb/igb_main.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
b/drivers/net/ethernet/intel/igb/igb_main.c
index 03f78fd..8757ad0 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -5959,15 +5959,6 @@ static int igb_tso(struct igb_ring *tx_ring,
return 1;
 }
 
-static inline bool igb_ipv6_csum_is_sctp(struct sk_buff *skb)
-{
-   unsigned int offset = 0;
-
-   ipv6_find_hdr(skb, &offset, IPPROTO_SCTP, NULL, NULL);
-
-   return offset == skb_checksum_start_offset(skb);
-}
-
 static void igb_tx_csum(struct igb_ring *tx_ring, struct igb_tx_buffer *first)
 {
struct sk_buff *skb = first->skb;
@@ -5990,10 +5981,7 @@ static void igb_tx_csum(struct igb_ring *tx_ring, struct 
igb_tx_buffer *first)
break;
case offsetof(struct sctphdr, checksum):
/* validate that this is actually an SCTP request */
-   if (((first->protocol == htons(ETH_P_IP)) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   ((first->protocol == htons(ETH_P_IPV6)) &&
-igb_ipv6_csum_is_sctp(skb))) {
+   if (skb_csum_is_sctp(skb)) {
type_tucmd = E1000_ADVTXD_TUCMD_L4T_SCTP;
break;
}
-- 
2.1.0



[PATCH net-next 5/6] net: ixgbe: use skb_csum_is_sctp instead of protocol check

2021-01-15 Thread Xin Long
Using skb_csum_is_sctp() is an easier way to validate that it's an SCTP CRC
checksum offload packet, and it also makes ixgbe support SCTP
CRC checksum offload for UDP and GRE encapped packets, just as it
does in the igb driver.

Signed-off-by: Xin Long 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 14 +-
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 6cbbe09..2973c54 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8040,15 +8040,6 @@ static int ixgbe_tso(struct ixgbe_ring *tx_ring,
return 1;
 }
 
-static inline bool ixgbe_ipv6_csum_is_sctp(struct sk_buff *skb)
-{
-   unsigned int offset = 0;
-
-   ipv6_find_hdr(skb, &offset, IPPROTO_SCTP, NULL, NULL);
-
-   return offset == skb_checksum_start_offset(skb);
-}
-
 static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring,
  struct ixgbe_tx_buffer *first,
  struct ixgbe_ipsec_tx_data *itd)
@@ -8074,10 +8065,7 @@ static void ixgbe_tx_csum(struct ixgbe_ring *tx_ring,
break;
case offsetof(struct sctphdr, checksum):
/* validate that this is actually an SCTP request */
-   if (((first->protocol == htons(ETH_P_IP)) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   ((first->protocol == htons(ETH_P_IPV6)) &&
-ixgbe_ipv6_csum_is_sctp(skb))) {
+   if (skb_csum_is_sctp(skb)) {
type_tucmd = IXGBE_ADVTXD_TUCMD_L4T_SCTP;
break;
}
-- 
2.1.0



Re: [PATCH v2 net-next] net: fix GSO for SG-enabled devices.

2021-01-19 Thread Xin Long
On Wed, Jan 20, 2021 at 12:57 AM Paolo Abeni  wrote:
>
> The commit dbd50f238dec ("net: move the hsize check to the else
> block in skb_segment") introduced a data corruption for devices
> supporting scatter-gather.
>
> The problem boils down to signed/unsigned comparison given
> unexpected results: if signed 'hsize' is negative, it will be
> considered greater than a positive 'len', which is unsigned.
>
> This commit addresses resorting to the old checks order, so that
> 'hsize' never has a negative value when compared with 'len'.
>
> v1 -> v2:
>  - reorder hsize checks instead of explicit cast (Alex)
>
> Bisected-by: Matthieu Baerts 
> Fixes: dbd50f238dec ("net: move the hsize check to the else block in 
> skb_segment")
> Signed-off-by: Paolo Abeni 
Reviewed-by: Xin Long 
> ---
>  net/core/skbuff.c | 4 ++--
>  1 file changed, 2 insertions(+), 2 deletions(-)
>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index e835193cabcc3..cf2c4dcf42579 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3938,10 +3938,10 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
> skb_release_head_state(nskb);
> __skb_push(nskb, doffset);
> } else {
> +   if (hsize < 0)
> +   hsize = 0;
> if (hsize > len || !sg)
> hsize = len;
> -   else if (hsize < 0)
> -   hsize = 0;
>
>     nskb = __alloc_skb(hsize + doffset + headroom,
>GFP_ATOMIC, 
> skb_alloc_rx_flag(head_skb),
> --
> 2.26.2
>
Reviewed-by: Xin Long 
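
For anyone revisiting this later, a minimal standalone sketch (illustration
only, not taken from the patch) of the signed/unsigned pitfall described
above: the negative signed operand is converted to unsigned for the
comparison, so it wrongly compares as greater than 'len'.

#include <stdio.h>

int main(void)
{
	int hsize = -14;          /* e.g. skb_headlen(head_skb) - offset */
	unsigned int len = 1400;  /* segment payload length */

	if (hsize > len)          /* hsize is converted to a huge unsigned value */
		printf("negative hsize wrongly compares as greater than len\n");
	return 0;
}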


Re: [PATCHv3 net-next 1/2] udp: call udp_encap_enable for v6 sockets when enabling encap

2021-01-19 Thread Xin Long
On Wed, Jan 20, 2021 at 6:17 AM Alexander Duyck
 wrote:
>
> On Fri, Jan 15, 2021 at 8:34 PM Xin Long  wrote:
> >
> > When enabling encap for a ipv6 socket without udp_encap_needed_key
> > increased, UDP GRO won't work for v4 mapped v6 address packets as
> > sk will be NULL in udp4_gro_receive().
> >
> > This patch is to enable it by increasing udp_encap_needed_key for
> > v6 sockets in udp_tunnel_encap_enable(), and correspondingly
> > decrease udp_encap_needed_key in udpv6_destroy_sock().
> >
> > v1->v2:
> >   - add udp_encap_disable() and export it.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  include/net/udp.h| 1 +
> >  include/net/udp_tunnel.h | 3 +--
> >  net/ipv4/udp.c   | 6 ++
> >  net/ipv6/udp.c   | 4 +++-
> >  4 files changed, 11 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/net/udp.h b/include/net/udp.h
> > index 877832b..1e7b6cd 100644
> > --- a/include/net/udp.h
> > +++ b/include/net/udp.h
> > @@ -467,6 +467,7 @@ void udp_init(void);
> >
> >  DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key);
> >  void udp_encap_enable(void);
> > +void udp_encap_disable(void);
> >  #if IS_ENABLED(CONFIG_IPV6)
> >  DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key);
> >  void udpv6_encap_enable(void);
> > diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
> > index 282d10e..afc7ce7 100644
> > --- a/include/net/udp_tunnel.h
> > +++ b/include/net/udp_tunnel.h
> > @@ -181,9 +181,8 @@ static inline void udp_tunnel_encap_enable(struct 
> > socket *sock)
> >  #if IS_ENABLED(CONFIG_IPV6)
> > if (sock->sk->sk_family == PF_INET6)
> > ipv6_stub->udpv6_encap_enable();
> > -   else
> >  #endif
> > -   udp_encap_enable();
> > +   udp_encap_enable();
> >  }
> >
> >  #define UDP_TUNNEL_NIC_MAX_TABLES  4
> > diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
> > index 7103b0a..28bfe60 100644
> > --- a/net/ipv4/udp.c
> > +++ b/net/ipv4/udp.c
> > @@ -596,6 +596,12 @@ void udp_encap_enable(void)
> >  }
> >  EXPORT_SYMBOL(udp_encap_enable);
> >
> > +void udp_encap_disable(void)
> > +{
> > +   static_branch_dec(&udp_encap_needed_key);
> > +}
> > +EXPORT_SYMBOL(udp_encap_disable);
> > +
> >  /* Handler for tunnels with arbitrary destination ports: no socket lookup, 
> > go
> >   * through error handlers in encapsulations looking for a match.
> >   */
>
> So this seems unbalanced to me. We are adding/modifying one spot where
> we are calling the enable function, but the other callers don't call
> the disable function? Specifically I am curious about how to deal with
> the rxrpc_open_socket usage.
As long as it's a UDP sock, udp(v6)_destroy_sock() will be called when
it's being destroyed, and that's where the key(s) get decreased.

Sorry, I missed that there's a call to udp_encap_enable() in rxrpc; the
issue is similar to the one in bareudp, and it should be:

-   udp_encap_enable();
 #if IS_ENABLED(CONFIG_AF_RXRPC_IPV6)
if (local->srx.transport.family == AF_INET6)
udpv6_encap_enable();
+   else
 #endif
+   udp_encap_enable();
+

I will get all these into one patch and repost.

Interestingly, it doesn't use the udp_tunnel APIs here; I will check.

Thanks.

>
> If we don't balance out all the callers I am not sure adding the
> udp_encap_disable makes much sense.
>
> > diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
> > index b9f3dfd..d754292 100644
> > --- a/net/ipv6/udp.c
> > +++ b/net/ipv6/udp.c
> > @@ -1608,8 +1608,10 @@ void udpv6_destroy_sock(struct sock *sk)
> > if (encap_destroy)
> > encap_destroy(sk);
> > }
> > -   if (up->encap_enabled)
> > +   if (up->encap_enabled) {
> > static_branch_dec(&udpv6_encap_needed_key);
> > +   udp_encap_disable();
> > +   }
> > }
> >
> > inet6_destroy_sock(sk);
> > --
> > 2.1.0
> >


[PATCH net-next 0/3] net: add support for ip generic checksum offload for gre

2021-01-21 Thread Xin Long
NETIF_F_IP(V6)_CSUM is only for TCP and UDP checksum offload, while
NETIF_F_HW_CSUM covers not only TCP and UDP's checksums but also all
other kinds of 1's complement checksum offload, like GRE's. This
patchset is to support it for GRE.
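
To make the distinction concrete, a standalone sketch (illustration only,
not kernel code) of the 16-bit 1's complement sum that all NETIF_F_HW_CSUM
offloads share; SCTP's CRC32c is a different algorithm, which is why it
keeps its own NETIF_F_SCTP_CRC flag.

#include <stdint.h>
#include <stddef.h>

static uint16_t inet_csum_example(const uint8_t *data, size_t len)
{
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)data[i] << 8) | data[i + 1];
	if (len & 1)                      /* pad an odd trailing byte */
		sum += (uint32_t)data[len - 1] << 8;
	while (sum >> 16)                 /* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;            /* 1's complement of the sum */
}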

To support GRE checksum offload, this patchset is to extend the
csum_not_inet/csum_type field of sk_buff in Patch 1/3, define a new
CSUM_T_IP_GENERIC type to get the ip generic checksum processed
in skb_csum_hwoffload_help() in Patch 2/3, and then implement it on
the TX and GSO paths in Patch 3/3.

Xin Long (3):
  net: rename csum_not_inet to csum_type
  net: add CSUM_T_IP_GENERIC csum_type
  ip_gre: add csum offload support for gre header

 include/linux/skbuff.h | 22 +-
 include/net/gre.h  | 20 
 net/core/dev.c | 19 ++-
 net/ipv4/gre_offload.c | 16 ++--
 net/sched/act_csum.c   |  2 +-
 net/sctp/offload.c |  2 +-
 net/sctp/output.c  |  3 ++-
 7 files changed, 53 insertions(+), 31 deletions(-)

-- 
2.1.0



[PATCH net-next 3/3] ip_gre: add csum offload support for gre header

2021-01-21 Thread Xin Long
This patch is to add csum offload support for gre header:

On the TX path in gre_build_header(), when CHECKSUM_PARTIAL is set
for the inner proto, it will calculate the csum for the outer proto, and
inner csum will be offloaded later. Otherwise, CHECKSUM_PARTIAL
and csum_start/offset will be set for outer proto, and the outer
csum will be offloaded later.

On the GSO path in gre_gso_segment(), when CHECKSUM_PARTIAL is
not set for inner proto and the hardware supports csum offload,
CHECKSUM_PARTIAL and csum_start/offset will be set for outer
proto, and outer csum will be offloaded later. Otherwise, it
will do csum for outer proto by calling gso_make_checksum().

Note that SCTP has to do the csum by itself for the non-GSO path in
sctp_packet_pack(), as gre_build_header() can't handle the csum
with CHECKSUM_PARTIAL set for SCTP CRC csum offload.

Signed-off-by: Xin Long 
---
 include/net/gre.h  | 20 
 net/ipv4/gre_offload.c | 16 ++--
 net/sctp/output.c  |  1 +
 3 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/include/net/gre.h b/include/net/gre.h
index b60f212..250b2fb 100644
--- a/include/net/gre.h
+++ b/include/net/gre.h
@@ -106,17 +106,6 @@ static inline __be16 gre_tnl_flags_to_gre_flags(__be16 
tflags)
return flags;
 }
 
-static inline __sum16 gre_checksum(struct sk_buff *skb)
-{
-   __wsum csum;
-
-   if (skb->ip_summed == CHECKSUM_PARTIAL)
-   csum = lco_csum(skb);
-   else
-   csum = skb_checksum(skb, 0, skb->len, 0);
-   return csum_fold(csum);
-}
-
 static inline void gre_build_header(struct sk_buff *skb, int hdr_len,
__be16 flags, __be16 proto,
__be32 key, __be32 seq)
@@ -146,7 +135,14 @@ static inline void gre_build_header(struct sk_buff *skb, 
int hdr_len,
!(skb_shinfo(skb)->gso_type &
  (SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
*ptr = 0;
-   *(__sum16 *)ptr = gre_checksum(skb);
+   if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   *(__sum16 *)ptr = csum_fold(lco_csum(skb));
+   } else {
+   skb->csum_type = CSUM_T_IP_GENERIC;
+   skb->ip_summed = CHECKSUM_PARTIAL;
+   skb->csum_start = skb_transport_header(skb) - 
skb->head;
+   skb->csum_offset = sizeof(*greh);
+   }
}
}
 }
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 10bc49b..12d6996 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,10 +15,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
   netdev_features_t features)
 {
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+   bool need_csum, offload_csum, gso_partial, need_ipsec;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
-   bool need_csum, gso_partial;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
 
@@ -47,6 +47,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
if (need_csum)
features &= ~NETIF_F_SCTP_CRC;
 
+   need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb));
+   /* Try to offload checksum if possible */
+   offload_csum = !!(need_csum && !need_ipsec &&
+ (skb->dev->features & NETIF_F_HW_CSUM));
+
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
@@ -100,7 +105,14 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
 
*(pcsum + 1) = 0;
-   *pcsum = gso_make_checksum(skb, 0);
+   if (skb->encapsulation || !offload_csum) {
+   *pcsum = gso_make_checksum(skb, 0);
+   } else {
+   skb->csum_type = CSUM_T_IP_GENERIC;
+   skb->ip_summed = CHECKSUM_PARTIAL;
+   skb->csum_start = skb_transport_header(skb) - skb->head;
+   skb->csum_offset = sizeof(*greh);
+   }
} while ((skb = skb->next));
 out:
return segs;
diff --git a/net/sctp/output.c b/net/sctp/output.c
index a8cf0191..52e12df 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -515,6 +515,7 @@ static int sctp_packet_pack(struct sctp_packet *packet,
return 1;
 
if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) ||
+   tp->dst->dev->type == ARPHRD_IPGRE ||
dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) {
struct sctphdr *sh =
(struct sctphdr *)skb_transport_header(head);
-- 
2.1.0



[PATCH net-next 1/3] net: rename csum_not_inet to csum_type

2021-01-21 Thread Xin Long
This patch is to rename csum_not_inet to csum_type, as more
csum types will be introduced in the next patch.

Signed-off-by: Xin Long 
---
 include/linux/skbuff.h | 19 +++
 net/core/dev.c |  2 +-
 net/sched/act_csum.c   |  2 +-
 net/sctp/offload.c |  2 +-
 net/sctp/output.c  |  2 +-
 5 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 186dad2..67b0a01 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -169,7 +169,7 @@
  *   skb_csum_hwoffload_help() can be called to resolve CHECKSUM_PARTIAL based
  *   on network device checksumming capabilities: if a packet does not match
  *   them, skb_checksum_help or skb_crc32c_help (depending on the value of
- *   csum_not_inet, see item D.) is called to resolve the checksum.
+ *   csum_type, see item D.) is called to resolve the checksum.
  *
  * CHECKSUM_NONE:
  *
@@ -190,12 +190,12 @@
  *   NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
  * offloading the SCTP CRC in a packet. To perform this offload the stack
  * will set csum_start and csum_offset accordingly, set ip_summed to
- * CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
- * the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
+ * CHECKSUM_PARTIAL and set csum_type to CSUM_T_SCTP_CRC, to provide an
+ * indication in the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
  * A driver that supports both IP checksum offload and SCTP CRC32c offload
  * must verify which offload is configured for a packet by testing the
- * value of skb->csum_not_inet; skb_crc32c_csum_help is provided to resolve
- * CHECKSUM_PARTIAL on skbs where csum_not_inet is set to 1.
+ * value of skb->csum_type; skb_crc32c_csum_help is provided to resolve
+ * CHECKSUM_PARTIAL on skbs where csum_type is set to CSUM_T_SCTP_CRC.
  *
  *   NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
  * offloading the FCOE CRC in a packet. To perform this offload the stack
@@ -222,6 +222,9 @@
 #define CHECKSUM_COMPLETE  2
 #define CHECKSUM_PARTIAL   3
 
+#define CSUM_T_INET0
+#define CSUM_T_SCTP_CRC1
+
 /* Maximum value in skb->csum_level */
 #define SKB_MAX_CSUM_LEVEL 3
 
@@ -677,7 +680,7 @@ typedef unsigned char *sk_buff_data_t;
  * @encapsulation: indicates the inner headers in the skbuff are valid
  * @encap_hdr_csum: software checksum is needed
  * @csum_valid: checksum is already valid
- * @csum_not_inet: use CRC32c to resolve CHECKSUM_PARTIAL
+ * @csum_type: do the checksum offload according to this type
  * @csum_complete_sw: checksum was completed by software
  * @csum_level: indicates the number of consecutive checksums found in
  * the packet minus one that have been verified as
@@ -836,7 +839,7 @@ struct sk_buff {
__u8vlan_present:1;
__u8csum_complete_sw:1;
__u8csum_level:2;
-   __u8csum_not_inet:1;
+   __u8csum_type:1;
__u8dst_pending_confirm:1;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8ndisc_nodetype:2;
@@ -4623,7 +4626,7 @@ static inline void skb_reset_redirect(struct sk_buff *skb)
 
 static inline bool skb_csum_is_sctp(struct sk_buff *skb)
 {
-   return skb->csum_not_inet;
+   return skb->csum_type == CSUM_T_SCTP_CRC;
 }
 
 static inline void skb_set_kcov_handle(struct sk_buff *skb,
diff --git a/net/core/dev.c b/net/core/dev.c
index d9ce02e..3241de2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3279,7 +3279,7 @@ int skb_crc32c_csum_help(struct sk_buff *skb)
  crc32c_csum_stub));
*(__le32 *)(skb->data + offset) = crc32c_csum;
skb->ip_summed = CHECKSUM_NONE;
-   skb->csum_not_inet = 0;
+   skb->csum_type = 0;
 out:
return ret;
 }
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index 4fa4fcb..9fabb6e 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -376,7 +376,7 @@ static int tcf_csum_sctp(struct sk_buff *skb, unsigned int 
ihl,
sctph->checksum = sctp_compute_cksum(skb,
 skb_network_offset(skb) + ihl);
skb->ip_summed = CHECKSUM_NONE;
-   skb->csum_not_inet = 0;
+   skb->csum_type = 0;
 
return 1;
 }
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index eb874e3..372f373 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -26,7 +26,7 @@
 static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
 {
skb->ip_summed = CHECKSUM_NONE;
-   skb->csum_not_inet = 0;
+   skb->csum_type = 0;
/* csum and csum_start in GSO CB may be needed to do the UDP
 * checksum wh

[PATCH net-next 2/3] net: add CSUM_T_IP_GENERIC csum_type

2021-01-21 Thread Xin Long
This patch is to extend the csum_type field to 2 bits, introduce the
CSUM_T_IP_GENERIC csum type, and add support for it in
skb_csum_hwoffload_help(), just like CSUM_T_SCTP_CRC.

Note here it moves dst_pending_confirm field below ndisc_nodetype
to avoid a memory hole.

Signed-off-by: Xin Long 
---
 include/linux/skbuff.h |  5 +++--
 net/core/dev.c | 17 +
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 67b0a01..d5011fb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -224,6 +224,7 @@
 
 #define CSUM_T_INET0
 #define CSUM_T_SCTP_CRC1
+#define CSUM_T_IP_GENERIC  2
 
 /* Maximum value in skb->csum_level */
 #define SKB_MAX_CSUM_LEVEL 3
@@ -839,11 +840,11 @@ struct sk_buff {
        __u8                    vlan_present:1;
        __u8                    csum_complete_sw:1;
        __u8                    csum_level:2;
-       __u8                    csum_type:1;
-       __u8                    dst_pending_confirm:1;
+       __u8                    csum_type:2;
 #ifdef CONFIG_IPV6_NDISC_NODETYPE
        __u8                    ndisc_nodetype:2;
 #endif
+       __u8                    dst_pending_confirm:1;

        __u8                    ipvs_property:1;
        __u8                    inner_protocol_type:1;
diff --git a/net/core/dev.c b/net/core/dev.c
index 3241de2..6d48af2 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3617,11 +3617,20 @@ static struct sk_buff *validate_xmit_vlan(struct 
sk_buff *skb,
 int skb_csum_hwoffload_help(struct sk_buff *skb,
const netdev_features_t features)
 {
-   if (unlikely(skb_csum_is_sctp(skb)))
-   return !!(features & NETIF_F_SCTP_CRC) ? 0 :
-   skb_crc32c_csum_help(skb);
+   if (likely(!skb->csum_type))
+   return !!(features & NETIF_F_CSUM_MASK) ? 0 :
+  skb_checksum_help(skb);
 
-   return !!(features & NETIF_F_CSUM_MASK) ? 0 : skb_checksum_help(skb);
+   if (skb_csum_is_sctp(skb)) {
+   return !!(features & NETIF_F_SCTP_CRC) ? 0 :
+  skb_crc32c_csum_help(skb);
+   } else if (skb->csum_type == CSUM_T_IP_GENERIC) {
+   return !!(features & NETIF_F_HW_CSUM) ? 0 :
+  skb_checksum_help(skb);
+   } else {
+   pr_warn("Wrong csum type: %d\n", skb->csum_type);
+   return 1;
+   }
 }
 EXPORT_SYMBOL(skb_csum_hwoffload_help);
 
-- 
2.1.0



Re: [PATCH net-next 1/3] net: rename csum_not_inet to csum_type

2021-01-21 Thread Xin Long
On Fri, Jan 22, 2021 at 10:50 AM Jakub Kicinski  wrote:
>
> On Thu, 21 Jan 2021 16:45:36 +0800 Xin Long wrote:
> > This patch is to rename csum_not_inet to csum_type, as later
> > more csum type would be introduced in the next patch.
> >
> > Signed-off-by: Xin Long 
>
> drivers/net/ethernet/hisilicon/hns3/hns3_enet.c:1073:11: error: ‘struct 
> sk_buff’ has no member named ‘csum_not_inet’; did you mean ‘csum_offset’?
>  1073 |  if (skb->csum_not_inet || skb_is_gso(skb) ||
>   |   ^
>   |   csum_offset
I will replace it with skb_csum_is_sctp(). Thanks.


Re: [PATCH net-next 2/3] net: add CSUM_T_IP_GENERIC csum_type

2021-01-21 Thread Xin Long
On Fri, Jan 22, 2021 at 2:13 AM Alexander Duyck
 wrote:
>
> On Thu, Jan 21, 2021 at 12:46 AM Xin Long  wrote:
> >
> > This patch is to extend csum_type field to 2 bits, and introduce
> > CSUM_T_IP_GENERIC csum type, and add the support for this in
> > skb_csum_hwoffload_help(), just like CSUM_T_SCTP_CRC.
> >
> > Note here it moves dst_pending_confirm field below ndisc_nodetype
> > to avoid a memory hole.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  include/linux/skbuff.h |  5 +++--
> >  net/core/dev.c | 17 +
> >  2 files changed, 16 insertions(+), 6 deletions(-)
> >
> > diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> > index 67b0a01..d5011fb 100644
> > --- a/include/linux/skbuff.h
> > +++ b/include/linux/skbuff.h
> > @@ -224,6 +224,7 @@
> >
> >  #define CSUM_T_INET0
> >  #define CSUM_T_SCTP_CRC1
> > +#define CSUM_T_IP_GENERIC  2
> >
> >  /* Maximum value in skb->csum_level */
> >  #define SKB_MAX_CSUM_LEVEL 3
> > @@ -839,11 +840,11 @@ struct sk_buff {
> > __u8vlan_present:1;
> > __u8csum_complete_sw:1;
> > __u8csum_level:2;
> > -   __u8csum_type:1;
> > -   __u8dst_pending_confirm:1;
> > +   __u8csum_type:2;
> >  #ifdef CONFIG_IPV6_NDISC_NODETYPE
> > __u8ndisc_nodetype:2;
> >  #endif
> > +   __u8dst_pending_confirm:1;
> >
> > __u8ipvs_property:1;
> > __u8inner_protocol_type:1;
> > diff --git a/net/core/dev.c b/net/core/dev.c
> > index 3241de2..6d48af2 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3617,11 +3617,20 @@ static struct sk_buff *validate_xmit_vlan(struct 
> > sk_buff *skb,
> >  int skb_csum_hwoffload_help(struct sk_buff *skb,
> > const netdev_features_t features)
> >  {
> > -   if (unlikely(skb_csum_is_sctp(skb)))
> > -   return !!(features & NETIF_F_SCTP_CRC) ? 0 :
> > -   skb_crc32c_csum_help(skb);
> > +   if (likely(!skb->csum_type))
> > +   return !!(features & NETIF_F_CSUM_MASK) ? 0 :
> > +  skb_checksum_help(skb);
> >
> > -   return !!(features & NETIF_F_CSUM_MASK) ? 0 : 
> > skb_checksum_help(skb);
> > +   if (skb_csum_is_sctp(skb)) {
> > +   return !!(features & NETIF_F_SCTP_CRC) ? 0 :
> > +  skb_crc32c_csum_help(skb);
> > +   } else if (skb->csum_type == CSUM_T_IP_GENERIC) {
> > +   return !!(features & NETIF_F_HW_CSUM) ? 0 :
> > +  skb_checksum_help(skb);
> > +   } else {
> > +   pr_warn("Wrong csum type: %d\n", skb->csum_type);
> > +   return 1;
> > +   }
>
> Is the only difference between CSUM_T_IP_GENERIC the fact that we
> check for NETIF_F_HW_CSUM versus using NETIF_F_CSUM_MASK? If so I
> don't think adding the new bit is adding all that much value. Instead
> you could probably just catch this in the testing logic here.
>
> You could very easily just fold CSUM_T_IP_GENERIC into CSUM_T_INET,
> and then in the checks here you split up the checks for
> NETIF_F_HW_CSUM as follows:
If so, it's better not to touch csum_not_inet now. I will drop patch 1/3.

>
>  if (skb_csum_is_sctp(skb))
> return !!(features & NETIF_F_SCTP_CRC) ? 0 : skb_crc32c_csum_help(skb);
>
> if (skb->csum_type) {
> pr_warn("Wrong csum type: %d\n", skb->csum_type);
> return 1;
> }
>
> if (features & NETIF_F_HW_CSUM)
> return 0;
>
> if (features & NETIF_F_CSUM_MASK) {
> switch (skb->csum_offset) {
> case offsetof(struct tcphdr, check):
> case offsetof(struct udphdr, check):
> return 0;
> }
The question is: is it reliable to identify the type by skb->csum_offset?
What if one day there is another protocol whose checksum field sits at the
same offset and which also uses CSUM_T_IP_GENERIC?

> }
>
> return skb_checksum_help(skb);
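
For reference, here is a self-contained sketch of skb_csum_hwoffload_help()
restructured along the lines Alex suggests above, i.e. folding the generic
case back into the default path and recognizing TCP/UDP by csum_offset. This
is only a sketch of the proposal (a possible shape for net/core/dev.c), not
the code that was actually merged:

int skb_csum_hwoffload_help(struct sk_buff *skb,
			    const netdev_features_t features)
{
	/* CRC32c (SCTP) is not an inet checksum; it needs NETIF_F_SCTP_CRC
	 * or the crc32c software fallback.
	 */
	if (unlikely(skb_csum_is_sctp(skb)))
		return !!(features & NETIF_F_SCTP_CRC) ? 0 :
			skb_crc32c_csum_help(skb);

	/* NETIF_F_HW_CSUM devices can offload any csum_start/csum_offset. */
	if (features & NETIF_F_HW_CSUM)
		return 0;

	/* Protocol-specific offloads only cover TCP and UDP checksums, so
	 * identify those by the checksum field offset.
	 */
	if (features & NETIF_F_CSUM_MASK) {
		switch (skb->csum_offset) {
		case offsetof(struct tcphdr, check):
		case offsetof(struct udphdr, check):
			return 0;
		}
	}

	return skb_checksum_help(skb);
}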


[PATCH net-next] net: hns3: replace skb->csum_not_inet with skb_csum_is_sctp

2021-01-22 Thread Xin Long
Commit fa8211701043 ("net: add inline function skb_csum_is_sctp")
missed replacing the skb->csum_not_inet check in hns3. This patch
replaces it with skb_csum_is_sctp().

Reported-by: Jakub Kicinski 
Signed-off-by: Xin Long 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 405e490..5120806 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1070,7 +1070,7 @@ static bool hns3_check_hw_tx_csum(struct sk_buff *skb)
 * HW checksum of the non-IP packets and GSO packets is handled at
 * different place in the following code
 */
-   if (skb->csum_not_inet || skb_is_gso(skb) ||
+   if (skb_csum_is_sctp(skb) || skb_is_gso(skb) ||
!test_bit(HNS3_NIC_STATE_HW_TX_CSUM_ENABLE, &priv->state))
return false;
 
-- 
2.1.0



[PATCH net] udp: fix the proto value passed to ip_protocol_deliver_rcu for the segments

2020-12-06 Thread Xin Long
Guillaume noticed that for segments udp_queue_rcv_one_skb() returns the
proto, so "ret" should be passed unmodified to ip_protocol_deliver_rcu().
Otherwise, with a negative value passed, it will underflow inet_protos.

This can be reproduced with IPIP FOU:

  # ip fou add port  ipproto 4
  # ethtool -K eth1 rx-gro-list on

Fixes: cf329aa42b66 ("udp: cope with UDP GRO packet misdirection")
Reported-by: Guillaume Nault 
Signed-off-by: Xin Long 
---
 net/ipv4/udp.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 09f0a23..9eeebd4 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -2173,7 +2173,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct 
sk_buff *skb)
__skb_pull(skb, skb_transport_offset(skb));
ret = udp_queue_rcv_one_skb(sk, skb);
if (ret > 0)
-   ip_protocol_deliver_rcu(dev_net(skb->dev), skb, -ret);
+   ip_protocol_deliver_rcu(dev_net(skb->dev), skb, ret);
}
return 0;
 }
-- 
2.1.0
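
For context on the underflow: ip_protocol_deliver_rcu() indexes the protocol
handler array directly with the value it is given, roughly like the abridged
sketch below (a paraphrase of net/ipv4/ip_input.c, not the full function), so
a negative protocol reads before the start of inet_protos[]:

void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
{
	const struct net_protocol *ipprot;

	/* inet_protos[] has MAX_INET_PROTOS (256) entries; a negative index
	 * points before the array.
	 */
	ipprot = rcu_dereference(inet_protos[protocol]);
	if (ipprot) {
		/* ... hand the skb to ipprot->handler() ... */
	}
	/* ... raw socket delivery and resubmit handling elided ... */
}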



Re: [PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-18 Thread Xin Long
On Thu, Nov 19, 2020 at 12:44 AM Jakub Kicinski  wrote:
>
> On Wed, 18 Nov 2020 14:14:49 +0800 Xin Long wrote:
> > On Wed, Nov 18, 2020 at 8:29 AM Jakub Kicinski  wrote:
> > > On Mon, 16 Nov 2020 17:15:47 +0800 Xin Long wrote:
> > > > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > > > by removing CRC flag from the dev features in gre_gso_segment() for
> > > > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > > > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> > > >
> > > > It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
> > > > after that commit, so it would do checksum with gso_make_checksum()
> > > > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > > > gre csum for sctp over gre tunnels") can be reverted now.
> > > >
> > > > Signed-off-by: Xin Long 
> > >
> > > Makes sense, but does GRE tunnels don't always have a csum.
> > Do you mean the GRE csum can be offloaded? If so, it seems for GRE tunnel
> > we need the similar one as:
> >
> > commit 4bcb877d257c87298aedead1ffeaba0d5df1991d
> > Author: Tom Herbert 
> > Date:   Tue Nov 4 09:06:52 2014 -0800
> >
> > udp: Offload outer UDP tunnel csum if available
> >
> > I will confirm and implement it in another patch.
> >
> > >
> > > Is the current hardware not capable of generating CRC csums over
> > > encapsulated patches at all?
> > There is, but very rare. The thing is after doing CRC csum, the outer
> > GRE/UDP checksum will have to be recomputed, as it's NOT zero after
> > all fields for CRC scum are summed, which is different from the
> > common checksum. So if it's a GRE/UDP tunnel, the inner CRC csum
> > has to be done there even if the HW supports its offload.
>
> Ack, my point is that for UDP tunnels (at least with IPv4) the UDP
> checksum is optional (should be ignored if the header field is 0),
> and for GRE checksum is optional and it's presence is indicated by
> a bit in the header IIRC.
Yes, it's tunnel->parms.o_flags & TUNNEL_CSUM. When it's not set,
gso_type is set to SKB_GSO_GRE instead of SKB_GSO_GRE_CSUM
by gre_handle_offloads().


>
> So if the HW can compute the CRC csum based on offsets, without parsing
> the packet, it should be able to do the CRC on tunneled packets w/o
> checksum in the outer header.
Right, we can only force it to do CRC csum there when SKB_GSO_GRE_CSUM was set:

need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
skb->encap_hdr_csum = need_csum;

features &= skb->dev->hw_enc_features;
+   if (need_csum)
+   features &= ~NETIF_F_SCTP_CRC;

I will give it a try.

BTW, __skb_udp_tunnel_segment() doesn't need this, as for UDP-encapsulated
SCTP the UDP csum is always set.

>
> IDK how realistic this is, whether it'd work today, and whether we care
> about it...


Re: [PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-18 Thread Xin Long
On Thu, Nov 19, 2020 at 4:35 AM Alexander Duyck
 wrote:
>
> On Mon, Nov 16, 2020 at 1:17 AM Xin Long  wrote:
> >
> > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > by removing CRC flag from the dev features in gre_gso_segment() for
> > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> >
> > It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
> > after that commit, so it would do checksum with gso_make_checksum()
> > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > gre csum for sctp over gre tunnels") can be reverted now.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  net/ipv4/gre_offload.c | 14 +++---
> >  1 file changed, 3 insertions(+), 11 deletions(-)
> >
> > diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> > index e0a2465..a5935d4 100644
> > --- a/net/ipv4/gre_offload.c
> > +++ b/net/ipv4/gre_offload.c
> > @@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> >netdev_features_t features)
> >  {
> > int tnl_hlen = skb_inner_mac_header(skb) - 
> > skb_transport_header(skb);
> > -   bool need_csum, need_recompute_csum, gso_partial;
> > struct sk_buff *segs = ERR_PTR(-EINVAL);
> > u16 mac_offset = skb->mac_header;
> > __be16 protocol = skb->protocol;
> > u16 mac_len = skb->mac_len;
> > int gre_offset, outer_hlen;
> > +   bool need_csum, gso_partial;
> >
> > if (!skb->encapsulation)
> > goto out;
> > @@ -41,10 +41,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> > skb->protocol = skb->inner_protocol;
> >
> > need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
> > -   need_recompute_csum = skb->csum_not_inet;
> > skb->encap_hdr_csum = need_csum;
> >
> > features &= skb->dev->hw_enc_features;
> > +   features &= ~NETIF_F_SCTP_CRC;
> >
> > /* segment inner packet. */
> > segs = skb_mac_gso_segment(skb, features);
>
> Why just blindly strip NETIF_F_SCTP_CRC? It seems like it would make
> more sense if there was an explanation as to why you are stripping the
> offload. I know there are many NICs that could very easily perform
> SCTP CRC offload on the inner data as long as they didn't have to
> offload the outer data. For example the Intel NICs should be able to
> do it, although when I wrote the code up enabling their offloads I
> think it is only looking at the outer headers so that might require
> updating to get it to not use the software fallback.
>
> It really seems like we should only be clearing NETIF_F_SCTP_CRC if
> need_csum is true since we must compute the CRC before we can compute
> the GRE checksum.
Right, it's also what Jakub commented, thanks.

>
> > @@ -99,15 +99,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> > }
> >
> > *(pcsum + 1) = 0;
> > -   if (need_recompute_csum && !skb_is_gso(skb)) {
> > -   __wsum csum;
> > -
> > -   csum = skb_checksum(skb, gre_offset,
> > -   skb->len - gre_offset, 0);
> > -   *pcsum = csum_fold(csum);
> > -   } else {
> > -   *pcsum = gso_make_checksum(skb, 0);
> > -   }
> > +   *pcsum = gso_make_checksum(skb, 0);
> > } while ((skb = skb->next));
> >  out:
> > return segs;
>
> This change doesn't make much sense to me. How are we expecting
> gso_make_checksum to be able to generate a valid checksum when we are
> dealing with a SCTP frame? From what I can tell it looks like it is
> just setting the checksum to ~0 and checksum start to the transport
> header which isn't true because SCTP is using a CRC, not a 1's
> complement checksum, or am I missing something? As such in order to
> get the gre checksum we would need to compute it over the entire
> payload data wouldn't we? Has this been tested with an actual GRE
> tunnel that had checksums enabled? If so was it verified that the GSO
> frames were actually being segmented at the NIC level and not at the
> GRE tunnel level?
Hi Alex,

I think you're looking at net.git? As on net

Re: [PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-20 Thread Xin Long
On Fri, Nov 20, 2020 at 1:24 AM Alexander Duyck
 wrote:
>
> On Wed, Nov 18, 2020 at 9:53 PM Xin Long  wrote:
> >
> > On Thu, Nov 19, 2020 at 4:35 AM Alexander Duyck
> >  wrote:
> > >
> > > On Mon, Nov 16, 2020 at 1:17 AM Xin Long  wrote:
> > > >
> > > > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > > > by removing CRC flag from the dev features in gre_gso_segment() for
> > > > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > > > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> > > > It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
> > > > after that commit, so it would do checksum with gso_make_checksum()
> > > > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > > > gre csum for sctp over gre tunnels") can be reverted now.
> > > >
> > > > Signed-off-by: Xin Long 
> > > > ---
> > > >  net/ipv4/gre_offload.c | 14 +++---
> > > >  1 file changed, 3 insertions(+), 11 deletions(-)
> > > >
> > > > diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> > > > index e0a2465..a5935d4 100644
> > > > --- a/net/ipv4/gre_offload.c
> > > > +++ b/net/ipv4/gre_offload.c
> > > > @@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct 
> > > > sk_buff *skb,
> > > >netdev_features_t features)
> > > >  {
> > > > int tnl_hlen = skb_inner_mac_header(skb) - 
> > > > skb_transport_header(skb);
> > > > -   bool need_csum, need_recompute_csum, gso_partial;
> > > > struct sk_buff *segs = ERR_PTR(-EINVAL);
> > > > u16 mac_offset = skb->mac_header;
> > > > __be16 protocol = skb->protocol;
> > > > u16 mac_len = skb->mac_len;
> > > > int gre_offset, outer_hlen;
> > > > +   bool need_csum, gso_partial;
> > > >
> > > > if (!skb->encapsulation)
> > > > goto out;
> > > > @@ -41,10 +41,10 @@ static struct sk_buff *gre_gso_segment(struct 
> > > > sk_buff *skb,
> > > > skb->protocol = skb->inner_protocol;
> > > >
> > > > need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
> > > > -   need_recompute_csum = skb->csum_not_inet;
> > > > skb->encap_hdr_csum = need_csum;
> > > >
> > > > features &= skb->dev->hw_enc_features;
> > > > +   features &= ~NETIF_F_SCTP_CRC;
> > > >
> > > > /* segment inner packet. */
> > > > segs = skb_mac_gso_segment(skb, features);
> > >
> > > Why just blindly strip NETIF_F_SCTP_CRC? It seems like it would make
> > > more sense if there was an explanation as to why you are stripping the
> > > offload. I know there are many NICs that could very easily perform
> > > SCTP CRC offload on the inner data as long as they didn't have to
> > > offload the outer data. For example the Intel NICs should be able to
> > > do it, although when I wrote the code up enabling their offloads I
> > > think it is only looking at the outer headers so that might require
> > > updating to get it to not use the software fallback.
> > >
> > > It really seems like we should only be clearing NETIF_F_SCTP_CRC if
> > > need_csum is true since we must compute the CRC before we can compute
> > > the GRE checksum.
> > Right, it's also what Jakub commented, thanks.
> >
> > >
> > > > @@ -99,15 +99,7 @@ static struct sk_buff *gre_gso_segment(struct 
> > > > sk_buff *skb,
> > > > }
> > > >
> > > > *(pcsum + 1) = 0;
> > > > -   if (need_recompute_csum && !skb_is_gso(skb)) {
> > > > -   __wsum csum;
> > > > -
> > > > -   csum = skb_checksum(skb, gre_offset,
> > > > -   skb->len - gre_offset, 0);
> > > > -   *pcsum = csum_fold(csum);
> > > > -   } else {
> > > > -   *pcsum = gso_make_checksum(skb, 0);
> > > > -   }
> > > > +   *pcsum = gso_make_chec

Re: [PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-23 Thread Xin Long
On Sat, Nov 21, 2020 at 12:10 AM Alexander Duyck
 wrote:
>
> On Fri, Nov 20, 2020 at 2:23 AM Xin Long  wrote:
> >
> > On Fri, Nov 20, 2020 at 1:24 AM Alexander Duyck
> >  wrote:
> > >
> > > On Wed, Nov 18, 2020 at 9:53 PM Xin Long  wrote:
> > > >
> > > > On Thu, Nov 19, 2020 at 4:35 AM Alexander Duyck
> > > >  wrote:
> > > > >
> > > > > On Mon, Nov 16, 2020 at 1:17 AM Xin Long  wrote:
> > > > > >
> > > > > > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > > > > > by removing CRC flag from the dev features in gre_gso_segment() for
> > > > > > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > > > > > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> > > > > > It could set csum/csum_start in GSO CB properly in 
> > > > > > sctp_gso_segment()
> > > > > > after that commit, so it would do checksum with gso_make_checksum()
> > > > > > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > > > > > gre csum for sctp over gre tunnels") can be reverted now.
> > > > > >
> > > > > > Signed-off-by: Xin Long 
> > > > > > ---
> > > > > >  net/ipv4/gre_offload.c | 14 +++---
> > > > > >  1 file changed, 3 insertions(+), 11 deletions(-)
> > > > > >
> > > > > > diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> > > > > > index e0a2465..a5935d4 100644
> > > > > > --- a/net/ipv4/gre_offload.c
> > > > > > +++ b/net/ipv4/gre_offload.c
> > > > > > @@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct 
> > > > > > sk_buff *skb,
> > > > > >netdev_features_t features)
> > > > > >  {
> > > > > > int tnl_hlen = skb_inner_mac_header(skb) - 
> > > > > > skb_transport_header(skb);
> > > > > > -   bool need_csum, need_recompute_csum, gso_partial;
> > > > > > struct sk_buff *segs = ERR_PTR(-EINVAL);
> > > > > > u16 mac_offset = skb->mac_header;
> > > > > > __be16 protocol = skb->protocol;
> > > > > > u16 mac_len = skb->mac_len;
> > > > > > int gre_offset, outer_hlen;
> > > > > > +   bool need_csum, gso_partial;
> > > > > >
> > > > > > if (!skb->encapsulation)
> > > > > > goto out;
> > > > > > @@ -41,10 +41,10 @@ static struct sk_buff *gre_gso_segment(struct 
> > > > > > sk_buff *skb,
> > > > > > skb->protocol = skb->inner_protocol;
> > > > > >
> > > > > > need_csum = !!(skb_shinfo(skb)->gso_type & 
> > > > > > SKB_GSO_GRE_CSUM);
> > > > > > -   need_recompute_csum = skb->csum_not_inet;
> > > > > > skb->encap_hdr_csum = need_csum;
> > > > > >
> > > > > > features &= skb->dev->hw_enc_features;
> > > > > > +   features &= ~NETIF_F_SCTP_CRC;
> > > > > >
> > > > > > /* segment inner packet. */
> > > > > > segs = skb_mac_gso_segment(skb, features);
> > > > >
> > > > > Why just blindly strip NETIF_F_SCTP_CRC? It seems like it would make
> > > > > more sense if there was an explanation as to why you are stripping the
> > > > > offload. I know there are many NICs that could very easily perform
> > > > > SCTP CRC offload on the inner data as long as they didn't have to
> > > > > offload the outer data. For example the Intel NICs should be able to
> > > > > do it, although when I wrote the code up enabling their offloads I
> > > > > think it is only looking at the outer headers so that might require
> > > > > updating to get it to not use the software fallback.
> > > > >
> > > > > It really seems like we should only be clearing NETIF_F_SCTP_CRC if
> > > > > need_csum is true since we must compute the CRC before we can compute
> > > > > the GRE checksum.
> > > > Right, it's also what Jakub com

Re: [PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-24 Thread Xin Long
On Tue, Nov 24, 2020 at 6:23 AM Alexander Duyck
 wrote:
>
> On Mon, Nov 23, 2020 at 1:14 AM Xin Long  wrote:
> >
> > On Sat, Nov 21, 2020 at 12:10 AM Alexander Duyck
> >  wrote:
> > >
> > > On Fri, Nov 20, 2020 at 2:23 AM Xin Long  wrote:
> > > >
> > > > On Fri, Nov 20, 2020 at 1:24 AM Alexander Duyck
> > > >  wrote:
> > > > >
> > > > > On Wed, Nov 18, 2020 at 9:53 PM Xin Long  wrote:
> > > > > >
> > > > > > On Thu, Nov 19, 2020 at 4:35 AM Alexander Duyck
> > > > > >  wrote:
> > > > > > >
> > > > > > > On Mon, Nov 16, 2020 at 1:17 AM Xin Long  
> > > > > > > wrote:
> > > > > > > >
>
> 
>
> > > > > > @@ -27,7 +27,11 @@ static __le32 sctp_gso_make_checksum(struct 
> > > > > > sk_buff *skb)
> > > > > >  {
> > > > > > skb->ip_summed = CHECKSUM_NONE;
> > > > > > skb->csum_not_inet = 0;
> > > > > > -   gso_reset_checksum(skb, ~0);
> > > > > > +   /* csum and csum_start in GSO CB may be needed to do the UDP
> > > > > > +* checksum when it's a UDP tunneling packet.
> > > > > > +*/
> > > > > > +   SKB_GSO_CB(skb)->csum = (__force __wsum)~0;
> > > > > > +   SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len;
> > > > > > return sctp_compute_cksum(skb, skb_transport_offset(skb));
> > > > > >  }
> > > > > >
> > > > > > And yes, this patch has been tested with GRE tunnel checksums 
> > > > > > enabled.
> > > > > > (... icsum ocsum).
> > > > > > And yes, it was segmented in the lower NIC level, and we can make 
> > > > > > it by:
> > > > > >
> > > > > > # ethtool -K gre1 tx-sctp-segmentation on
> > > > > > # ethtool -K veth0 tx-sctp-segmentation off
> > > > > >
> > > > > > (Note: "tx-checksum-sctp" and "gso" are on for both devices)
> > > > > >
> > > > > > Thanks.
> > > > >
> > > > > I would also turn off Tx and Rx checksum offload on your veth device
> > > > > in order to make certain you aren't falsely sending data across
> > > > > indicating that the checksums are valid when they are not. It might be
> > > > > better if you were to run this over an actual NIC as that could then
> > > > > provide independent verification as it would be a fixed checksum test.
> > > > >
> > > > > I'm still not convinced this is working correctly. Basically a crc32c
> > > > > is not the same thing as a 1's complement checksum so you should need
> > > > > to compute both if you have an SCTP packet tunneled inside a UDP or
> > > > > GRE packet with a checksum. I don't see how computing a crc32c should
> > > > > automatically give you a 1's complement checksum of ~0.
> > > >
> > > > On the tx Path [1] below, the sctp crc checksum is calculated in
> > > > sctp_gso_make_checksum() [a], where it calls *sctp_compute_cksum()*
> > > > to do that, and as for the code in it:
> > > >
> > > > SKB_GSO_CB(skb)->csum = (__force __wsum)~0;
> > > > SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len;
> > >
> > > Okay, so I think I know how this is working, but the number of things
> > > relied on is ugly. Normally assuming skb_headroom(skb) + skb->len
> > > being valid for this would be a non-starter. I was assuming you
> > > weren't doing the 1's compliment checksum because you weren't using
> > > __skb_checksum to generate it.
> > >
> > > If I am not mistaken SCTP GSO uses the GSO_BY_FRAGS and apparently
> > > none of the frags are using page fragments within the skb. Am I
> > > understanding correctly? One thing that would help to address some of
> > > my concerns would be to clear instead of set NETIF_F_SG in
> > > sctp_gso_segment since your checksum depends on linear skbs.
> > Right, no frag is using page fragments for SCTP GSO.
> > NETIF_F_SG is set here, because in skb_segment():
> >
> > if (hsize > len || !sg)
> > 

[PATCH net] sctp: change to hold/put transport for proto_unreach_timer

2020-11-13 Thread Xin Long
A call trace was found in Hangbin's Codenomicon testing with debug kernel:

  [ 2615.981988] ODEBUG: free active (active state 0) object type: timer_list 
hint: sctp_generate_proto_unreach_event+0x0/0x3a0 [sctp]
  [ 2615.995050] WARNING: CPU: 17 PID: 0 at lib/debugobjects.c:328 
debug_print_object+0x199/0x2b0
  [ 2616.095934] RIP: 0010:debug_print_object+0x199/0x2b0
  [ 2616.191533] Call Trace:
  [ 2616.194265]  
  [ 2616.202068]  debug_check_no_obj_freed+0x25e/0x3f0
  [ 2616.207336]  slab_free_freelist_hook+0xeb/0x140
  [ 2616.220971]  kfree+0xd6/0x2c0
  [ 2616.224293]  rcu_do_batch+0x3bd/0xc70
  [ 2616.243096]  rcu_core+0x8b9/0xd00
  [ 2616.256065]  __do_softirq+0x23d/0xacd
  [ 2616.260166]  irq_exit+0x236/0x2a0
  [ 2616.263879]  smp_apic_timer_interrupt+0x18d/0x620
  [ 2616.269138]  apic_timer_interrupt+0xf/0x20
  [ 2616.273711]  

This is because the code holds asoc when transport->proto_unreach_timer
is started and puts asoc when the timer is stopped; without holding the
transport, the transport can be freed while the timer is still running.

So fix it by holding/putting the transport instead for proto_unreach_timer,
just like the other timers in the transport.

Fixes: 50b5d6ad6382 ("sctp: Fix a race between ICMP protocol unreachable and 
connect()")
Reported-by: Hangbin Liu 
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 4 ++--
 net/sctp/sm_sideeffect.c | 2 +-
 net/sctp/transport.c | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 55d4fc6..d508f6f 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -449,7 +449,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
else {
if (!mod_timer(&t->proto_unreach_timer,
jiffies + (HZ/20)))
-   sctp_association_hold(asoc);
+   sctp_transport_hold(t);
}
} else {
struct net *net = sock_net(sk);
@@ -458,7 +458,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
 "encountered!\n", __func__);
 
if (del_timer(&t->proto_unreach_timer))
-   sctp_association_put(asoc);
+   sctp_transport_put(t);
 
sctp_do_sm(net, SCTP_EVENT_T_OTHER,
   SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 813d307..0a51150 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -419,7 +419,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t)
/* Try again later.  */
if (!mod_timer(&transport->proto_unreach_timer,
jiffies + (HZ/20)))
-   sctp_association_hold(asoc);
+   sctp_transport_hold(transport);
goto out_unlock;
}
 
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 806af58..60fcf31 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -133,7 +133,7 @@ void sctp_transport_free(struct sctp_transport *transport)
 
/* Delete the ICMP proto unreachable timer if it's active. */
if (del_timer(&transport->proto_unreach_timer))
-   sctp_association_put(transport->asoc);
+   sctp_transport_put(transport);
 
sctp_transport_put(transport);
 }
-- 
2.1.0



[PATCHv2 net] sctp: change to hold/put transport for proto_unreach_timer

2020-11-13 Thread Xin Long
A call trace was found in Hangbin's Codenomicon testing with debug kernel:

  [ 2615.981988] ODEBUG: free active (active state 0) object type: timer_list 
hint: sctp_generate_proto_unreach_event+0x0/0x3a0 [sctp]
  [ 2615.995050] WARNING: CPU: 17 PID: 0 at lib/debugobjects.c:328 
debug_print_object+0x199/0x2b0
  [ 2616.095934] RIP: 0010:debug_print_object+0x199/0x2b0
  [ 2616.191533] Call Trace:
  [ 2616.194265]  
  [ 2616.202068]  debug_check_no_obj_freed+0x25e/0x3f0
  [ 2616.207336]  slab_free_freelist_hook+0xeb/0x140
  [ 2616.220971]  kfree+0xd6/0x2c0
  [ 2616.224293]  rcu_do_batch+0x3bd/0xc70
  [ 2616.243096]  rcu_core+0x8b9/0xd00
  [ 2616.256065]  __do_softirq+0x23d/0xacd
  [ 2616.260166]  irq_exit+0x236/0x2a0
  [ 2616.263879]  smp_apic_timer_interrupt+0x18d/0x620
  [ 2616.269138]  apic_timer_interrupt+0xf/0x20
  [ 2616.273711]  

This is because the code holds asoc when transport->proto_unreach_timer
is started and puts asoc when the timer is stopped; without holding the
transport, the transport can be freed while the timer is still running.

So fix it by holding/putting the transport instead for proto_unreach_timer,
just like the other timers in the transport.

v1->v2:
  - Also use sctp_transport_put() for the "out_unlock:" path in
sctp_generate_proto_unreach_event(), as Marcelo noticed.

Fixes: 50b5d6ad6382 ("sctp: Fix a race between ICMP protocol unreachable and 
connect()")
Reported-by: Hangbin Liu 
Signed-off-by: Xin Long 
---
 net/sctp/input.c | 4 ++--
 net/sctp/sm_sideeffect.c | 4 ++--
 net/sctp/transport.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/net/sctp/input.c b/net/sctp/input.c
index 55d4fc6..d508f6f 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -449,7 +449,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
else {
if (!mod_timer(&t->proto_unreach_timer,
jiffies + (HZ/20)))
-   sctp_association_hold(asoc);
+   sctp_transport_hold(t);
}
} else {
struct net *net = sock_net(sk);
@@ -458,7 +458,7 @@ void sctp_icmp_proto_unreachable(struct sock *sk,
 "encountered!\n", __func__);
 
if (del_timer(&t->proto_unreach_timer))
-   sctp_association_put(asoc);
+   sctp_transport_put(t);
 
sctp_do_sm(net, SCTP_EVENT_T_OTHER,
   SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH),
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 813d307..0948f14 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -419,7 +419,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t)
/* Try again later.  */
if (!mod_timer(&transport->proto_unreach_timer,
jiffies + (HZ/20)))
-   sctp_association_hold(asoc);
+   sctp_transport_hold(transport);
goto out_unlock;
}
 
@@ -435,7 +435,7 @@ void sctp_generate_proto_unreach_event(struct timer_list *t)
 
 out_unlock:
bh_unlock_sock(sk);
-   sctp_association_put(asoc);
+   sctp_transport_put(transport);
 }
 
  /* Handle the timeout of the RE-CONFIG timer. */
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index 806af58..60fcf31 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -133,7 +133,7 @@ void sctp_transport_free(struct sctp_transport *transport)
 
/* Delete the ICMP proto unreachable timer if it's active. */
if (del_timer(&transport->proto_unreach_timer))
-   sctp_association_put(transport->asoc);
+   sctp_transport_put(transport);
 
sctp_transport_put(transport);
 }
-- 
2.1.0
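
The rule this fix follows is general: a pending timer must hold a reference
on the object its callback dereferences, not on some related object with a
possibly shorter lifetime. Below is a generic sketch of the pattern;
obj_get()/obj_put() are placeholder refcount helpers, not SCTP code:

struct obj {
	refcount_t refcnt;
	struct timer_list timer;
};

static void obj_timeout(struct timer_list *t)
{
	struct obj *o = from_timer(o, t, timer);

	/* ... work that dereferences o ... */
	obj_put(o);	/* drop the reference the armed timer was holding */
}

static void obj_arm(struct obj *o)
{
	/* mod_timer() returns 0 when the timer was not already pending,
	 * so exactly one reference is taken per armed timer.
	 */
	if (!mod_timer(&o->timer, jiffies + HZ / 20))
		obj_get(o);
}

static void obj_teardown(struct obj *o)
{
	/* del_timer() returns true only when it cancels a pending timer;
	 * the callback will then never run, so drop its reference here.
	 */
	if (del_timer(&o->timer))
		obj_put(o);
}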



Re: [PATCH net] sctp: change to hold/put transport for proto_unreach_timer

2020-11-13 Thread Xin Long
On Fri, Nov 13, 2020 at 8:35 PM Marcelo Ricardo Leitner
 wrote:
>
> Hi,
>
> On Fri, Nov 13, 2020 at 05:18:24PM +0800, Xin Long wrote:
> ...
> > diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
> > index 813d307..0a51150 100644
> > --- a/net/sctp/sm_sideeffect.c
> > +++ b/net/sctp/sm_sideeffect.c
> > @@ -419,7 +419,7 @@ void sctp_generate_proto_unreach_event(struct 
> > timer_list *t)
> >   /* Try again later.  */
> >   if (!mod_timer(&transport->proto_unreach_timer,
> >   jiffies + (HZ/20)))
> > - sctp_association_hold(asoc);
> > + sctp_transport_hold(transport);
> >   goto out_unlock;
> >   }
> >
>
> The chunk above covers the socket busy case, but for the normal cases
> it also needs:
>
> @@ -435,7 +435,7 @@ void sctp_generate_proto_unreach_event(struct timer_list 
> *t)
>
>  out_unlock:
> bh_unlock_sock(sk);
> -   sctp_association_put(asoc);
> +   sctp_transport_put(asoc);
>  }
yeah, right, posted v2.

Thanks.
>
>   /* Handle the timeout of the RE-CONFIG timer. */
>
> > diff --git a/net/sctp/transport.c b/net/sctp/transport.c
> > index 806af58..60fcf31 100644
> > --- a/net/sctp/transport.c
> > +++ b/net/sctp/transport.c
> > @@ -133,7 +133,7 @@ void sctp_transport_free(struct sctp_transport 
> > *transport)
> >
> >   /* Delete the ICMP proto unreachable timer if it's active. */
> >   if (del_timer(&transport->proto_unreach_timer))
> > - sctp_association_put(transport->asoc);
> > + sctp_transport_put(transport);
> >
> >   sctp_transport_put(transport);
>
> Btw, quite noticeable on the above list of timers that only this timer
> was using a reference on the asoc. Seems we're good now, then. :-)
>
correct! :D


[PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-16 Thread Xin Long
This patch is to let it always do CRC checksum in sctp_gso_segment()
by removing CRC flag from the dev features in gre_gso_segment() for
SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.

It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
after that commit, so it would do checksum with gso_make_checksum()
in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
gre csum for sctp over gre tunnels") can be reverted now.

Signed-off-by: Xin Long 
---
 net/ipv4/gre_offload.c | 14 +++---
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e0a2465..a5935d4 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
   netdev_features_t features)
 {
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-   bool need_csum, need_recompute_csum, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
+   bool need_csum, gso_partial;
 
if (!skb->encapsulation)
goto out;
@@ -41,10 +41,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb->protocol = skb->inner_protocol;
 
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
-   need_recompute_csum = skb->csum_not_inet;
skb->encap_hdr_csum = need_csum;
 
features &= skb->dev->hw_enc_features;
+   features &= ~NETIF_F_SCTP_CRC;
 
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
@@ -99,15 +99,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
 
*(pcsum + 1) = 0;
-   if (need_recompute_csum && !skb_is_gso(skb)) {
-   __wsum csum;
-
-   csum = skb_checksum(skb, gre_offset,
-   skb->len - gre_offset, 0);
-   *pcsum = csum_fold(csum);
-   } else {
-   *pcsum = gso_make_checksum(skb, 0);
-   }
+   *pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
 out:
return segs;
-- 
2.1.0



Re: [PATCH net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2020-11-17 Thread Xin Long
On Wed, Nov 18, 2020 at 8:29 AM Jakub Kicinski  wrote:
>
> On Mon, 16 Nov 2020 17:15:47 +0800 Xin Long wrote:
> > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > by removing CRC flag from the dev features in gre_gso_segment() for
> > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> >
> > It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
> > after that commit, so it would do checksum with gso_make_checksum()
> > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > gre csum for sctp over gre tunnels") can be reverted now.
> >
> > Signed-off-by: Xin Long 
>
> Makes sense, but does GRE tunnels don't always have a csum.
Do you mean the GRE csum can be offloaded? If so, it seems for GRE tunnels
we need something similar to:

commit 4bcb877d257c87298aedead1ffeaba0d5df1991d
Author: Tom Herbert 
Date:   Tue Nov 4 09:06:52 2014 -0800

udp: Offload outer UDP tunnel csum if available

I will confirm and implement it in another patch.

>
> Is the current hardware not capable of generating CRC csums over
> encapsulated patches at all?
There is, but it's very rare. The thing is, after doing the CRC csum, the
outer GRE/UDP checksum has to be recomputed, as it is NOT zero once all
fields covered by the CRC csum are summed, which differs from the common
checksum. So if it's a GRE/UDP tunnel, the inner CRC csum has to be done
there even if the HW supports its offload.

>
> I guess UDP tunnels can be configured without the csums as well
> so the situation isn't much different.
>
> > diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> > index e0a2465..a5935d4 100644
> > --- a/net/ipv4/gre_offload.c
> > +++ b/net/ipv4/gre_offload.c
> > @@ -15,12 +15,12 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> >  netdev_features_t features)
> >  {
> >   int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
> > - bool need_csum, need_recompute_csum, gso_partial;
> >   struct sk_buff *segs = ERR_PTR(-EINVAL);
> >   u16 mac_offset = skb->mac_header;
> >   __be16 protocol = skb->protocol;
> >   u16 mac_len = skb->mac_len;
> >   int gre_offset, outer_hlen;
> > + bool need_csum, gso_partial;
>
> Nit, rev xmas tree looks broken now.
Will fix it in v2, :D

Thanks.
>
> >   if (!skb->encapsulation)
> >   goto out;
> > @@ -41,10 +41,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> >   skb->protocol = skb->inner_protocol;
> >
> >   need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
> > - need_recompute_csum = skb->csum_not_inet;
> >   skb->encap_hdr_csum = need_csum;
> >
> >   features &= skb->dev->hw_enc_features;
> > + features &= ~NETIF_F_SCTP_CRC;
> >
> >   /* segment inner packet. */
> >   segs = skb_mac_gso_segment(skb, features);
> > @@ -99,15 +99,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> >   }
> >
> >   *(pcsum + 1) = 0;
> > - if (need_recompute_csum && !skb_is_gso(skb)) {
> > - __wsum csum;
> > -
> > - csum = skb_checksum(skb, gre_offset,
> > - skb->len - gre_offset, 0);
> > - *pcsum = csum_fold(csum);
> > - } else {
> > - *pcsum = gso_make_checksum(skb, 0);
> > - }
> > + *pcsum = gso_make_checksum(skb, 0);
> >   } while ((skb = skb->next));
> >  out:
> >   return segs;
>


[PATCH net-next 0/2] net: enable udp v6 sockets receiving v4 packets with UDP GRO

2021-01-11 Thread Xin Long
Currently, a udp v6 socket can not process v4 packets with UDP GRO, as
udp_encap_needed_key is not increased when udp_tunnel_encap_enable()
is called for a v6 socket. This patchset increases it and removes
the unnecessary code in bareudp.

Xin Long (2):
  udp: call udp_encap_enable for v6 sockets when enabling encap
  Revert "bareudp: Fixed bareudp receive handling"

 drivers/net/bareudp.c| 6 --
 include/net/udp_tunnel.h | 3 +--
 net/ipv6/udp.c   | 4 +++-
 3 files changed, 4 insertions(+), 9 deletions(-)

-- 
2.1.0



[PATCH net-next 1/2] udp: call udp_encap_enable for v6 sockets when enabling encap

2021-01-11 Thread Xin Long
When enabling encap for an ipv6 socket without udp_encap_needed_key
increased, UDP GRO won't work for v4-mapped v6 address packets, as
sk will be NULL in udp4_gro_receive().

This patch enables it by increasing udp_encap_needed_key for v6
sockets in udp_tunnel_encap_enable(), and correspondingly decreasing
udp_encap_needed_key in udpv6_destroy_sock().

Reported-by: Chen Yi 
Signed-off-by: Xin Long 
---
 include/net/udp_tunnel.h | 3 +--
 net/ipv6/udp.c   | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 282d10e..afc7ce7 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -181,9 +181,8 @@ static inline void udp_tunnel_encap_enable(struct socket 
*sock)
 #if IS_ENABLED(CONFIG_IPV6)
if (sock->sk->sk_family == PF_INET6)
ipv6_stub->udpv6_encap_enable();
-   else
 #endif
-   udp_encap_enable();
+   udp_encap_enable();
 }
 
 #define UDP_TUNNEL_NIC_MAX_TABLES  4
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index b9f3dfd..265b6a0 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1608,8 +1608,10 @@ void udpv6_destroy_sock(struct sock *sk)
if (encap_destroy)
encap_destroy(sk);
}
-   if (up->encap_enabled)
+   if (up->encap_enabled) {
static_branch_dec(&udpv6_encap_needed_key);
+   static_branch_dec(&udp_encap_needed_key);
+   }
}
 
inet6_destroy_sock(sk);
-- 
2.1.0



[PATCH net-next 2/2] Revert "bareudp: Fixed bareudp receive handling"

2021-01-11 Thread Xin Long
Since the last patch, udp_encap_enable() is already called in
udp_tunnel_encap_enable(), so the explicit call here is no longer needed.
Remove it by reverting commit 81f954a44567567c7d74a97b1db78fb43afc253d.
---
 drivers/net/bareudp.c | 6 --
 1 file changed, 6 deletions(-)

diff --git a/drivers/net/bareudp.c b/drivers/net/bareudp.c
index 85de5f9..aed5049 100644
--- a/drivers/net/bareudp.c
+++ b/drivers/net/bareudp.c
@@ -240,12 +240,6 @@ static int bareudp_socket_create(struct bareudp_dev 
*bareudp, __be16 port)
tunnel_cfg.encap_destroy = NULL;
setup_udp_tunnel_sock(bareudp->net, sock, &tunnel_cfg);
 
-   /* As the setup_udp_tunnel_sock does not call udp_encap_enable if the
-* socket type is v6 an explicit call to udp_encap_enable is needed.
-*/
-   if (sock->sk->sk_family == AF_INET6)
-   udp_encap_enable();
-
rcu_assign_pointer(bareudp->sock, sock);
return 0;
 }
-- 
2.1.0



[PATCH net-next 1/2] net: move the hsize check to the else block in skb_segment

2021-01-11 Thread Xin Long
After commit 89319d3801d1 ("net: Add frag_list support to skb_segment"),
skb_segment() goes on to process the frag_list when !hsize. However, when
the skb frag_list is used, sg normally should not be set. In that case,
hsize is set to len right before the !hsize check, so the frag_list
processing code is never reached.

So the right thing to do is move the hsize check into the else block, so
that it doesn't affect the !hsize check for frag_list processing.

Signed-off-by: Xin Long 
---
 net/core/skbuff.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 7626a33..ea79359 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3855,8 +3855,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
hsize = skb_headlen(head_skb) - offset;
if (hsize < 0)
hsize = 0;
-   if (hsize > len || !sg)
-   hsize = len;
 
if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
(skb_headlen(list_skb) == len || sg)) {
@@ -3901,6 +3899,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
skb_release_head_state(nskb);
__skb_push(nskb, doffset);
} else {
+   if (hsize > len || !sg)
+   hsize = len;
+
nskb = __alloc_skb(hsize + doffset + headroom,
   GFP_ATOMIC, 
skb_alloc_rx_flag(head_skb),
   NUMA_NO_NODE);
-- 
2.1.0



[PATCH net-next 0/2] net: fix the features flag in sctp_gso_segment

2021-01-11 Thread Xin Long
Patch 1/2 is to improve the code in skb_segment(), and it is needed
by Patch 2/2.

Xin Long (2):
  net: move the hsize check to the else block in skb_segment
  sctp: remove the NETIF_F_SG flag before calling skb_segment

 net/core/skbuff.c  | 5 +++--
 net/sctp/offload.c | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

-- 
2.1.0



[PATCH net-next 2/2] sctp: remove the NETIF_F_SG flag before calling skb_segment

2021-01-11 Thread Xin Long
It makes more sense to clear NETIF_F_SG instead of setting it when
calling skb_segment() in sctp_gso_segment(), since SCTP GSO uses
head_skb's fraglist, all of whose frags are linear skbs.

This makes the SCTP GSO code easier to understand.

Suggested-by: Alexander Duyck 
Signed-off-by: Xin Long 
---
 net/sctp/offload.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index ce281a9..eb874e3 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -68,7 +68,7 @@ static struct sk_buff *sctp_gso_segment(struct sk_buff *skb,
goto out;
}
 
-   segs = skb_segment(skb, features | NETIF_F_HW_CSUM | NETIF_F_SG);
+   segs = skb_segment(skb, (features | NETIF_F_HW_CSUM) & ~NETIF_F_SG);
if (IS_ERR(segs))
goto out;
 
-- 
2.1.0



[PATCHv2 net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2021-01-11 Thread Xin Long
This patch is to let it always do CRC checksum in sctp_gso_segment()
by removing CRC flag from the dev features in gre_gso_segment() for
SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.

It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
after that commit, so it would do checksum with gso_make_checksum()
in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
gre csum for sctp over gre tunnels") can be reverted now.

Note that current HW like the igb NIC can only handle the SCTP CRC
when it's in the outer packet, not in the inner packet as in this
case, so the CRC flag is removed from the dev features even when
need_csum is false.

v1->v2:
  - improve the changelog.
  - fix "rev xmas tree" in variables declaration.

Signed-off-by: Xin Long 
---
 net/ipv4/gre_offload.c | 15 ---
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index e0a2465..a681306 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -15,10 +15,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
   netdev_features_t features)
 {
int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
-   bool need_csum, need_recompute_csum, gso_partial;
struct sk_buff *segs = ERR_PTR(-EINVAL);
u16 mac_offset = skb->mac_header;
__be16 protocol = skb->protocol;
+   bool need_csum, gso_partial;
u16 mac_len = skb->mac_len;
int gre_offset, outer_hlen;
 
@@ -41,10 +41,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
skb->protocol = skb->inner_protocol;
 
need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
-   need_recompute_csum = skb->csum_not_inet;
skb->encap_hdr_csum = need_csum;
 
features &= skb->dev->hw_enc_features;
+   /* CRC checksum can't be handled by HW when SCTP is the inner proto. */
+   features &= ~NETIF_F_SCTP_CRC;
 
/* segment inner packet. */
segs = skb_mac_gso_segment(skb, features);
@@ -99,15 +100,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
}
 
*(pcsum + 1) = 0;
-   if (need_recompute_csum && !skb_is_gso(skb)) {
-   __wsum csum;
-
-   csum = skb_checksum(skb, gre_offset,
-   skb->len - gre_offset, 0);
-   *pcsum = csum_fold(csum);
-   } else {
-   *pcsum = gso_make_checksum(skb, 0);
-   }
+   *pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
 out:
return segs;
-- 
2.1.0



Re: [PATCHv2 net-next] ip_gre: remove CRC flag from dev features in gre_gso_segment

2021-01-11 Thread Xin Long
On Tue, Jan 12, 2021 at 12:48 AM Alexander Duyck
 wrote:
>
> On Mon, Jan 11, 2021 at 5:22 AM Xin Long  wrote:
> >
> > This patch is to let it always do CRC checksum in sctp_gso_segment()
> > by removing CRC flag from the dev features in gre_gso_segment() for
> > SCTP over GRE, just as it does in Commit 527beb8ef9c0 ("udp: support
> > sctp over udp in skb_udp_tunnel_segment") for SCTP over UDP.
> >
> > It could set csum/csum_start in GSO CB properly in sctp_gso_segment()
> > after that commit, so it would do checksum with gso_make_checksum()
> > in gre_gso_segment(), and Commit 622e32b7d4a6 ("net: gre: recompute
> > gre csum for sctp over gre tunnels") can be reverted now.
> >
> > Note that the current HWs like igb NIC can only handle the SCTP CRC
> > when it's in the outer packet, not in the inner packet like in this
> > case, so here it removes CRC flag from the dev features even when
> > need_csum is false.
>
> So the limitation in igb is not the hardware but the driver
> configuration. When I had coded things up I put in a limitation on the
> igb_tx_csum code that it would have to validate that the protocol we
> are requesting an SCTP CRC offload since it is a different calculation
> than a 1's complement checksum. Since igb doesn't support tunnels we
> limited that check to the outer headers.
Ah.. I see, thanks.
>
> We could probably enable this for tunnels as long as the tunnel isn't
> requesting an outer checksum offload from the driver.
I think in igb_tx_csum(), checking skb->csum_not_inet would be enough
to validate that it is an SCTP request:
-   if (((first->protocol == htons(ETH_P_IP)) &&
-(ip_hdr(skb)->protocol == IPPROTO_SCTP)) ||
-   ((first->protocol == htons(ETH_P_IPV6)) &&
-igb_ipv6_csum_is_sctp(skb))) {
+   if (skb->csum_not_inet) {
type_tucmd = E1000_ADVTXD_TUCMD_L4T_SCTP;
break;
}

Otherwise, we would need to parse the packet a little, as is done in
hns3_get_l4_protocol().

>
> > v1->v2:
> >   - improve the changelog.
> >   - fix "rev xmas tree" in varibles declaration.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  net/ipv4/gre_offload.c | 15 ---
> >  1 file changed, 4 insertions(+), 11 deletions(-)
> >
> > diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
> > index e0a2465..a681306 100644
> > --- a/net/ipv4/gre_offload.c
> > +++ b/net/ipv4/gre_offload.c
> > @@ -15,10 +15,10 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> >netdev_features_t features)
> >  {
> > int tnl_hlen = skb_inner_mac_header(skb) - 
> > skb_transport_header(skb);
> > -   bool need_csum, need_recompute_csum, gso_partial;
> > struct sk_buff *segs = ERR_PTR(-EINVAL);
> > u16 mac_offset = skb->mac_header;
> > __be16 protocol = skb->protocol;
> > +   bool need_csum, gso_partial;
> > u16 mac_len = skb->mac_len;
> > int gre_offset, outer_hlen;
> >
> > @@ -41,10 +41,11 @@ static struct sk_buff *gre_gso_segment(struct sk_buff 
> > *skb,
> > skb->protocol = skb->inner_protocol;
> >
> > need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
> > -   need_recompute_csum = skb->csum_not_inet;
> > skb->encap_hdr_csum = need_csum;
> >
> > features &= skb->dev->hw_enc_features;
> > +   /* CRC checksum can't be handled by HW when SCTP is the inner 
> > proto. */
> > +   features &= ~NETIF_F_SCTP_CRC;
> >
> > /* segment inner packet. */
> > segs = skb_mac_gso_segment(skb, features);
>
> Do we have NICs that are advertising NETIF_S_SCTP_CRC as part of their
> hw_enc_features and then not supporting it? Based on your comment
Yes, igb/igbvf/igc/ixgbe/ixgbevf all have similar SCTP proto
validation code.

> above it seems like you are masking this out because hardware is
> advertising features it doesn't actually support. I'm just wondering
> if that is the case or if this is something where this should be
> cleared if need_csum is set since we only support one level of
> checksum offload.
Since only these drivers do SCTP proto validation, and the "only one
level of checksum offload" issue only exists when the inner packet is
an SCTP packet, clearing NETIF_F_SCTP_CRC should be enough.

But seems to fix the drivers will be better, as hw_enc_featu

Re: [PATCH net-next 1/2] net: move the hsize check to the else block in skb_segment

2021-01-11 Thread Xin Long
On Tue, Jan 12, 2021 at 12:26 AM Alexander Duyck
 wrote:
>
> On Mon, Jan 11, 2021 at 4:45 AM Xin Long  wrote:
> >
> > After commit 89319d3801d1 ("net: Add frag_list support to skb_segment"),
> > it goes to process frag_list when !hsize in skb_segment(). However, when
> > using skb frag_list, sg normally should not be set. In this case, hsize
> > will be set with len right before !hsize check, then it won't go to
> > frag_list processing code.
> >
> > So the right thing to do is move the hsize check to the else block, so
> > that it won't affect the !hsize check for frag_list processing.
> >
> > Signed-off-by: Xin Long 
> > ---
> >  net/core/skbuff.c | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> >
> > diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> > index 7626a33..ea79359 100644
> > --- a/net/core/skbuff.c
> > +++ b/net/core/skbuff.c
> > @@ -3855,8 +3855,6 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
> > hsize = skb_headlen(head_skb) - offset;
> > if (hsize < 0)
> > hsize = 0;
> > -   if (hsize > len || !sg)
> > -   hsize = len;
> >
> > if (!hsize && i >= nfrags && skb_headlen(list_skb) &&
> > (skb_headlen(list_skb) == len || sg)) {
>
> So looking at the function it seems like the only spot where the
> standard path actually reads the hsize value is right here, and it is
> overwritten before we exit the non-error portion of the if statement.
> I wonder if we couldn't save ourselves a few cycles and avoid an
> unnecessary assignment by replacing the "!hsize" with a check for
> "hsize <= 0" and just move the entire set of checks above down into
> the lower block.
>
> > @@ -3901,6 +3899,9 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
> > skb_release_head_state(nskb);
> > __skb_push(nskb, doffset);
> > } else {
> > +   if (hsize > len || !sg)
> > +   hsize = len;
> > +
>
> Then you could essentially just add the "if (hsize < 0)" piece here as
> an "else if" check and avoid the check if it isn't needed.
Looks correct, will post v2. Thanks!

>
> > nskb = __alloc_skb(hsize + doffset + headroom,
> >GFP_ATOMIC, 
> > skb_alloc_rx_flag(head_skb),
> >NUMA_NO_NODE);
> > --
> > 2.1.0
> >
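
A sketch of the shape v2 could take per the suggestion above, with both hsize
adjustments folded into the allocation branch (only the hsize handling is
shown; this illustrates the proposal, not necessarily the exact merged code):

		hsize = skb_headlen(head_skb) - offset;

		if (hsize <= 0 && i >= nfrags && skb_headlen(list_skb) &&
		    (skb_headlen(list_skb) == len || sg)) {
			/* ... frag_list processing path ... */
		} else {
			if (hsize > len || !sg)
				hsize = len;
			else if (hsize < 0)
				hsize = 0;

			nskb = __alloc_skb(hsize + doffset + headroom,
					   GFP_ATOMIC,
					   skb_alloc_rx_flag(head_skb),
					   NUMA_NO_NODE);
			/* ... */
		}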


Re: [PATCH net-next 1/2] udp: call udp_encap_enable for v6 sockets when enabling encap

2021-01-11 Thread Xin Long
On Tue, Jan 12, 2021 at 2:02 AM kernel test robot  wrote:
>
> Hi Xin,
>
> Thank you for the patch! Yet something to improve:
>
> [auto build test ERROR on net-next/master]
>
> url:
> https://github.com/0day-ci/linux/commits/Xin-Long/net-enable-udp-v6-sockets-receiving-v4-packets-with-UDP-GRO/20210111-205115
> base:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
> 73b7a6047971aa6ce4a70fc4901964d14f077171
> config: m68k-defconfig (attached as .config)
> compiler: m68k-linux-gcc (GCC) 9.3.0
> reproduce (this is a W=1 build):
> wget 
> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
> ~/bin/make.cross
> chmod +x ~/bin/make.cross
> # 
> https://github.com/0day-ci/linux/commit/62229592b4c3e929eeafea82e758dacb2953fbde
> git remote add linux-review https://github.com/0day-ci/linux
> git fetch --no-tags linux-review 
> Xin-Long/net-enable-udp-v6-sockets-receiving-v4-packets-with-UDP-GRO/20210111-205115
> git checkout 62229592b4c3e929eeafea82e758dacb2953fbde
> # save the attached .config to linux build tree
> COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross 
> ARCH=m68k
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot 
>
> All errors (new ones prefixed by >>, old ones prefixed by <<):
>
> >> ERROR: modpost: "udp_encap_needed_key" [net/ipv6/ipv6.ko] undefined!
I will add udp_encap_disable() and export it, thanks.
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
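
A minimal form of that helper, as a sketch only (assuming it sits next to
udp_encap_enable() in net/ipv4/udp.c and is exported so ipv6.ko can use it):

void udp_encap_disable(void)
{
	static_branch_dec(&udp_encap_needed_key);
}
EXPORT_SYMBOL(udp_encap_disable);

udpv6_destroy_sock() would then call udp_encap_disable() instead of touching
udp_encap_needed_key directly, which avoids the modpost error above.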


Re: [Patch net] cls_flower: call nla_ok() before nla_next()

2021-01-12 Thread Xin Long
On Tue, Jan 12, 2021 at 10:56 AM Cong Wang  wrote:
>
> From: Cong Wang 
>
> fl_set_enc_opt() simply checks if there are still bytes left to parse,
> but this is not sufficent as syzbot seems to be able to generate
> malformatted netlink messages. nla_ok() is more strict so should be
> used to validate the next nlattr here.
>
> And nla_validate_nested_deprecated() has less strict check too, it is
> probably too late to switch to the strict version, but we can just
> call nla_ok() too after it.
>
> Reported-and-tested-by: syzbot+2624e3778b18fc497...@syzkaller.appspotmail.com
> Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options")
> Fixes: 79b1011cb33d ("net: sched: allow flower to match erspan options")
> Cc: Pieter Jansen van Vuuren 
> Cc: Jamal Hadi Salim 
> Cc: Xin Long 
> Cc: Jiri Pirko 
> Signed-off-by: Cong Wang 
> ---
>  net/sched/cls_flower.c | 8 +---
>  1 file changed, 5 insertions(+), 3 deletions(-)
>
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index 1319986693fc..e265c443536e 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -1272,6 +1272,8 @@ static int fl_set_enc_opt(struct nlattr **tb, struct 
> fl_flow_key *key,
>
> nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
> msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]);
> +   if (!nla_ok(nla_opt_msk, msk_depth))
> +   return -EINVAL;
> }
I think it's better to also add  NL_SET_ERR_MSG(extack, );
for this error return, like all the other places in this function.
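
Something like this, for example (the message text is only a placeholder):

	if (!nla_ok(nla_opt_msk, msk_depth)) {
		NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks");
		return -EINVAL;
	}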

>
> nla_for_each_attr(nla_opt_key, nla_enc_key,
> @@ -1308,7 +1310,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct 
> fl_flow_key *key,
> return -EINVAL;
> }
>
> -   if (msk_depth)
> +   if (nla_ok(nla_opt_msk, msk_depth))
> nla_opt_msk = nla_next(nla_opt_msk, 
> &msk_depth);
> break;
> case TCA_FLOWER_KEY_ENC_OPTS_VXLAN:
> @@ -1341,7 +1343,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct 
> fl_flow_key *key,
> return -EINVAL;
> }
>
> -   if (msk_depth)
> +   if (nla_ok(nla_opt_msk, msk_depth))
> nla_opt_msk = nla_next(nla_opt_msk, 
> &msk_depth);
> break;
> case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN:
> @@ -1374,7 +1376,7 @@ static int fl_set_enc_opt(struct nlattr **tb, struct 
> fl_flow_key *key,
> return -EINVAL;
> }
>
> -   if (msk_depth)
> +   if (nla_ok(nla_opt_msk, msk_depth))
> nla_opt_msk = nla_next(nla_opt_msk, 
> &msk_depth);
> break;
> default:
> --
> 2.25.1
>


Re: [PATCHv3 net-next 16/16] sctp: enable udp tunneling socks

2020-10-19 Thread Xin Long
On Fri, Oct 16, 2020 at 3:08 PM Xin Long  wrote:
>
> On Fri, Oct 16, 2020 at 1:42 AM Marcelo Ricardo Leitner
>  wrote:
> >
> > Actually..
> >
> > On Tue, Oct 13, 2020 at 03:27:41PM +0800, Xin Long wrote:
> > ...
> > > Also add sysctl udp_port to allow changing the listening
> > > sock's port by users.
> > ...
> > > ---
> > >  net/sctp/protocol.c |  5 +
> > >  net/sctp/sysctl.c   | 50 
> > > ++
> > >  2 files changed, 55 insertions(+)
> >
> > Xin, sorry for not noticing this earlier, but we need a documentation
> > update here for this new sysctl. This is important. Please add its
> > entry in ip-sysctl.rst.
> no problem, I will add it.
>
> >
> > >
> > > diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
> > > index be002b7..79fb4b5 100644
> > > --- a/net/sctp/protocol.c
> > > +++ b/net/sctp/protocol.c
> > > @@ -1469,6 +1469,10 @@ static int __net_init sctp_ctrlsock_init(struct 
> > > net *net)
> > >   if (status)
> > >   pr_err("Failed to initialize the SCTP control sock\n");
> > >
> > > + status = sctp_udp_sock_start(net);
> > > + if (status)
> > > + pr_err("Failed to initialize the SCTP udp tunneling 
> > > sock\n");
> >   ^^^ upper case please.
> > Nit. There are other occurrences of this.
> You mean we can remove this log, as it's been handled well in
> sctp_udp_sock_start()?
This one is actually OK :D
I've updated 'udp' with 'UDP' for all code annotations in this patchset.

will post v4.

Thanks.


>
> >
> > > +
> > >   return status;
> > ...
> > > + ret = proc_dointvec(&tbl, write, buffer, lenp, ppos);
> > > + if (write && ret == 0) {
> > > + struct sock *sk = net->sctp.ctl_sock;
> > > +
> > > + if (new_value > max || new_value < min)
> > > + return -EINVAL;
> > > +
> > > + net->sctp.udp_port = new_value;
> > > + sctp_udp_sock_stop(net);
> >
> > So, if it would be disabling the encapsulation, it shouldn't be
> > calling _start() then, right? Like
> >
> > if (new_value)
> > ret = sctp_udp_sock_start(net);
> >
> > Otherwise _start() here will call ..._bind() with port 0, which then
> > will be a random one.
> right, somehow I thought it wouldn't bind with port 0.
>
> Thanks.
> >
> > > + ret = sctp_udp_sock_start(net);
> > > + if (ret)
> > > + net->sctp.udp_port = 0;
> > > +
> > > + /* Update the value in the control socket */
> > > + lock_sock(sk);
> > > + sctp_sk(sk)->udp_port = htons(net->sctp.udp_port);
> > > + release_sock(sk);
> > > + }
> > > +
> > > + return ret;
> > > +}
> > > +
> > >  int sctp_sysctl_net_register(struct net *net)
> > >  {
> > >   struct ctl_table *table;
> > > --
> > > 2.1.0
> > >


[PATCHv4 net-next 01/16] udp: check udp sock encap_type in __udp_lib_err

2020-10-19 Thread Xin Long
There is a chance that __udp4/6_lib_lookup() returns a udp encap
sock in __udp_lib_err(), e.g. when the udp encap listening sock
uses the same port as the remote encap port, in which case it
should go to __udp4/6_lib_err_encap() for more validation before
processing the icmp packet.

This patch is to check encap_type in __udp_lib_err() to do the
further validation for an encap sock.

Signed-off-by: Xin Long 
---
 net/ipv4/udp.c | 2 +-
 net/ipv6/udp.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 09f0a23..ca04a8a 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -702,7 +702,7 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct 
udp_table *udptable)
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
   iph->saddr, uh->source, skb->dev->ifindex,
   inet_sdif(skb), udptable, NULL);
-   if (!sk) {
+   if (!sk || udp_sk(sk)->encap_type) {
/* No socket for error: try tunnels before discarding */
sk = ERR_PTR(-ENOENT);
if (static_branch_unlikely(&udp_encap_needed_key)) {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 29d9691..cde9b88 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -560,7 +560,7 @@ int __udp6_lib_err(struct sk_buff *skb, struct 
inet6_skb_parm *opt,
 
sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
   inet6_iif(skb), inet6_sdif(skb), udptable, NULL);
-   if (!sk) {
+   if (!sk || udp_sk(sk)->encap_type) {
/* No socket for error: try tunnels before discarding */
sk = ERR_PTR(-ENOENT);
if (static_branch_unlikely(&udpv6_encap_needed_key)) {
-- 
2.1.0



[PATCHv4 net-next 00/16] sctp: Implement RFC6951: UDP Encapsulation of SCTP

2020-10-19 Thread Xin Long
From the RFC:

   The Main Reasons:

   o  To allow SCTP traffic to pass through legacy NATs, which do not
  provide native SCTP support as specified in [BEHAVE] and
  [NATSUPP].

   o  To allow SCTP to be implemented on hosts that do not provide
  direct access to the IP layer.  In particular, applications can
  use their own SCTP implementation if the operating system does not
  provide one.

   Implementation Notes:

   UDP-encapsulated SCTP is normally communicated between SCTP stacks
   using the IANA-assigned UDP port number 9899 (sctp-tunneling) on both
   ends.  There are circumstances where other ports may be used on
   either end, and it might be required to use ports other than the
   registered port.

   Each SCTP stack uses a single local UDP encapsulation port number as
   the destination port for all its incoming SCTP packets, this greatly
   simplifies implementation design.

   An SCTP implementation supporting UDP encapsulation MUST maintain a
   remote UDP encapsulation port number per destination address for each
   SCTP association.  Again, because the remote stack may be using ports
   other than the well-known port, each port may be different from each
   stack.  However, because of remapping of ports by NATs, the remote
   ports associated with different remote IP addresses may not be
   identical, even if they are associated with the same stack.

   Because the well-known port might not be used, implementations need
   to allow other port numbers to be specified as a local or remote UDP
   encapsulation port number through APIs.

Patches:

   This patchset is using the udp4/6 tunnel APIs to implement the UDP
   Encapsulation of SCTP with not much change in the SCTP protocol stack
   and with all current SCTP features kept in the Linux kernel.

   1 - 3: Fix some UDP issues that may be triggered by SCTP over UDP.
   4 - 6: Process incoming UDP encapsulated packets and ICMP packets.
   7 - 9: Remote encap port's update by sysctl, sockopt and packets.
   10-13: Process outgoing packets with UDP encapsulated and its GSO.
   14-15: Add the part from draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.
      16: Enable this feature.

Tests:

  - lksctp-tools/src/func_tests with UDP Encapsulation enabled/disabled:

  Both make v4test and v6test passed.

  - sctp-tests with UDP Encapsulation enabled/disabled:

  repeatability/procdumps/sctpdiag/gsomtuchange/extoverflow/
  sctphashtable passed. Others failed as expected due to those
  "iptables -p sctp" rules.

  - netperf on lo/netns/virtio_net, with gso enabled/disabled and
with ip_checksum enabled/disabled, with UDP Encapsulation
enabled/disabled:

  No clear performance drop.

v1->v2:
  - Fix some incorrect code in the patches 5,6,8,10,11,13,14,17, suggested
by Marcelo.
  - Append two patches 15-16 to add the Additional Considerations for UDP
Encapsulation of SCTP from draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.
v2->v3:
  - remove the cleanup code in patch 2, suggested by Willem.
  - remove the patch 3 and fix the checksum in the new patch 3 after
talking with Paolo, Marcelo and Guillaume.
  - add 'select NET_UDP_TUNNEL' in patch 4 to solve a compiling error.
  - fix __be16 type cast warning in patch 8.
  - fix the wrong endian orders when setting values in 14,16.
v3->v4:
  - add entries in ip-sysctl.rst in patch 7,16, as Marcelo Suggested.
  - not create udp socks when udp_port is set to 0 in patch 16, as
Marcelo noticed.

Xin Long (16):
  udp: check udp sock encap_type in __udp_lib_err
  udp6: move the mss check after udp gso tunnel processing
  udp: support sctp over udp in skb_udp_tunnel_segment
  sctp: create udp4 sock and add its encap_rcv
  sctp: create udp6 sock and set its encap_rcv
  sctp: add encap_err_lookup for udp encap socks
  sctp: add encap_port for netns sock asoc and transport
  sctp: add SCTP_REMOTE_UDP_ENCAPS_PORT sockopt
  sctp: allow changing transport encap_port by peer packets
  sctp: add udphdr to overhead when udp_port is set
  sctp: call sk_setup_caps in sctp_packet_transmit instead
  sctp: support for sending packet over udp4 sock
  sctp: support for sending packet over udp6 sock
  sctp: add the error cause for new encapsulation port restart
  sctp: handle the init chunk matching an existing asoc
  sctp: enable udp tunneling socks

 Documentation/networking/ip-sysctl.rst |  15 
 include/linux/sctp.h   |  20 +
 include/net/netns/sctp.h   |   8 ++
 include/net/sctp/constants.h   |   2 +
 include/net/sctp/sctp.h|   9 +-
 include/net/sctp/sm.h  |   4 +
 include/net/sctp/structs.h |  14 ++--
 include/uapi/linux/sctp.h  |   7 ++
 net/ipv4/udp.c |   2 +-
 net/ipv4/udp_offload.c |   3 +
 net/ipv6/udp.c |   2 +-
 net/ipv6/udp_offload.c 

[PATCHv4 net-next 04/16] sctp: create udp4 sock and add its encap_rcv

2020-10-19 Thread Xin Long
This patch is to add the functions to create/release udp4 sock,
and set the sock's encap_rcv to process the incoming udp encap
sctp packets. In sctp_udp_rcv(), as we can see, all we need to
do is fix the transport header for sctp_rcv(), then it would
implement the part of rfc6951#section-5.4:

  "When an encapsulated packet is received, the UDP header is removed.
   Then, the generic lookup is performed, as done by an SCTP stack
   whenever a packet is received, to find the association for the
   received SCTP packet"

Note that these functions will be called in the last patch of
this patchset when enabling this feature.

v1->v2:
  - Add pr_err() when fails to create udp v4 sock.
v2->v3:
  - Add 'select NET_UDP_TUNNEL' in sctp Kconfig.

Signed-off-by: Xin Long 
---
 include/net/netns/sctp.h |  5 +
 include/net/sctp/constants.h |  2 ++
 include/net/sctp/sctp.h  |  2 ++
 net/sctp/Kconfig |  1 +
 net/sctp/protocol.c  | 43 +++
 5 files changed, 53 insertions(+)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index d8d02e4..8cc9aff 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -22,6 +22,11 @@ struct netns_sctp {
 */
struct sock *ctl_sock;
 
+   /* UDP tunneling listening sock. */
+   struct sock *udp4_sock;
+   /* UDP tunneling listening port. */
+   int udp_port;
+
/* This is the global local address list.
 * We actively maintain this complete list of addresses on
 * the system by catching address add/delete events.
diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 122d9e2..14a0d22 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -286,6 +286,8 @@ enum { SCTP_MAX_GABS = 16 };
 * functions simpler to write.
 */
 
+#define SCTP_DEFAULT_UDP_PORT 9899 /* default UDP tunneling port */
+
 /* These are the values for pf exposure, UNUSED is to keep compatible with old
  * applications by default.
  */
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 4fc747b..bfd87a0 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -84,6 +84,8 @@ int sctp_copy_local_addr_list(struct net *net, struct 
sctp_bind_addr *addr,
 struct sctp_pf *sctp_get_pf_specific(sa_family_t family);
 int sctp_register_pf(struct sctp_pf *, sa_family_t);
 void sctp_addr_wq_mgmt(struct net *, struct sctp_sockaddr_entry *, int);
+int sctp_udp_sock_start(struct net *net);
+void sctp_udp_sock_stop(struct net *net);
 
 /*
  * sctp/socket.c
diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig
index 39d7fa9..5da599f 100644
--- a/net/sctp/Kconfig
+++ b/net/sctp/Kconfig
@@ -11,6 +11,7 @@ menuconfig IP_SCTP
select CRYPTO_HMAC
select CRYPTO_SHA1
select LIBCRC32C
+   select NET_UDP_TUNNEL
help
  Stream Control Transmission Protocol
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 2583323..6fb03fc 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -44,6 +44,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)
 
@@ -840,6 +841,45 @@ static int sctp_ctl_sock_init(struct net *net)
return 0;
 }
 
+static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb)
+{
+   skb_set_transport_header(skb, sizeof(struct udphdr));
+   sctp_rcv(skb);
+   return 0;
+}
+
+int sctp_udp_sock_start(struct net *net)
+{
+   struct udp_tunnel_sock_cfg tuncfg = {NULL};
+   struct udp_port_cfg udp_conf = {0};
+   struct socket *sock;
+   int err;
+
+   udp_conf.family = AF_INET;
+   udp_conf.local_ip.s_addr = htonl(INADDR_ANY);
+   udp_conf.local_udp_port = htons(net->sctp.udp_port);
+   err = udp_sock_create(net, &udp_conf, &sock);
+   if (err) {
+   pr_err("Failed to create the SCTP UDP tunneling v4 sock\n");
+   return err;
+   }
+
+   tuncfg.encap_type = 1;
+   tuncfg.encap_rcv = sctp_udp_rcv;
+   setup_udp_tunnel_sock(net, sock, &tuncfg);
+   net->sctp.udp4_sock = sock->sk;
+
+   return 0;
+}
+
+void sctp_udp_sock_stop(struct net *net)
+{
+   if (net->sctp.udp4_sock) {
+   udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
+   net->sctp.udp4_sock = NULL;
+   }
+}
+
 /* Register address family specific functions. */
 int sctp_register_af(struct sctp_af *af)
 {
@@ -1271,6 +1311,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Enable ECN by default. */
net->sctp.ecn_enable = 1;
 
+   /* Set UDP tunneling listening port to default value */
+   net->sctp.udp_port = SCTP_DEFAULT_UDP_PORT;
+
/* Set SCOPE policy to enabled */
net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE;
 
-- 
2.1.0



[PATCHv4 net-next 07/16] sctp: add encap_port for netns sock asoc and transport

2020-10-19 Thread Xin Long
encap_port is added as per netns/sock/assoc/transport, and the
latter one's encap_port inherits the former one's by default.
The transport's encap_port value would mostly decide if one
packet should go out with udp encapsulated or not.

This patch also allows users to set netns' encap_port by sysctl.

v1->v2:
  - Change to define encap_port as __be16 for sctp_sock, asoc and
transport.
v2->v3:
  - No change.
v3->v4:
  - Add 'encap_port' entry in ip-sysctl.rst.

Signed-off-by: Xin Long 
---
 Documentation/networking/ip-sysctl.rst |  9 +
 include/net/netns/sctp.h   |  2 ++
 include/net/sctp/structs.h |  6 ++
 net/sctp/associola.c   |  4 
 net/sctp/protocol.c|  3 +++
 net/sctp/socket.c  |  1 +
 net/sctp/sysctl.c  | 10 ++
 7 files changed, 35 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.rst 
b/Documentation/networking/ip-sysctl.rst
index 837d51f..3909305 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2640,6 +2640,15 @@ addr_scope_policy - INTEGER
 
Default: 1
 
+encap_port - INTEGER
+   The default remote UDP encapsulation port.
+   When UDP tunneling is enabled, this global value is used to set
+   the dest port for the udp header of outgoing packets by default.
+   Users can also change the value for each sock/asoc/transport by
+   using setsockopt.
+
+   Default: 0
+
 
 ``/proc/sys/net/core/*``
 
diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 247b401..a0f315e 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -27,6 +27,8 @@ struct netns_sctp {
struct sock *udp6_sock;
/* UDP tunneling listening port. */
int udp_port;
+   /* UDP tunneling remote encap port. */
+   int encap_port;
 
/* This is the global local address list.
 * We actively maintain this complete list of addresses on
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 0bdff38..aa98e7e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -178,6 +178,8 @@ struct sctp_sock {
 */
__u32 hbinterval;
 
+   __be16 encap_port;
+
/* This is the max_retrans value for new associations. */
__u16 pathmaxrxt;
 
@@ -877,6 +879,8 @@ struct sctp_transport {
 */
unsigned long last_time_ecne_reduced;
 
+   __be16 encap_port;
+
/* This is the max_retrans value for the transport and will
 * be initialized from the assocs value.  This can be changed
 * using the SCTP_SET_PEER_ADDR_PARAMS socket option.
@@ -1790,6 +1794,8 @@ struct sctp_association {
 */
unsigned long hbinterval;
 
+   __be16 encap_port;
+
/* This is the max_retrans value for new transports in the
 * association.
 */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index fdb69d4..336df4b 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -99,6 +99,8 @@ static struct sctp_association *sctp_association_init(
 */
asoc->hbinterval = msecs_to_jiffies(sp->hbinterval);
 
+   asoc->encap_port = sp->encap_port;
+
/* Initialize path max retrans value. */
asoc->pathmaxrxt = sp->pathmaxrxt;
 
@@ -624,6 +626,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct 
sctp_association *asoc,
 */
peer->hbinterval = asoc->hbinterval;
 
+   peer->encap_port = asoc->encap_port;
+
/* Set the path max_retrans.  */
peer->pathmaxrxt = asoc->pathmaxrxt;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index aa8e5b2..7c729d7 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1359,6 +1359,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Set UDP tunneling listening port to default value */
net->sctp.udp_port = SCTP_DEFAULT_UDP_PORT;
 
+   /* Set remote encap port to 0 by default */
+   net->sctp.encap_port = 0;
+
/* Set SCOPE policy to enabled */
net->sctp.scope_policy = SCTP_SCOPE_POLICY_ENABLE;
 
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 53d0a41..09b94cd 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4876,6 +4876,7 @@ static int sctp_init_sock(struct sock *sk)
 * be modified via SCTP_PEER_ADDR_PARAMS
 */
sp->hbinterval  = net->sctp.hb_interval;
+   sp->encap_port  = htons(net->sctp.encap_port);
sp->pathmaxrxt  = net->sctp.max_retrans_path;
sp->pf_retrans  = net->sctp.pf_retrans;
sp->ps_retrans  = net->sctp.ps_retrans;
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index c16c809..ecc1b5e 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -36,6 +36,7 @

[PATCHv4 net-next 06/16] sctp: add encap_err_lookup for udp encap socks

2020-10-19 Thread Xin Long
As it says in rfc6951#section-5.5:

  "When receiving ICMP or ICMPv6 response packets, there might not be
   enough bytes in the payload to identify the SCTP association that the
   SCTP packet triggering the ICMP or ICMPv6 packet belongs to.  If a
   received ICMP or ICMPv6 packet cannot be related to a specific SCTP
   association or the verification tag cannot be verified, it MUST be
   discarded silently.  In particular, this means that the SCTP stack
   MUST NOT rely on receiving ICMP or ICMPv6 messages.  Implementation
   constraints could prevent processing received ICMP or ICMPv6
   messages."

ICMP or ICMPv6 packets need to be handled, and this is implemented by
udp encap sock .encap_err_lookup function.

The .encap_err_lookup function is called in __udp(6)_lib_err_encap()
to confirm this path does need to be updated. For sctp, what we can
do here is check if the corresponding asoc and transport exist.

Note that icmp packet processing for sctp over udp is done by the udp
sock's .encap_err_lookup(), which means for now we can't do as much as
sctp_v4/6_err() does. Also we can't do the two mappings mentioned
in rfc6951#section-5.5.

Signed-off-by: Xin Long 
---
 net/sctp/protocol.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index e1501e7..aa8e5b2 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -848,6 +848,23 @@ static int sctp_udp_rcv(struct sock *sk, struct sk_buff 
*skb)
return 0;
 }
 
+static int sctp_udp_err_lookup(struct sock *sk, struct sk_buff *skb)
+{
+   struct sctp_association *asoc;
+   struct sctp_transport *t;
+   int family;
+
+   skb->transport_header += sizeof(struct udphdr);
+   family = (ip_hdr(skb)->version == 4) ? AF_INET : AF_INET6;
+   sk = sctp_err_lookup(dev_net(skb->dev), family, skb, sctp_hdr(skb),
+&asoc, &t);
+   if (!sk)
+   return -ENOENT;
+
+   sctp_err_finish(sk, t);
+   return 0;
+}
+
 int sctp_udp_sock_start(struct net *net)
 {
struct udp_tunnel_sock_cfg tuncfg = {NULL};
@@ -866,6 +883,7 @@ int sctp_udp_sock_start(struct net *net)
 
tuncfg.encap_type = 1;
tuncfg.encap_rcv = sctp_udp_rcv;
+   tuncfg.encap_err_lookup = sctp_udp_err_lookup;
setup_udp_tunnel_sock(net, sock, &tuncfg);
net->sctp.udp4_sock = sock->sk;
 
@@ -887,6 +905,7 @@ int sctp_udp_sock_start(struct net *net)
 
tuncfg.encap_type = 1;
tuncfg.encap_rcv = sctp_udp_rcv;
+   tuncfg.encap_err_lookup = sctp_udp_err_lookup;
setup_udp_tunnel_sock(net, sock, &tuncfg);
net->sctp.udp6_sock = sock->sk;
 #endif
-- 
2.1.0



[PATCHv4 net-next 10/16] sctp: add udphdr to overhead when udp_port is set

2020-10-19 Thread Xin Long
sctp_mtu_payload() is for calculating the frag size before making
chunks from a msg. So we should only add the udphdr size to the
overhead when udp socks are listening, as only then can sctp handle
incoming sctp over udp packets, and only then will outgoing sctp
over udp packets be possible. For example, with a 1500-byte MTU over
IPv4, enabling UDP encapsulation shrinks the room left for chunks
from 1500 - 20 - 12 = 1468 bytes to 1500 - 20 - 8 - 12 = 1460 bytes.

Note that we can't do this according to transport->encap_port, as
different transports may be set to different values, and the chunks
are made before the transport is chosen, so we would not be able to
meet all that rfc6951#section-5.6 recommends.

v1->v2:
  - Add udp_port for sctp_sock to avoid a potential race issue, it
will be used in xmit path in the next patch.

Signed-off-by: Xin Long 
---
 include/net/sctp/sctp.h| 7 +--
 include/net/sctp/structs.h | 1 +
 net/sctp/socket.c  | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index bfd87a0..86f74f2 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -578,10 +578,13 @@ static inline __u32 sctp_mtu_payload(const struct 
sctp_sock *sp,
 {
__u32 overhead = sizeof(struct sctphdr) + extra;
 
-   if (sp)
+   if (sp) {
overhead += sp->pf->af->net_header_len;
-   else
+   if (sp->udp_port)
+   overhead += sizeof(struct udphdr);
+   } else {
overhead += sizeof(struct ipv6hdr);
+   }
 
if (WARN_ON_ONCE(mtu && mtu <= overhead))
mtu = overhead;
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 81464ae..80f7149 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -178,6 +178,7 @@ struct sctp_sock {
 */
__u32 hbinterval;
 
+   __be16 udp_port;
__be16 encap_port;
 
/* This is the max_retrans value for new associations. */
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 2a9ee9b..a710917 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4928,6 +4928,7 @@ static int sctp_init_sock(struct sock *sk)
 * be modified via SCTP_PEER_ADDR_PARAMS
 */
sp->hbinterval  = net->sctp.hb_interval;
+   sp->udp_port= htons(net->sctp.udp_port);
sp->encap_port  = htons(net->sctp.encap_port);
sp->pathmaxrxt  = net->sctp.max_retrans_path;
sp->pf_retrans  = net->sctp.pf_retrans;
-- 
2.1.0



[PATCHv4 net-next 05/16] sctp: create udp6 sock and set its encap_rcv

2020-10-19 Thread Xin Long
This patch is to add the udp6 sock part in sctp_udp_sock_start/stop().
udp_conf.use_udp6_rx_checksums is set to true, as rfc6951#section-5.3
says:

   "The SCTP checksum MUST be computed for IPv4 and IPv6, and the UDP
    checksum SHOULD be computed for IPv4 and IPv6"

v1->v2:
  - Add pr_err() when fails to create udp v6 sock.
  - Add #if IS_ENABLED(CONFIG_IPV6) not to create v6 sock when ipv6 is
disabled.

Signed-off-by: Xin Long 
---
 include/net/netns/sctp.h |  1 +
 net/sctp/protocol.c  | 26 ++
 2 files changed, 27 insertions(+)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index 8cc9aff..247b401 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -24,6 +24,7 @@ struct netns_sctp {
 
/* UDP tunneling listening sock. */
struct sock *udp4_sock;
+   struct sock *udp6_sock;
/* UDP tunneling listening port. */
int udp_port;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 6fb03fc..e1501e7 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -869,6 +869,28 @@ int sctp_udp_sock_start(struct net *net)
setup_udp_tunnel_sock(net, sock, &tuncfg);
net->sctp.udp4_sock = sock->sk;
 
+#if IS_ENABLED(CONFIG_IPV6)
+   memset(&udp_conf, 0, sizeof(udp_conf));
+
+   udp_conf.family = AF_INET6;
+   udp_conf.local_ip6 = in6addr_any;
+   udp_conf.local_udp_port = htons(net->sctp.udp_port);
+   udp_conf.use_udp6_rx_checksums = true;
+   udp_conf.ipv6_v6only = true;
+   err = udp_sock_create(net, &udp_conf, &sock);
+   if (err) {
+   pr_err("Failed to create the SCTP UDP tunneling v6 sock\n");
+   udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
+   net->sctp.udp4_sock = NULL;
+   return err;
+   }
+
+   tuncfg.encap_type = 1;
+   tuncfg.encap_rcv = sctp_udp_rcv;
+   setup_udp_tunnel_sock(net, sock, &tuncfg);
+   net->sctp.udp6_sock = sock->sk;
+#endif
+
return 0;
 }
 
@@ -878,6 +900,10 @@ void sctp_udp_sock_stop(struct net *net)
udp_tunnel_sock_release(net->sctp.udp4_sock->sk_socket);
net->sctp.udp4_sock = NULL;
}
+   if (net->sctp.udp6_sock) {
+   udp_tunnel_sock_release(net->sctp.udp6_sock->sk_socket);
+   net->sctp.udp6_sock = NULL;
+   }
 }
 
 /* Register address family specific functions. */
-- 
2.1.0



[PATCHv4 net-next 02/16] udp6: move the mss check after udp gso tunnel processing

2020-10-19 Thread Xin Long
Some protocols' gso, like SCTP's, uses GSO_BY_FRAGS for gso_size.
When UDP is used to encapsulate such a packet, udp6_ufo_fragment()
returns an error because skb->len <= gso_size, and the gso tunnel
processing is never reached.

So we should move this check to after the udp gso tunnel processing,
the same as udp4_ufo_fragment() does.

v1->v2:
  - no change.
v2->v3:
  - not do any cleanup.

Signed-off-by: Xin Long 
Acked-by: Willem de Bruijn 
---
 net/ipv6/udp_offload.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 584157a..aa602af 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -28,10 +28,6 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
int tnl_hlen;
int err;
 
-   mss = skb_shinfo(skb)->gso_size;
-   if (unlikely(skb->len <= mss))
-   goto out;
-
if (skb->encapsulation && skb_shinfo(skb)->gso_type &
(SKB_GSO_UDP_TUNNEL|SKB_GSO_UDP_TUNNEL_CSUM))
segs = skb_udp_tunnel_segment(skb, features, true);
@@ -48,6 +44,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4)
return __udp_gso_segment(skb, features);
 
+   mss = skb_shinfo(skb)->gso_size;
+   if (unlikely(skb->len <= mss))
+   goto out;
+
/* Do software UFO. Complete and fill in the UDP checksum as HW 
cannot
 * do checksum of UDP packets sent as multiple IP fragments.
 */
-- 
2.1.0



[PATCHv4 net-next 03/16] udp: support sctp over udp in skb_udp_tunnel_segment

2020-10-19 Thread Xin Long
For the gso of sctp over udp packets, sctp_gso_segment() will be called in
skb_udp_tunnel_segment(), so we need to set the transport_header to the
sctp header.

As all the current HWs can't handle both crc checksum and udp checksum at
the same time, the crc checksum has to be done in sctp_gso_segment() by
removing the NETIF_F_SCTP_CRC flag from the features.

Meanwhile, if the HW can't do udp checksum, csum and csum_start have to be
set correctly, and udp checksum will be done in __skb_udp_tunnel_segment()
by calling gso_make_checksum().

Thanks to Paolo, Marcelo and Guillaume for helping with this one.

v1->v2:
  - no change.
v2->v3:
  - remove the he NETIF_F_SCTP_CRC flag from the features.
  - set csum and csum_start in sctp_gso_make_checksum().

Signed-off-by: Xin Long 
---
 net/ipv4/udp_offload.c | 3 +++
 net/sctp/offload.c | 6 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index e67a66f..b8b1fde 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -49,6 +49,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
+   skb_set_transport_header(skb, skb_inner_transport_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = new_protocol;
 
@@ -67,6 +68,8 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
  (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM;
 
features &= skb->dev->hw_enc_features;
+   /* CRC checksum can't be handled by HW when it's a UDP tunneling 
packet. */
+   features &= ~NETIF_F_SCTP_CRC;
 
/* The only checksum offload we care about from here on out is the
 * outer one so strip the existing checksum feature flags and
diff --git a/net/sctp/offload.c b/net/sctp/offload.c
index 74847d6..ce281a9 100644
--- a/net/sctp/offload.c
+++ b/net/sctp/offload.c
@@ -27,7 +27,11 @@ static __le32 sctp_gso_make_checksum(struct sk_buff *skb)
 {
skb->ip_summed = CHECKSUM_NONE;
skb->csum_not_inet = 0;
-   gso_reset_checksum(skb, ~0);
+   /* csum and csum_start in GSO CB may be needed to do the UDP
+* checksum when it's a UDP tunneling packet.
+*/
+   SKB_GSO_CB(skb)->csum = (__force __wsum)~0;
+   SKB_GSO_CB(skb)->csum_start = skb_headroom(skb) + skb->len;
return sctp_compute_cksum(skb, skb_transport_offset(skb));
 }
 
-- 
2.1.0



[PATCHv4 net-next 08/16] sctp: add SCTP_REMOTE_UDP_ENCAPS_PORT sockopt

2020-10-19 Thread Xin Long
This patch is to implement:

  rfc6951#section-6.1: Get or Set the Remote UDP Encapsulation Port Number

with the param of the struct:

  struct sctp_udpencaps {
sctp_assoc_t sue_assoc_id;
struct sockaddr_storage sue_address;
uint16_t sue_port;
  };

the encap_port of a sock, asoc or transport can be changed by users,
which also means different transports of the same asoc are allowed
to have different encap_port values.
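
For illustration, a minimal userspace sketch of setting the sock-level
default with this option (the define and struct are duplicated below in
case the installed uapi headers don't carry them yet; error handling
kept minimal):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>	/* sctp_assoc_t, IPPROTO_SCTP */

#ifndef SCTP_REMOTE_UDP_ENCAPS_PORT
#define SCTP_REMOTE_UDP_ENCAPS_PORT 132
struct sctp_udpencaps {
	sctp_assoc_t sue_assoc_id;
	struct sockaddr_storage sue_address;
	uint16_t sue_port;
};
#endif

int main(void)
{
	struct sctp_udpencaps encaps;
	int fd;

	fd = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* zeroed sue_address (INADDR_ANY) plus assoc_id 0 (SCTP_FUTURE_ASSOC)
	 * sets the socket-level default, inherited by new asocs/transports.
	 */
	memset(&encaps, 0, sizeof(encaps));
	encaps.sue_port = htons(9899);	/* remote UDP encapsulation port */

	if (setsockopt(fd, IPPROTO_SCTP, SCTP_REMOTE_UDP_ENCAPS_PORT,
		       &encaps, sizeof(encaps)) < 0)
		perror("setsockopt(SCTP_REMOTE_UDP_ENCAPS_PORT)");

	return 0;
}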

v1->v2:
  - no change.
v2->v3:
  - fix the endian warning when setting values between encap_port and
sue_port.

Signed-off-by: Xin Long 
---
 include/uapi/linux/sctp.h |   7 +++
 net/sctp/socket.c | 114 ++
 2 files changed, 121 insertions(+)

diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 28ad40d..cb78e7a 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -140,6 +140,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ECN_SUPPORTED 130
 #define SCTP_EXPOSE_POTENTIALLY_FAILED_STATE   131
 #define SCTP_EXPOSE_PF_STATE   SCTP_EXPOSE_POTENTIALLY_FAILED_STATE
+#define SCTP_REMOTE_UDP_ENCAPS_PORT132
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x
@@ -1197,6 +1198,12 @@ struct sctp_event {
uint8_t se_on;
 };
 
+struct sctp_udpencaps {
+   sctp_assoc_t sue_assoc_id;
+   struct sockaddr_storage sue_address;
+   uint16_t sue_port;
+};
+
 /* SCTP Stream schedulers */
 enum sctp_sched_type {
SCTP_SS_FCFS,
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 09b94cd..2a9ee9b 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -4417,6 +4417,55 @@ static int sctp_setsockopt_pf_expose(struct sock *sk,
return retval;
 }
 
+static int sctp_setsockopt_encap_port(struct sock *sk,
+ struct sctp_udpencaps *encap,
+ unsigned int optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_transport *t;
+   __be16 encap_port;
+
+   if (optlen != sizeof(*encap))
+   return -EINVAL;
+
+   /* If an address other than INADDR_ANY is specified, and
+* no transport is found, then the request is invalid.
+*/
+   encap_port = (__force __be16)encap->sue_port;
+   if (!sctp_is_any(sk, (union sctp_addr *)&encap->sue_address)) {
+   t = sctp_addr_id2transport(sk, &encap->sue_address,
+  encap->sue_assoc_id);
+   if (!t)
+   return -EINVAL;
+
+   t->encap_port = encap_port;
+   return 0;
+   }
+
+   /* Get association, if assoc_id != SCTP_FUTURE_ASSOC and the
+* socket is a one to many style socket, and an association
+* was not found, then the id was invalid.
+*/
+   asoc = sctp_id2assoc(sk, encap->sue_assoc_id);
+   if (!asoc && encap->sue_assoc_id != SCTP_FUTURE_ASSOC &&
+   sctp_style(sk, UDP))
+   return -EINVAL;
+
+   /* If changes are for association, also apply encap_port to
+* each transport.
+*/
+   if (asoc) {
+   list_for_each_entry(t, &asoc->peer.transport_addr_list,
+   transports)
+   t->encap_port = encap_port;
+
+   return 0;
+   }
+
+   sctp_sk(sk)->encap_port = encap_port;
+   return 0;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -4636,6 +4685,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
int optname,
case SCTP_EXPOSE_POTENTIALLY_FAILED_STATE:
retval = sctp_setsockopt_pf_expose(sk, kopt, optlen);
break;
+   case SCTP_REMOTE_UDP_ENCAPS_PORT:
+   retval = sctp_setsockopt_encap_port(sk, kopt, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
@@ -7791,6 +7843,65 @@ static int sctp_getsockopt_pf_expose(struct sock *sk, 
int len,
return retval;
 }
 
+static int sctp_getsockopt_encap_port(struct sock *sk, int len,
+ char __user *optval, int __user *optlen)
+{
+   struct sctp_association *asoc;
+   struct sctp_udpencaps encap;
+   struct sctp_transport *t;
+   __be16 encap_port;
+
+   if (len < sizeof(encap))
+   return -EINVAL;
+
+   len = sizeof(encap);
+   if (copy_from_user(&encap, optval, len))
+   return -EFAULT;
+
+   /* If an address other than INADDR_ANY is specified, and
+* no transport is found, then the request is invalid.
+*/
+   if (!sctp_is_any(sk, (union sctp_addr *)&encap.sue_address)) {
+   t = sctp_addr_id2transport(sk, &encap.sue_address,
+  encap.sue_assoc

[PATCHv4 net-next 09/16] sctp: allow changing transport encap_port by peer packets

2020-10-19 Thread Xin Long
As rfc6951#section-5.4 says:

  "After finding the SCTP association (which
   includes checking the verification tag), the UDP source port MUST be
   stored as the encapsulation port for the destination address the SCTP
   packet is received from (see Section 5.1).

   When a non-encapsulated SCTP packet is received by the SCTP stack,
   the encapsulation of outgoing packets belonging to the same
   association and the corresponding destination address MUST be
   disabled."

transport encap_port should be updated by a validated incoming packet's
udp src port.

We save the udp src port in sctp_input_cb->encap_port, and then update
the transport in three places:

  1. right after vtag is verified, which is required by RFC, and this
 allows the existent transports to be updated by the chunks that
 can only be processed on an asoc.

  2. right before processing the 'init' where the transports are added,
     and this allows a client to build a sctp over udp connection even
     when the server doesn't know the remote encap port in advance.

  3. when processing ootb_pkt and creating the temporary transport for
 the reply pkt.

Note that sctp_input_cb->header is removed, as it's not used any more
in sctp.

v1->v2:
  - Change encap_port as __be16 for sctp_input_cb.

Signed-off-by: Xin Long 
---
 include/net/sctp/sm.h  |  1 +
 include/net/sctp/structs.h |  7 +--
 net/sctp/ipv6.c|  1 +
 net/sctp/protocol.c| 11 ++-
 net/sctp/sm_make_chunk.c   |  1 +
 net/sctp/sm_statefuns.c|  2 ++
 6 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 5c491a3..a499341 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -380,6 +380,7 @@ sctp_vtag_verify(const struct sctp_chunk *chunk,
 if (ntohl(chunk->sctp_hdr->vtag) == asoc->c.my_vtag)
 return 1;
 
+   chunk->transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
return 0;
 }
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index aa98e7e..81464ae 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1120,14 +1120,9 @@ static inline void sctp_outq_cork(struct sctp_outq *q)
  * sctp_input_cb is currently used on rx and sock rx queue
  */
 struct sctp_input_cb {
-   union {
-   struct inet_skb_parmh4;
-#if IS_ENABLED(CONFIG_IPV6)
-   struct inet6_skb_parm   h6;
-#endif
-   } header;
struct sctp_chunk *chunk;
struct sctp_af *af;
+   __be16 encap_port;
 };
 #define SCTP_INPUT_CB(__skb)   ((struct sctp_input_cb *)&((__skb)->cb[0]))
 
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 8a58f42..a064bf2 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -1053,6 +1053,7 @@ static struct inet_protosw sctpv6_stream_protosw = {
 
 static int sctp6_rcv(struct sk_buff *skb)
 {
+   memset(skb->cb, 0, sizeof(skb->cb));
return sctp_rcv(skb) ? -1 : 0;
 }
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 7c729d7..36f471d 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -843,6 +843,9 @@ static int sctp_ctl_sock_init(struct net *net)
 
 static int sctp_udp_rcv(struct sock *sk, struct sk_buff *skb)
 {
+   memset(skb->cb, 0, sizeof(skb->cb));
+   SCTP_INPUT_CB(skb)->encap_port = udp_hdr(skb)->source;
+
skb_set_transport_header(skb, sizeof(struct udphdr));
sctp_rcv(skb);
return 0;
@@ -1139,9 +1142,15 @@ static struct inet_protosw sctp_stream_protosw = {
.flags  = SCTP_PROTOSW_FLAG
 };
 
+static int sctp4_rcv(struct sk_buff *skb)
+{
+   memset(skb->cb, 0, sizeof(skb->cb));
+   return sctp_rcv(skb);
+}
+
 /* Register with IP layer.  */
 static const struct net_protocol sctp_protocol = {
-   .handler = sctp_rcv,
+   .handler = sctp4_rcv,
.err_handler = sctp_v4_err,
.no_policy   = 1,
.netns_ok= 1,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 9a56ae2..21d0ff1 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -2321,6 +2321,7 @@ int sctp_process_init(struct sctp_association *asoc, 
struct sctp_chunk *chunk,
 * added as the primary transport.  The source address seems to
 * be a better choice than any of the embedded addresses.
 */
+   asoc->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
if (!sctp_assoc_add_peer(asoc, peer_addr, gfp, SCTP_ACTIVE))
goto nomem;
 
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index c669f8b..8edab15 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -6268,6 +6268,8 @@ static struct sctp_packet *sctp_ootb_pkt_new(
if (!transport)
goto nomem;
 
+   transport->encap_port = SCTP_INPUT_CB(chunk->skb)->encap_por

[PATCHv4 net-next 12/16] sctp: support for sending packet over udp4 sock

2020-10-19 Thread Xin Long
This patch does what rfc6951#section-5.3 says for ipv4:

  "Within the UDP header, the source port MUST be the local UDP
   encapsulation port number of the SCTP stack, and the destination port
   MUST be the remote UDP encapsulation port number maintained for the
   association and the destination address to which the packet is sent
   (see Section 5.1).

   Because the SCTP packet is the UDP payload, the length of the UDP
   packet MUST be the length of the SCTP packet plus the size of the UDP
   header.

   The SCTP checksum MUST be computed for IPv4 and IPv6, and the UDP
   checksum SHOULD be computed for IPv4 and IPv6."

Some places need to be adjusted in sctp_packet_transmit():

  1. For non-gso packets, when transport's encap_port is set, sctp
 checksum has to be done in sctp_packet_pack(), as the outer
 udp will use ip_summed = CHECKSUM_PARTIAL to do the offload
 setting for checksum.

  2. Delay calling dst_clone() and skb_dst_set() for non-udp packets
 until sctp_v4_xmit(), as for udp packets, skb_dst_set() is not
 needed before calling udp_tunnel_xmit_skb().

then in sctp_v4_xmit():

  1. Go to udp_tunnel_xmit_skb() only when transport->encap_port and
 net->sctp.udp_port both are set, as these are one for dst port
 and another for src port.

  2. For gso packet, SKB_GSO_UDP_TUNNEL_CSUM is set for gso_type, and
 with this udp checksum can be done in __skb_udp_tunnel_segment()
 for each segments after the sctp gso.

  3. inner_mac_header and inner_transport_header are set, as these
 will be needed in __skb_udp_tunnel_segment() to find the right
 headers.

  4. df and ttl are calculated, as these are the required params by
 udp_tunnel_xmit_skb().

  5. The nocheck param has to be false, as rfc6951#section-5.3 says
     "the UDP checksum SHOULD be computed for IPv4 and IPv6".
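
Putting the points above together, the UDP branch of sctp_v4_xmit()
ends up looking roughly as below (a sketch reconstructed from this
description and the v6 counterpart in the next patch, not the literal
hunk):

	if (!t->encap_port || !sctp_sk(sk)->udp_port) {
		skb_dst_set(skb, dst);
		return __ip_queue_xmit(sk, skb, &t->fl, dscp);
	}

	if (skb_is_gso(skb))
		skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;

	if (ip_dont_fragment(sk, dst) && !skb->ignore_df)
		df = htons(IP_DF);

	skb->encapsulation = 1;
	skb_reset_inner_mac_header(skb);
	skb_reset_inner_transport_header(skb);
	skb_set_inner_ipproto(skb, IPPROTO_SCTP);
	udp_tunnel_xmit_skb((struct rtable *)dst, sk, skb, fl4->saddr,
			    fl4->daddr, dscp, ip4_dst_hoplimit(dst), df,
			    sctp_sk(sk)->udp_port, t->encap_port, false, false);
	return 0;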

v1->v2:
  - Use sp->udp_port instead in sctp_v4_xmit(), which is more safe.

Signed-off-by: Xin Long 
---
 net/sctp/output.c   |  9 +++--
 net/sctp/protocol.c | 41 ++---
 2 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index fb16500..6614c9f 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -514,8 +514,8 @@ static int sctp_packet_pack(struct sctp_packet *packet,
if (sctp_checksum_disable)
return 1;
 
-   if (!(skb_dst(head)->dev->features & NETIF_F_SCTP_CRC) ||
-   dst_xfrm(skb_dst(head)) || packet->ipfragok) {
+   if (!(tp->dst->dev->features & NETIF_F_SCTP_CRC) ||
+   dst_xfrm(tp->dst) || packet->ipfragok || tp->encap_port) {
struct sctphdr *sh =
(struct sctphdr *)skb_transport_header(head);
 
@@ -542,7 +542,6 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t 
gfp)
struct sctp_association *asoc = tp->asoc;
struct sctp_chunk *chunk, *tmp;
int pkt_count, gso = 0;
-   struct dst_entry *dst;
struct sk_buff *head;
struct sctphdr *sh;
struct sock *sk;
@@ -579,13 +578,11 @@ int sctp_packet_transmit(struct sctp_packet *packet, 
gfp_t gfp)
sh->checksum = 0;
 
/* drop packet if no dst */
-   dst = dst_clone(tp->dst);
-   if (!dst) {
+   if (!tp->dst) {
IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
kfree_skb(head);
goto out;
}
-   skb_dst_set(head, dst);
 
rcu_read_lock();
if (__sk_dst_get(sk) != tp->dst) {
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 36f471d..c8de327 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1059,25 +1059,44 @@ static int sctp_inet_supported_addrs(const struct 
sctp_sock *opt,
 }
 
 /* Wrapper routine that calls the ip transmit routine. */
-static inline int sctp_v4_xmit(struct sk_buff *skb,
-  struct sctp_transport *transport)
+static inline int sctp_v4_xmit(struct sk_buff *skb, struct sctp_transport *t)
 {
-   struct inet_sock *inet = inet_sk(skb->sk);
+   struct dst_entry *dst = dst_clone(t->dst);
+   struct flowi4 *fl4 = &t->fl.u.ip4;
+   struct sock *sk = skb->sk;
+   struct inet_sock *inet = inet_sk(sk);
__u8 dscp = inet->tos;
+   __be16 df = 0;
 
pr_debug("%s: skb:%p, len:%d, src:%pI4, dst:%pI4\n", __func__, skb,
-skb->len, &transport->fl.u.ip4.saddr,
-&transport->fl.u.ip4.daddr);
+skb->len, &fl4->saddr, &fl4->daddr);
+
+   if (t->dscp & SCTP_DSCP_SET_MASK)
+   dscp = t->dscp & SCTP_DSCP_VAL_MASK;
 
-   if (transport->dscp & SCTP_DSCP_SET_MASK)
-   dscp = transport->dscp & SCTP_DSCP_VAL_MASK;
+   inet->pmtudisc = t->param_flags & 

[PATCHv4 net-next 13/16] sctp: support for sending packet over udp6 sock

2020-10-19 Thread Xin Long
This one basically does in sctp_v6_xmit() the same things the last
patch did for the udp4 sock; just note that:

  1. label needs to be calculated, as it's the param of
 udp_tunnel6_xmit_skb().

  2. The 'nocheck' param of udp_tunnel6_xmit_skb() is false, as
 required by RFC.

v1->v2:
  - Use sp->udp_port instead in sctp_v6_xmit(), which is more safe.

Signed-off-by: Xin Long 
---
 net/sctp/ipv6.c | 43 ---
 1 file changed, 32 insertions(+), 11 deletions(-)

diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index a064bf2..814754d 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -55,6 +55,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 
@@ -191,33 +192,53 @@ static int sctp_v6_err(struct sk_buff *skb, struct 
inet6_skb_parm *opt,
return ret;
 }
 
-static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
+static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *t)
 {
+   struct dst_entry *dst = dst_clone(t->dst);
+   struct flowi6 *fl6 = &t->fl.u.ip6;
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
-   struct flowi6 *fl6 = &transport->fl.u.ip6;
__u8 tclass = np->tclass;
-   int res;
+   __be32 label;
 
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
 skb->len, &fl6->saddr, &fl6->daddr);
 
-   if (transport->dscp & SCTP_DSCP_SET_MASK)
-   tclass = transport->dscp & SCTP_DSCP_VAL_MASK;
+   if (t->dscp & SCTP_DSCP_SET_MASK)
+   tclass = t->dscp & SCTP_DSCP_VAL_MASK;
 
if (INET_ECN_is_capable(tclass))
IP6_ECN_flow_xmit(sk, fl6->flowlabel);
 
-   if (!(transport->param_flags & SPP_PMTUD_ENABLE))
+   if (!(t->param_flags & SPP_PMTUD_ENABLE))
skb->ignore_df = 1;
 
SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
 
-   rcu_read_lock();
-   res = ip6_xmit(sk, skb, fl6, sk->sk_mark, rcu_dereference(np->opt),
-  tclass, sk->sk_priority);
-   rcu_read_unlock();
-   return res;
+   if (!t->encap_port || !sctp_sk(sk)->udp_port) {
+   int res;
+
+   skb_dst_set(skb, dst);
+   rcu_read_lock();
+   res = ip6_xmit(sk, skb, fl6, sk->sk_mark,
+  rcu_dereference(np->opt),
+  tclass, sk->sk_priority);
+   rcu_read_unlock();
+   return res;
+   }
+
+   if (skb_is_gso(skb))
+   skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL_CSUM;
+
+   skb->encapsulation = 1;
+   skb_reset_inner_mac_header(skb);
+   skb_reset_inner_transport_header(skb);
+   skb_set_inner_ipproto(skb, IPPROTO_SCTP);
+   label = ip6_make_flowlabel(sock_net(sk), skb, fl6->flowlabel, true, 
fl6);
+
+   return udp_tunnel6_xmit_skb(dst, sk, skb, NULL, &fl6->saddr,
+   &fl6->daddr, tclass, ip6_dst_hoplimit(dst),
+   label, sctp_sk(sk)->udp_port, 
t->encap_port, false);
 }
 
 /* Returns the dst cache entry for the given source and destination ip
-- 
2.1.0



[PATCHv4 net-next 14/16] sctp: add the error cause for new encapsulation port restart

2020-10-19 Thread Xin Long
This patch is to add the function to make the abort chunk with
the error cause for new encapsulation port restart, defined
in Section 4.4 of draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.

v1->v2:
  - no change.
v2->v3:
  - no need to call htons() when setting nep.cur_port/new_port.

Signed-off-by: Xin Long 
---
 include/linux/sctp.h | 20 
 include/net/sctp/sm.h|  3 +++
 net/sctp/sm_make_chunk.c | 20 
 3 files changed, 43 insertions(+)

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index 7673123..bb19265 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -482,11 +482,13 @@ enum sctp_error {
 *  11  Restart of an association with new addresses
 *  12  User Initiated Abort
 *  13  Protocol Violation
+*  14  Restart of an Association with New Encapsulation Port
 */
 
SCTP_ERROR_RESTART = cpu_to_be16(0x0b),
SCTP_ERROR_USER_ABORT  = cpu_to_be16(0x0c),
SCTP_ERROR_PROTO_VIOLATION = cpu_to_be16(0x0d),
+   SCTP_ERROR_NEW_ENCAP_PORT  = cpu_to_be16(0x0e),
 
/* ADDIP Section 3.3  New Error Causes
 *
@@ -793,4 +795,22 @@ enum {
SCTP_FLOWLABEL_VAL_MASK = 0xf
 };
 
+/* UDP Encapsulation
+ * draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.html#section-4-4
+ *
+ *   The error cause indicating an "Restart of an Association with
+ *   New Encapsulation Port"
+ *
+ * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |Cause Code = 14|   Cause Length = 8|
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |   Current Encapsulation Port  | New Encapsulation Port|
+ * +---+---+
+ */
+struct sctp_new_encap_port_hdr {
+   __be16 cur_port;
+   __be16 new_port;
+};
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index a499341..fd223c9 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -221,6 +221,9 @@ struct sctp_chunk *sctp_make_violation_paramlen(
 struct sctp_chunk *sctp_make_violation_max_retrans(
const struct sctp_association *asoc,
const struct sctp_chunk *chunk);
+struct sctp_chunk *sctp_make_new_encap_port(
+   const struct sctp_association *asoc,
+   const struct sctp_chunk *chunk);
 struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
   const struct sctp_transport *transport);
 struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 21d0ff1..f77484d 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1142,6 +1142,26 @@ struct sctp_chunk *sctp_make_violation_max_retrans(
return retval;
 }
 
+struct sctp_chunk *sctp_make_new_encap_port(const struct sctp_association 
*asoc,
+   const struct sctp_chunk *chunk)
+{
+   struct sctp_new_encap_port_hdr nep;
+   struct sctp_chunk *retval;
+
+   retval = sctp_make_abort(asoc, chunk,
+sizeof(struct sctp_errhdr) + sizeof(nep));
+   if (!retval)
+   goto nodata;
+
+   sctp_init_cause(retval, SCTP_ERROR_NEW_ENCAP_PORT, sizeof(nep));
+   nep.cur_port = SCTP_INPUT_CB(chunk->skb)->encap_port;
+   nep.new_port = chunk->transport->encap_port;
+   sctp_addto_chunk(retval, sizeof(nep), &nep);
+
+nodata:
+   return retval;
+}
+
 /* Make a HEARTBEAT chunk.  */
 struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
   const struct sctp_transport *transport)
-- 
2.1.0



[PATCHv4 net-next 11/16] sctp: call sk_setup_caps in sctp_packet_transmit instead

2020-10-19 Thread Xin Long
sk_setup_caps() was originally called in Commit 90017accff61 ("sctp:
Add GSO support"), as:

  "We have to refresh this in case we are xmiting to more than one
   transport at a time"

This actually happens in the loop of sctp_outq_flush_transports(),
and it shouldn't be tied to gso, so move it out of the gso part and
to before sctp_packet_pack().

Signed-off-by: Xin Long 
---
 net/sctp/output.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/net/sctp/output.c b/net/sctp/output.c
index 1441eaf..fb16500 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -508,12 +508,6 @@ static int sctp_packet_pack(struct sctp_packet *packet,
sizeof(struct inet6_skb_parm)));
skb_shinfo(head)->gso_segs = pkt_count;
skb_shinfo(head)->gso_size = GSO_BY_FRAGS;
-   rcu_read_lock();
-   if (skb_dst(head) != tp->dst) {
-   dst_hold(tp->dst);
-   sk_setup_caps(sk, tp->dst);
-   }
-   rcu_read_unlock();
goto chksum;
}
 
@@ -593,6 +587,13 @@ int sctp_packet_transmit(struct sctp_packet *packet, gfp_t 
gfp)
}
skb_dst_set(head, dst);
 
+   rcu_read_lock();
+   if (__sk_dst_get(sk) != tp->dst) {
+   dst_hold(tp->dst);
+   sk_setup_caps(sk, tp->dst);
+   }
+   rcu_read_unlock();
+
/* pack up chunks */
pkt_count = sctp_packet_pack(packet, head, gso, gfp);
if (!pkt_count) {
-- 
2.1.0



[PATCHv4 net-next 15/16] sctp: handle the init chunk matching an existing asoc

2020-10-19 Thread Xin Long
This is from Section 4 of draft-tuexen-tsvwg-sctp-udp-encaps-cons-03,
and it requires responding with an abort chunk with an error cause
when the udp source port of the received init chunk doesn't match the
encap port of the transport.

Signed-off-by: Xin Long 
---
 net/sctp/sm_statefuns.c | 50 +
 1 file changed, 50 insertions(+)

diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 8edab15..af2b704 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -87,6 +87,13 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8(
const union sctp_subtype type,
void *arg,
struct sctp_cmd_seq *commands);
+static enum sctp_disposition sctp_sf_new_encap_port(
+   struct net *net,
+   const struct sctp_endpoint *ep,
+   const struct sctp_association *asoc,
+   const union sctp_subtype type,
+   void *arg,
+   struct sctp_cmd_seq *commands);
 static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk);
 
 static enum sctp_disposition sctp_stop_t1_and_abort(
@@ -1493,6 +1500,10 @@ static enum sctp_disposition sctp_sf_do_unexpected_init(
if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_init_chunk)))
return sctp_sf_violation_chunklen(net, ep, asoc, type, arg,
  commands);
+
+   if (SCTP_INPUT_CB(chunk->skb)->encap_port != 
chunk->transport->encap_port)
+   return sctp_sf_new_encap_port(net, ep, asoc, type, arg, 
commands);
+
/* Grab the INIT header.  */
chunk->subh.init_hdr = (struct sctp_inithdr *)chunk->skb->data;
 
@@ -3392,6 +3403,45 @@ static enum sctp_disposition sctp_sf_tabort_8_4_8(
 
sctp_packet_append_chunk(packet, abort);
 
+   sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(packet));
+
+   SCTP_INC_STATS(net, SCTP_MIB_OUTCTRLCHUNKS);
+
+   sctp_sf_pdiscard(net, ep, asoc, type, arg, commands);
+   return SCTP_DISPOSITION_CONSUME;
+}
+
+/* Handling of SCTP Packets Containing an INIT Chunk Matching an
+ * Existing Associations when the UDP encap port is incorrect.
+ *
+ * From Section 4 at draft-tuexen-tsvwg-sctp-udp-encaps-cons-03.
+ */
+static enum sctp_disposition sctp_sf_new_encap_port(
+   struct net *net,
+   const struct sctp_endpoint *ep,
+   const struct sctp_association *asoc,
+   const union sctp_subtype type,
+   void *arg,
+   struct sctp_cmd_seq *commands)
+{
+   struct sctp_packet *packet = NULL;
+   struct sctp_chunk *chunk = arg;
+   struct sctp_chunk *abort;
+
+   packet = sctp_ootb_pkt_new(net, asoc, chunk);
+   if (!packet)
+   return SCTP_DISPOSITION_NOMEM;
+
+   abort = sctp_make_new_encap_port(asoc, chunk);
+   if (!abort) {
+   sctp_ootb_pkt_free(packet);
+   return SCTP_DISPOSITION_NOMEM;
+   }
+
+   abort->skb->sk = ep->base.sk;
+
+   sctp_packet_append_chunk(packet, abort);
+
sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT,
SCTP_PACKET(packet));
 
-- 
2.1.0



[PATCHv4 net-next 16/16] sctp: enable udp tunneling socks

2020-10-19 Thread Xin Long
This patch is to enable udp tunneling socks by calling
sctp_udp_sock_start() in sctp_ctrlsock_init(), and
sctp_udp_sock_stop() in sctp_ctrlsock_exit().

Also add sysctl udp_port to allow changing the listening
sock's port by users.

With this patch, the whole sctp over udp feature can be
enabled and used.
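
For reference, a small userspace sketch of flipping the two sysctls
added by this series from a program (paths assume the udp_port and
encap_port entries from this patch and patch 7; it needs to run with
enough privileges in the target netns):

#include <stdio.h>

/* Enable SCTP over UDP: make the local UDP tunneling socks listen on
 * 9899 and default the remote encapsulation port to 9899 as well.
 */
static int write_sysctl(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return -1;
	}
	fputs(val, f);
	fclose(f);
	return 0;
}

int main(void)
{
	if (write_sysctl("/proc/sys/net/sctp/udp_port", "9899"))
		return 1;
	if (write_sysctl("/proc/sys/net/sctp/encap_port", "9899"))
		return 1;
	return 0;
}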

v1->v2:
  - Also update ctl_sock udp_port in proc_sctp_do_udp_port()
where netns udp_port gets changed.
v2->v3:
  - Call htons() when setting sk udp_port from netns udp_port.
v3->v4:
  - Not call sctp_udp_sock_start() when new_value is 0.
  - Add udp_port entry in ip-sysctl.rst.

Signed-off-by: Xin Long 
---
 Documentation/networking/ip-sysctl.rst |  6 
 net/sctp/protocol.c|  5 
 net/sctp/sysctl.c  | 52 ++
 3 files changed, 63 insertions(+)

diff --git a/Documentation/networking/ip-sysctl.rst 
b/Documentation/networking/ip-sysctl.rst
index 3909305..9effc45 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -2640,6 +2640,12 @@ addr_scope_policy - INTEGER
 
Default: 1
 
+udp_port - INTEGER
+   The listening port for the local UDP tunneling sock.
+   UDP encapsulation will be disabled when it's set to 0.
+
+   Default: 9899
+
 encap_port - INTEGER
   The default remote UDP encapsulation port.
When UDP tunneling is enabled, this global value is used to set
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index c8de327..894ab12 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1469,6 +1469,10 @@ static int __net_init sctp_ctrlsock_init(struct net *net)
if (status)
pr_err("Failed to initialize the SCTP control sock\n");
 
+   status = sctp_udp_sock_start(net);
+   if (status)
+   pr_err("Failed to initialize the SCTP UDP tunneling sock\n");
+
return status;
 }
 
@@ -1476,6 +1480,7 @@ static void __net_exit sctp_ctrlsock_exit(struct net *net)
 {
/* Free the control endpoint.  */
inet_ctl_sock_destroy(net->sctp.ctl_sock);
+   sctp_udp_sock_stop(net);
 }
 
 static struct pernet_operations sctp_ctrlsock_ops = {
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index ecc1b5e..e92df77 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -49,6 +49,8 @@ static int proc_sctp_do_rto_min(struct ctl_table *ctl, int 
write,
void *buffer, size_t *lenp, loff_t *ppos);
 static int proc_sctp_do_rto_max(struct ctl_table *ctl, int write, void *buffer,
size_t *lenp, loff_t *ppos);
+static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, void 
*buffer,
+size_t *lenp, loff_t *ppos);
 static int proc_sctp_do_alpha_beta(struct ctl_table *ctl, int write,
   void *buffer, size_t *lenp, loff_t *ppos);
 static int proc_sctp_do_auth(struct ctl_table *ctl, int write,
@@ -292,6 +294,15 @@ static struct ctl_table sctp_net_table[] = {
.proc_handler   = proc_dointvec,
},
{
+   .procname   = "udp_port",
+   .data   = &init_net.sctp.udp_port,
+   .maxlen = sizeof(int),
+   .mode   = 0644,
+   .proc_handler   = proc_sctp_do_udp_port,
+   .extra1 = SYSCTL_ZERO,
+   .extra2 = &udp_port_max,
+   },
+   {
.procname   = "encap_port",
.data   = &init_net.sctp.encap_port,
.maxlen = sizeof(int),
@@ -487,6 +498,47 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int 
write,
return ret;
 }
 
+static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write,
+void *buffer, size_t *lenp, loff_t *ppos)
+{
+   struct net *net = current->nsproxy->net_ns;
+   unsigned int min = *(unsigned int *)ctl->extra1;
+   unsigned int max = *(unsigned int *)ctl->extra2;
+   struct ctl_table tbl;
+   int ret, new_value;
+
+   memset(&tbl, 0, sizeof(struct ctl_table));
+   tbl.maxlen = sizeof(unsigned int);
+
+   if (write)
+   tbl.data = &new_value;
+   else
+   tbl.data = &net->sctp.udp_port;
+
+   ret = proc_dointvec(&tbl, write, buffer, lenp, ppos);
+   if (write && ret == 0) {
+   struct sock *sk = net->sctp.ctl_sock;
+
+   if (new_value > max || new_value < min)
+   return -EINVAL;
+
+   net->sctp.udp_port = new_value;
+   sctp_udp_sock_stop(net);
+   if (new_value) {
+   ret = sctp_udp_sock_start(net);
+   if (ret)
+   net->sctp.udp_port = 0;
+ 
