Fix nf_flow_ip6_tunnel_proto() to use pskb_may_pull() instead of
skb_header_pointer() to ensure the outer IPv6 header is in the skb
linear area, which is required for subsequent packet processing. Move
ctx->offset update inside the IPPROTO_IPV6 conditional block since it
should only be adjusted when an IP6IP6 tunnel is actually detected.
Simplify the rx path by removing ipv6_skip_exthdr() and checking
ip6h->nexthdr directly, as the flowtable fast path only handles simple
IP6IP6 encapsulation without extension headers.
Drop the tunnel encapsulation limit destination option support from the
tx path to match, since the rx path no longer handles extension headers.
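Remove the encap_limit parameter from nf_flow_offload_ipv6_forward(),
nf_flow_tunnel_ip6ip6_push() and nf_flow_tunnel_v6_push(), along with
the ipv6_tel_txoption struct and related headroom/MTU adjustments.
Make ip6_tnl_fill_forward_path() return -EOPNOTSUPP unless the tunnel
is configured to ignore the encapsulation limit (encaplimit none), and
update the nft_flowtable.sh selftest to create its ip6tnl devices
accordingly.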

Fixes: d98103575dcdd ("netfilter: flowtable: Add IP6IP6 rx sw acceleration")
Signed-off-by: Lorenzo Bianconi <[email protected]>
---
Changes in v2:
- Drop tunnel encapsulation limit destination option support.
- Do not allow IPv6 extension headers in nf_flow_ip6_tunnel_proto().
- Link to v1: 
https://lore.kernel.org/r/20260608-b4-nf_flow_ip6_tunnel_proto-update-v1-1-782c7052c...@kernel.org
---
 net/ipv6/ip6_tunnel.c                              |  7 ++
 net/netfilter/nf_flow_table_ip.c                   | 80 +++++-----------------
 .../selftests/net/netfilter/nft_flowtable.sh       |  8 +--
 3 files changed, 30 insertions(+), 65 deletions(-)

diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 9d1037ac082f..bf1e77f95f18 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -1850,6 +1850,13 @@ static int ip6_tnl_fill_forward_path(struct net_device_path_ctx *ctx,
        struct dst_entry *dst;
        int err;
 
+       if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) {
+               /* encaplimit option is currently not supported in the
+                * sw-acceleration path.
+                */
+               return -EOPNOTSUPP;
+       }
+
        dst = ip6_route_output(dev_net(ctx->dev), NULL, &fl6);
        if (!dst->error) {
                path->type = DEV_PATH_TUN;
diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
index 9c05a50d6013..e7a3fb2b2d94 100644
--- a/net/netfilter/nf_flow_table_ip.c
+++ b/net/netfilter/nf_flow_table_ip.c
@@ -347,29 +347,23 @@ static bool nf_flow_ip6_tunnel_proto(struct nf_flowtable_ctx *ctx,
                                     struct sk_buff *skb)
 {
 #if IS_ENABLED(CONFIG_IPV6)
-       struct ipv6hdr *ip6h, _ip6h;
-       __be16 frag_off;
-       u8 nexthdr;
-       int hdrlen;
+       struct ipv6hdr *ip6h;
 
-       ip6h = skb_header_pointer(skb, ctx->offset, sizeof(*ip6h), &_ip6h);
-       if (!ip6h)
+       if (!pskb_may_pull(skb, sizeof(*ip6h) + ctx->offset))
                return false;
 
+       ip6h = (struct ipv6hdr *)(skb_network_header(skb) + ctx->offset);
        if (ip6h->hop_limit <= 1)
                return false;
 
-       nexthdr = ip6h->nexthdr;
-       hdrlen = ipv6_skip_exthdr(skb, sizeof(*ip6h) + ctx->offset, &nexthdr,
-                                 &frag_off);
-       if (hdrlen < 0)
+       if (ipv6_ext_hdr(ip6h->nexthdr))
                return false;
 
-       if (nexthdr == IPPROTO_IPV6) {
-               ctx->tun.hdr_size = hdrlen;
-               ctx->tun.proto = IPPROTO_IPV6;
+       if (ip6h->nexthdr == IPPROTO_IPV6) {
+               ctx->tun.proto = ip6h->nexthdr;
+               ctx->tun.hdr_size = sizeof(*ip6h);
+               ctx->offset += ctx->tun.hdr_size;
        }
-       ctx->offset += ctx->tun.hdr_size;
 
        return true;
 #else
@@ -648,25 +642,19 @@ static int nf_flow_tunnel_v4_push(struct net *net, struct sk_buff *skb,
        return 0;
 }
 
-struct ipv6_tel_txoption {
-       struct ipv6_txoptions ops;
-       __u8 dst_opt[8];
-};
-
 static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
                                      struct flow_offload_tuple *tuple,
-                                     struct in6_addr **ip6_daddr,
-                                     int encap_limit)
+                                     struct in6_addr **ip6_daddr)
 {
        struct ipv6hdr *ip6h = (struct ipv6hdr *)skb_network_header(skb);
-       u8 hop_limit = ip6h->hop_limit, proto = IPPROTO_IPV6;
        struct rtable *rt = dst_rtable(tuple->dst_cache);
        __u8 dsfield = ipv6_get_dsfield(ip6h);
        struct flowi6 fl6 = {
                .daddr = tuple->tun.src_v6,
                .saddr = tuple->tun.dst_v6,
-               .flowi6_proto = proto,
+               .flowi6_proto = IPPROTO_IPV6,
        };
+       u8 hop_limit = ip6h->hop_limit;
        int err, mtu;
        u32 headroom;
 
@@ -674,41 +662,18 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
        if (err)
                return err;
 
-       skb_set_inner_ipproto(skb, proto);
+       skb_set_inner_ipproto(skb, IPPROTO_IPV6);
        headroom = sizeof(*ip6h) + LL_RESERVED_SPACE(rt->dst.dev) +
                   rt->dst.header_len;
-       if (encap_limit)
-               headroom += 8;
        err = skb_cow_head(skb, headroom);
        if (err)
                return err;
 
        skb_scrub_packet(skb, true);
        mtu = dst_mtu(&rt->dst) - sizeof(*ip6h);
-       if (encap_limit)
-               mtu -= 8;
        mtu = max(mtu, IPV6_MIN_MTU);
        skb_dst_update_pmtu_no_confirm(skb, mtu);
 
-       if (encap_limit > 0) {
-               struct ipv6_tel_txoption opt = {
-                       .dst_opt[2] = IPV6_TLV_TNL_ENCAP_LIMIT,
-                       .dst_opt[3] = 1,
-                       .dst_opt[4] = encap_limit,
-                       .dst_opt[5] = IPV6_TLV_PADN,
-                       .dst_opt[6] = 1,
-               };
-               struct ipv6_opt_hdr *hopt;
-
-               opt.ops.dst1opt = (struct ipv6_opt_hdr *)opt.dst_opt;
-               opt.ops.opt_nflen = 8;
-
-               hopt = skb_push(skb, ipv6_optlen(opt.ops.dst1opt));
-               memcpy(hopt, opt.ops.dst1opt, ipv6_optlen(opt.ops.dst1opt));
-               hopt->nexthdr = IPPROTO_IPV6;
-               proto = NEXTHDR_DEST;
-       }
-
        skb_push(skb, sizeof(*ip6h));
        skb_reset_network_header(skb);
 
@@ -716,7 +681,7 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
        ip6_flow_hdr(ip6h, dsfield,
                     ip6_make_flowlabel(net, skb, fl6.flowlabel, true, &fl6));
        ip6h->hop_limit = hop_limit;
-       ip6h->nexthdr = proto;
+       ip6h->nexthdr = IPPROTO_IPV6;
        ip6h->daddr = tuple->tun.src_v6;
        ip6h->saddr = tuple->tun.dst_v6;
        ipv6_hdr(skb)->payload_len = htons(skb->len - sizeof(*ip6h));
@@ -729,12 +694,10 @@ static int nf_flow_tunnel_ip6ip6_push(struct net *net, struct sk_buff *skb,
 
 static int nf_flow_tunnel_v6_push(struct net *net, struct sk_buff *skb,
                                  struct flow_offload_tuple *tuple,
-                                 struct in6_addr **ip6_daddr,
-                                 int encap_limit)
+                                 struct in6_addr **ip6_daddr)
 {
        if (tuple->tun_num)
-               return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr,
-                                                 encap_limit);
+               return nf_flow_tunnel_ip6ip6_push(net, skb, tuple, ip6_daddr);
 
        return 0;
 }
@@ -1089,7 +1052,7 @@ static int nf_flow_tuple_ipv6(struct nf_flowtable_ctx *ctx, struct sk_buff *skb,
 static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
                                        struct nf_flowtable *flow_table,
                                        struct flow_offload_tuple_rhash *tuplehash,
-                                       struct sk_buff *skb, int encap_limit)
+                                       struct sk_buff *skb)
 {
        enum flow_offload_tuple_dir dir;
        struct flow_offload *flow;
@@ -1100,11 +1063,8 @@ static int nf_flow_offload_ipv6_forward(struct nf_flowtable_ctx *ctx,
        flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
 
        mtu = flow->tuplehash[dir].tuple.mtu + ctx->offset;
-       if (flow->tuplehash[!dir].tuple.tun_num) {
+       if (flow->tuplehash[!dir].tuple.tun_num)
                mtu -= sizeof(*ip6h);
-               if (encap_limit > 0)
-                       mtu -= 8; /* encap limit option */
-       }
 
        if (unlikely(nf_flow_exceeds_mtu(skb, mtu)))
                return 0;
@@ -1158,7 +1118,6 @@ unsigned int
 nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
                          const struct nf_hook_state *state)
 {
-       int encap_limit = IPV6_DEFAULT_TNL_ENCAP_LIMIT;
        struct flow_offload_tuple_rhash *tuplehash;
        struct nf_flowtable *flow_table = priv;
        struct flow_offload_tuple *other_tuple;
@@ -1177,8 +1136,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
        if (tuplehash == NULL)
                return NF_ACCEPT;
 
-       ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb,
-                                          encap_limit);
+       ret = nf_flow_offload_ipv6_forward(&ctx, flow_table, tuplehash, skb);
        if (ret < 0)
                return NF_DROP;
        else if (ret == 0)
@@ -1198,7 +1156,7 @@ nf_flow_offload_ipv6_hook(void *priv, struct sk_buff *skb,
        ip6_daddr = &other_tuple->src_v6;
 
        if (nf_flow_tunnel_v6_push(state->net, skb, other_tuple,
-                                  &ip6_daddr, encap_limit) < 0)
+                                  &ip6_daddr) < 0)
                return NF_DROP;
 
        switch (tuplehash->tuple.xmit_type) {
diff --git a/tools/testing/selftests/net/netfilter/nft_flowtable.sh b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
index 7a34ef468975..08ad07500e8a 100755
--- a/tools/testing/selftests/net/netfilter/nft_flowtable.sh
+++ b/tools/testing/selftests/net/netfilter/nft_flowtable.sh
@@ -592,7 +592,7 @@ ip -net "$nsr1" link set tun0 up
 ip -net "$nsr1" addr add 192.168.100.1/24 dev tun0
 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
-ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2
+ip -net "$nsr1" link add name tun6 type ip6tnl local fee1:2::1 remote fee1:2::2 encaplimit none
 ip -net "$nsr1" link set tun6 up
 ip -net "$nsr1" addr add fee1:3::1/64 dev tun6 nodad
 
@@ -601,7 +601,7 @@ ip -net "$nsr2" link set tun0 up
 ip -net "$nsr2" addr add 192.168.100.2/24 dev tun0
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 || ret=1
+ip -net "$nsr2" link add name tun6 type ip6tnl local fee1:2::2 remote fee1:2::1 encaplimit none || ret=1
 ip -net "$nsr2" link set tun6 up
 ip -net "$nsr2" addr add fee1:3::2/64 dev tun6 nodad
 
@@ -651,7 +651,7 @@ ip -net "$nsr1" route change default via 192.168.200.2
 ip netns exec "$nsr1" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
 ip netns exec "$nsr1" nft -a insert rule inet filter forward 'meta oif tun0.10 accept'
 
-ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2
+ip -net "$nsr1" link add name tun6.10 type ip6tnl local fee1:4::1 remote fee1:4::2 encaplimit none
 ip -net "$nsr1" link set tun6.10 up
 ip -net "$nsr1" addr add fee1:5::1/64 dev tun6.10 nodad
 ip -6 -net "$nsr1" route delete default
@@ -670,7 +670,7 @@ ip -net "$nsr2" addr add 192.168.200.2/24 dev tun0.10
 ip -net "$nsr2" route change default via 192.168.200.1
 ip netns exec "$nsr2" sysctl net.ipv4.conf.tun0/10.forwarding=1 > /dev/null
 
-ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 || ret=1
+ip -net "$nsr2" link add name tun6.10 type ip6tnl local fee1:4::2 remote fee1:4::1 encaplimit none || ret=1
 ip -net "$nsr2" link set tun6.10 up
 ip -net "$nsr2" addr add fee1:5::2/64 dev tun6.10 nodad
 ip -6 -net "$nsr2" route delete default

---
base-commit: 1fad1796b9411217fa77b6a497ed76b999205487
change-id: 20260608-b4-nf_flow_ip6_tunnel_proto-update-8b64903825b4

Best regards,
-- 
Lorenzo Bianconi <[email protected]>


Reply via email to