On 2016/11/26 4:40, Julian Anastasov wrote:
> 
>       Hello,
> 
> On Fri, 25 Nov 2016, Hannes Frederic Sowa wrote:
> 
>> On 25.11.2016 09:18, Julian Anastasov wrote:
>>>
>>>     Another option would be to add similar bit to
>>> struct sock (sk_dst_pending_confirm), may be ip_finish_output2
>>> can propagate it to dst_neigh_output via new arg and then to
>>> reset it.
>>
>> I don't understand how this can help? Maybe I understood it wrong?
> 
>       The idea is, the indication from highest level (TCP) to
> be propagated to lowest level (neigh).
> 
>>> Or to change n->confirmed via some new inline sock
>>> function instead of changing dst_neigh_output. At this place 
>>> skb->sk is optional, may be we need (skb->sk && dst ==
>>> skb->sk->sk_dst_cache) check. Also, when sk_dst_cache changes
>>> we should clear this flag.
>>
>> In "(skb->sk && dst == skb->sk->sk_dst_cache)" where does dst come from?
> 
>       This is in case we do not want to propagate indication
> from TCP to tunnels, see below...
> 
>> I don't see a possibility besides using mac_header or inner_mac_header
>> to look up the incoming MAC address and confirm that one in the neighbor
>> cache so far (we could try to optimize this case for rt_gateway though).
> 
>       My idea is as follows:
> 
> - TCP instead of calling dst_confirm should call new func
> dst_confirm_sk(sk) where sk->sk_dst_pending_confirm is set,
> not dst->pending_confirm.
> 
> - ip_finish_output2: use skb->sk (TCP) to check for
> sk_dst_pending_confirm and update n->confirmed in some
> new inline func
> 
>       Correct me if I'm wrong but here is how I see the
> situation:
> 
> - TCP caches output dst in socket, for example, dst1,
> sets it as skb->dst
> 
> - if xfrm tunnels are used then dst1 can point to some tunnel,
> i.e. it is not in all cases the same skb->dst, as seen by 
> ip_finish_output2
> 
> - netfilter can DNAT and assign different skb->dst
> 
> - as result, before now, dst1->pending_confirm is set but
> it is not used properly because ip_finish_output2 uses
> the final skb->dst which can be different or with
> rt_gateway = 0
> 
> - considering that tunnels do not use dst_confirm,
> n->confirmed is not changed every time. There are some
> interesting cases:
> 
> 1. both dst1 and the final skb->dst point to same
> dst with rt_gateway = 0: n->confirmed is updated but
> as wee see it can be for wrong neigh.
> 
> 2. dst1 is different from skb->dst, so n->confirmed is
> not changed. This can happen on DNAT or when using
> tunnel to secure gateway.
> 
> - in ip_finish_output2 we have skb->sk (original TCP sk) and
> sk arg (equal to skb->sk or second option is sock of some UDP
> tunnel, etc). The idea is to use skb->sk->sk_dst_pending_confirm,
> i.e. highest level sk because the sk arg, if different, does not
> have such flag set (tunnels do not call dst_confirm).
> 
> - ip_finish_output2 should call new func dst_neigh_confirm_sk
> (which has nothing related to dst, hm... better name is needed):
> 
>       if (!IS_ERR(neigh)) {
> -             int res = dst_neigh_output(dst, neigh, skb);
> +             int res;
> +
> +             dst_neigh_confirm_sk(skb->sk, neigh);
> +             res = dst_neigh_output(dst, neigh, skb);
> 
>       which should do something like this:
> 
>       if (sk && sk->sk_dst_pending_confirm) {
>               unsigned long now = jiffies;
> 
>               sk->sk_dst_pending_confirm = 0;
>               /* avoid dirtying neighbour */
>               if (n->confirmed != now)
>                       n->confirmed = now;
>       }
> 
>       Additional dst == skb->sk->sk_dst_cache check will
> not propagate the indication on DNAT/tunnel. For me,
> it is better without such check.
> 
>       So, the idea is to move TCP and other similar
> users to the new dst_confirm_sk() method. If other
> dst_confirm() users are left, they should be checked
> if dsts with rt_gateway = 0 can be wrongly used.
> 
> Regards
> 
> --
> Julian Anastasov <j...@ssi.bg>
> 
> .
> 

Sorry for so late.

Based on your ideas, I make a patch and test it for a while.

It works for me.

>From dc033fe4bac8931590e15774a298f04ccea8ed4c Mon Sep 17 00:00:00 2001
From: YueHabing <yuehaib...@huawei.com>
Date: Wed, 14 Dec 2016 02:43:51 -0500
Subject: [PATCH] arp: do neigh confirm based on sk arg

Signed-off-by: YueHabing <yuehaib...@huawei.com>
---
 include/net/sock.h     | 18 ++++++++++++++++++
 net/ipv4/ip_output.c   |  5 ++++-
 net/ipv4/ping.c        |  2 +-
 net/ipv4/raw.c         |  2 +-
 net/ipv4/tcp_input.c   |  8 ++------
 net/ipv4/tcp_metrics.c |  5 +++--
 net/ipv4/udp.c         |  2 +-
 net/ipv6/raw.c         |  2 +-
 net/ipv6/route.c       |  6 +++---
 net/ipv6/udp.c         |  2 +-
 10 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index 92b2697..ece8dfa 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -434,6 +434,7 @@ struct sock {
        struct sk_buff          *sk_send_head;
        __s32                   sk_peek_off;
        int                     sk_write_pending;
+       unsigned short          sk_dst_pending_confirm;
 #ifdef CONFIG_SECURITY
        void                    *sk_security;
 #endif
@@ -2263,6 +2264,23 @@ static inline void sk_state_store(struct sock *sk, int 
newstate)
        smp_store_release(&sk->sk_state, newstate);
 }

+static inline void dst_confirm_sk(struct sock *sk)
+{
+        sk->sk_dst_pending_confirm = 1;
+}
+
+static inline void dst_neigh_confirm_sk(struct sock *sk, struct neighbour *n)
+{
+        if (sk && sk->sk_dst_pending_confirm) {
+                unsigned long now = jiffies;
+
+                sk->sk_dst_pending_confirm = 0;
+                /* avoid dirtying neighbour */
+                if (n->confirmed != now)
+                        n->confirmed = now;
+        }
+}
+
 void sock_enable_timestamp(struct sock *sk, int flag);
 int sock_get_timestamp(struct sock *, struct timeval __user *);
 int sock_get_timestampns(struct sock *, struct timespec __user *);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 877bdb0..7c9b640 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -221,7 +221,10 @@ static int ip_finish_output2(struct net *net, struct sock 
*sk, struct sk_buff *s
        if (unlikely(!neigh))
                neigh = __neigh_create(&arp_tbl, &nexthop, dev, false);
        if (!IS_ERR(neigh)) {
-               int res = dst_neigh_output(dst, neigh, skb);
+               int res;
+
+               dst_neigh_confirm_sk(skb->sk, neigh);
+               res = dst_neigh_output(dst, neigh, skb);

                rcu_read_unlock_bh();
                return res;
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 96b8e2b..bcff1c6 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -847,7 +847,7 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
        return err;

 do_confirm:
-       dst_confirm(&rt->dst);
+       dst_confirm_sk(sk);
        if (!(msg->msg_flags & MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ecbe5a7..e01424d 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -664,7 +664,7 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
        return len;

 do_confirm:
-       dst_confirm(&rt->dst);
+       dst_confirm_sk(sk);
        if (!(msg->msg_flags & MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index c71d49c..92b2060 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -3702,9 +3702,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff 
*skb, int flag)
                tcp_process_tlp_ack(sk, ack, flag);

        if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
-               struct dst_entry *dst = __sk_dst_get(sk);
-               if (dst)
-                       dst_confirm(dst);
+               dst_confirm_sk(sk);
        }

        if (icsk->icsk_pending == ICSK_TIME_RETRANS)
@@ -6049,9 +6047,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff 
*skb)
                tcp_set_state(sk, TCP_FIN_WAIT2);
                sk->sk_shutdown |= SEND_SHUTDOWN;

-               dst = __sk_dst_get(sk);
-               if (dst)
-                       dst_confirm(dst);
+               dst_confirm_sk(sk);

                if (!sock_flag(sk, SOCK_DEAD)) {
                        /* Wake up lingering close() */
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index bf1f3b2..375ff08 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -379,7 +379,8 @@ void tcp_update_metrics(struct sock *sk)
                return;

        if (dst->flags & DST_HOST)
-               dst_confirm(dst);
+               dst_confirm_sk(sk);
+               

        rcu_read_lock();
        if (icsk->icsk_backoff || !tp->srtt_us) {
@@ -496,7 +497,7 @@ void tcp_init_metrics(struct sock *sk)
        if (!dst)
                goto reset;

-       dst_confirm(dst);
+       dst_confirm_sk(sk);

        rcu_read_lock();
        tm = tcp_get_metrics(sk, dst, true);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 5bab6c3..f7852d3 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1111,7 +1111,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
        return err;

 do_confirm:
-       dst_confirm(&rt->dst);
+       dst_confirm_sk(sk);
        if (!(msg->msg_flags&MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 054a1d8..bbb09a4 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -927,7 +927,7 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr 
*msg, size_t len)
        txopt_put(opt_to_free);
        return err < 0 ? err : len;
 do_confirm:
-       dst_confirm(dst);
+       dst_confirm_sk(sk);
        if (!(msg->msg_flags & MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 1b57e11..8df4671 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1356,7 +1356,7 @@ static bool rt6_cache_allowed_for_pmtu(const struct 
rt6_info *rt)
                (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
 }

-static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
+static void __ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                 const struct ipv6hdr *iph, u32 mtu)
 {
        struct rt6_info *rt6 = (struct rt6_info *)dst;
@@ -1367,7 +1367,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, 
const struct sock *sk,
        if (dst_metric_locked(dst, RTAX_MTU))
                return;

-       dst_confirm(dst);
+       dst_confirm_sk(sk);
        mtu = max_t(u32, mtu, IPV6_MIN_MTU);
        if (mtu >= dst_mtu(dst))
                return;
@@ -2248,7 +2248,7 @@ static void rt6_do_redirect(struct dst_entry *dst, struct 
sock *sk, struct sk_bu
         * Look, redirects are sent only in response to data packets,
         * so that this nexthop apparently is reachable. --ANK
         */
-       dst_confirm(&rt->dst);
+       dst_confirm_sk(sk);

        neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
        if (!neigh)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index e4a8000..6057101 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -1309,7 +1309,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t len)
        return err;

 do_confirm:
-       dst_confirm(dst);
+       dst_confirm_sk(sk);
        if (!(msg->msg_flags&MSG_PROBE) || len)
                goto back_from_confirm;
        err = 0;
-- 
1.8.3.1

Reply via email to