It is possible that two concurrent packets originating from the same
socket of a connection-less protocol (e.g. UDP) can end up having
different IP_CT_DIR_REPLY tuples which results in one of the packets
being dropped.

To illustrate this, consider the following simplified scenario:

1. No DNAT/SNAT/MASQUEARADE rules are installed, but the nf_nat module
   is loaded.
2. Packet A and B are sent at the same time from two different threads
   via the same UDP socket which hasn't been used before (=no CT has
   been created before). Both packets have the same IP_CT_DIR_ORIGINAL
   tuple.
3. CT of A has been created and confirmed, afterwards get_unique_tuple
   is called for B. Because IP_CT_DIR_REPLY tuple (the inverse of
   the IP_CT_DIR_ORIGINAL tuple) is already taken by the A's confirmed
   CT (nf_nat_used_tuple finds it), get_unique_tuple calls UDP's
   unique_tuple which returns a different IP_CT_DIR_REPLY tuple (usually
   with src port = 1024)
4. B's CT cannot get confirmed in __nf_conntrack_confirm due to
   the found IP_CT_DIR_ORIGINAL tuple of A and the different
   IP_CT_DIR_REPLY tuples, thus the packet B gets dropped.

This patch modifies get_unique_tuple in a way that the function might
return the already used by a confirmed CT reply tuple if a L4 protocol
allows the clash resolution and IP_CT_DIR_ORIGINAL tuples are equal.

Signed-off-by: Martynas Pumputis <[email protected]>

---

I've tested the patch with https://github.com/brb/conntrack-race in three
different scenarios (no NAT, SNAT, DNAT) by checking "insert_failed" and
"drop" counters, PCAP traces and dynamic debug logs.
---
 include/net/netfilter/nf_conntrack.h     |  5 ++--
 include/net/netfilter/nf_nat.h           |  3 ++-
 net/ipv4/netfilter/nf_nat_proto_gre.c    |  2 +-
 net/ipv4/netfilter/nf_nat_proto_icmp.c   |  2 +-
 net/ipv6/netfilter/nf_nat_proto_icmpv6.c |  2 +-
 net/netfilter/nf_conntrack_core.c        | 12 ++++++---
 net/netfilter/nf_nat_core.c              | 34 +++++++++++++++++++-----
 net/netfilter/nf_nat_proto_common.c      |  2 +-
 8 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h 
b/include/net/netfilter/nf_conntrack.h
index 062dc19b5840..498d5d8159f5 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -135,8 +135,9 @@ void nf_conntrack_alter_reply(struct nf_conn *ct,
 
 /* Is this tuple taken? (ignoring any belonging to the given
    conntrack). */
-int nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
-                            const struct nf_conn *ignored_conntrack);
+int nf_conntrack_reply_tuple_taken(const struct nf_conntrack_tuple *tuple,
+                            const struct nf_conn *ignored_conntrack,
+                            bool ignore_same_orig);
 
 #define NFCT_INFOMASK  7UL
 #define NFCT_PTRMASK   ~(NFCT_INFOMASK)
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index a17eb2f8d40e..fee9737a65a7 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -49,7 +49,8 @@ struct nf_conn_nat *nf_ct_nat_ext_add(struct nf_conn *ct);
 
 /* Is this tuple already taken? (not by us)*/
 int nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
-                     const struct nf_conn *ignored_conntrack);
+                     const struct nf_conn *ignored_conntrack,
+                     bool ignore_same_orig);
 
 static inline struct nf_conn_nat *nfct_nat(const struct nf_conn *ct)
 {
diff --git a/net/ipv4/netfilter/nf_nat_proto_gre.c 
b/net/ipv4/netfilter/nf_nat_proto_gre.c
index 00fda6331ce5..c3083b68d3c2 100644
--- a/net/ipv4/netfilter/nf_nat_proto_gre.c
+++ b/net/ipv4/netfilter/nf_nat_proto_gre.c
@@ -72,7 +72,7 @@ gre_unique_tuple(const struct nf_nat_l3proto *l3proto,
 
        for (i = 0; ; ++key) {
                *keyptr = htons(min + key % range_size);
-               if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+               if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false))
                        return;
        }
 
diff --git a/net/ipv4/netfilter/nf_nat_proto_icmp.c 
b/net/ipv4/netfilter/nf_nat_proto_icmp.c
index 6d7cf1d79baf..589e9a9b5509 100644
--- a/net/ipv4/netfilter/nf_nat_proto_icmp.c
+++ b/net/ipv4/netfilter/nf_nat_proto_icmp.c
@@ -47,7 +47,7 @@ icmp_unique_tuple(const struct nf_nat_l3proto *l3proto,
        for (i = 0; ; ++id) {
                tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
                                             (id % range_size));
-               if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+               if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false))
                        return;
        }
        return;
diff --git a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c 
b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
index d9bf42ba44fa..cf47f5f549ee 100644
--- a/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
+++ b/net/ipv6/netfilter/nf_nat_proto_icmpv6.c
@@ -49,7 +49,7 @@ icmpv6_unique_tuple(const struct nf_nat_l3proto *l3proto,
        for (i = 0; ; ++id) {
                tuple->src.u.icmp.id = htons(ntohs(range->min_proto.icmp.id) +
                                             (id % range_size));
-               if (++i == range_size || !nf_nat_used_tuple(tuple, ct))
+               if (++i == range_size || !nf_nat_used_tuple(tuple, ct, false))
                        return;
        }
 }
diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index cc5ef8c9d0da..43a53eaff82c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -848,8 +848,9 @@ EXPORT_SYMBOL_GPL(__nf_conntrack_confirm);
 /* Returns true if a connection correspondings to the tuple (required
    for NAT). */
 int
-nf_conntrack_tuple_taken(const struct nf_conntrack_tuple *tuple,
-                        const struct nf_conn *ignored_conntrack)
+nf_conntrack_reply_tuple_taken(const struct nf_conntrack_tuple *tuple,
+                              const struct nf_conn *ignored_conntrack,
+                              bool ignore_same_orig)
 {
        struct net *net = nf_ct_net(ignored_conntrack);
        const struct nf_conntrack_zone *zone;
@@ -878,6 +879,11 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple 
*tuple,
                }
 
                if (nf_ct_key_equal(h, tuple, zone, net)) {
+                       if (ignore_same_orig &&
+                           
nf_ct_tuple_equal(&ignored_conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
+                                             
&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple)) {
+                               continue;
+                       }
                        NF_CT_STAT_INC_ATOMIC(net, found);
                        rcu_read_unlock();
                        return 1;
@@ -893,7 +899,7 @@ nf_conntrack_tuple_taken(const struct nf_conntrack_tuple 
*tuple,
 
        return 0;
 }
-EXPORT_SYMBOL_GPL(nf_conntrack_tuple_taken);
+EXPORT_SYMBOL_GPL(nf_conntrack_reply_tuple_taken);
 
 #define NF_CT_EVICTION_RANGE   8
 
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 46f9df99d276..12fb39d953e0 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -154,7 +154,8 @@ hash_by_src(const struct net *n, const struct 
nf_conntrack_tuple *tuple)
 /* Is this tuple already taken? (not by us) */
 int
 nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
-                 const struct nf_conn *ignored_conntrack)
+                 const struct nf_conn *ignored_conntrack,
+                 bool ignore_same_orig)
 {
        /* Conntrack tracking doesn't keep track of outgoing tuples; only
         * incoming ones.  NAT means they don't have a fixed mapping,
@@ -165,7 +166,15 @@ nf_nat_used_tuple(const struct nf_conntrack_tuple *tuple,
        struct nf_conntrack_tuple reply;
 
        nf_ct_invert_tuplepr(&reply, tuple);
-       return nf_conntrack_tuple_taken(&reply, ignored_conntrack);
+       /* If ignore_same_orig is enabled, the following function will ignore
+        * any matching CT with the same IP_CT_DIR_ORIGINAL tuple.
+        *
+        * Used when calling the function for a CT of a connection-less protocol
+        * such as UDP to ignore a clashing CT which originated from the same
+        * socket.
+        */
+       return nf_conntrack_reply_tuple_taken(&reply, ignored_conntrack,
+                                             ignore_same_orig);
 }
 EXPORT_SYMBOL(nf_nat_used_tuple);
 
@@ -323,7 +332,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
        const struct nf_conntrack_zone *zone;
        const struct nf_nat_l3proto *l3proto;
        const struct nf_nat_l4proto *l4proto;
+       const struct nf_conntrack_l4proto *ct_l4proto;
        struct net *net = nf_ct_net(ct);
+       bool ignore_same_orig = false;
 
        zone = nf_ct_zone(ct);
 
@@ -331,6 +342,16 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
        l3proto = __nf_nat_l3proto_find(orig_tuple->src.l3num);
        l4proto = __nf_nat_l4proto_find(orig_tuple->src.l3num,
                                        orig_tuple->dst.protonum);
+       ct_l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
+
+       /* If the protocol allows the clash resolution, then when searching
+        * for clashing CTs ignore the ones with the same IP_CT_DIR_ORIGINAL
+        * tuple as they originate from the same socket. This prevents from
+        * generating different reply tuples for two racing packets from
+        * the same connection-less (e.g. UDP) socket which results in dropping
+        * one of the packets in __nf_conntrack_confirm.
+        */
+       ignore_same_orig = ct_l4proto->allow_clash;
 
        /* 1) If this srcip/proto/src-proto-part is currently mapped,
         * and that same mapping gives a unique tuple within the given
@@ -344,14 +365,15 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
            !(range->flags & NF_NAT_RANGE_PROTO_RANDOM_ALL)) {
                /* try the original tuple first */
                if (in_range(l3proto, l4proto, orig_tuple, range)) {
-                       if (!nf_nat_used_tuple(orig_tuple, ct)) {
+                       if (!nf_nat_used_tuple(orig_tuple, ct,
+                                              ignore_same_orig)) {
                                *tuple = *orig_tuple;
                                goto out;
                        }
                } else if (find_appropriate_src(net, zone, l3proto, l4proto,
                                                orig_tuple, tuple, range)) {
                        pr_debug("get_unique_tuple: Found current src map\n");
-                       if (!nf_nat_used_tuple(tuple, ct))
+                       if (!nf_nat_used_tuple(tuple, ct, ignore_same_orig))
                                goto out;
                }
        }
@@ -372,9 +394,9 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple,
                                  &range->min_proto,
                                  &range->max_proto) &&
                            (range->min_proto.all == range->max_proto.all ||
-                            !nf_nat_used_tuple(tuple, ct)))
+                            !nf_nat_used_tuple(tuple, ct, ignore_same_orig)))
                                goto out;
-               } else if (!nf_nat_used_tuple(tuple, ct)) {
+               } else if (!nf_nat_used_tuple(tuple, ct, ignore_same_orig)) {
                        goto out;
                }
        }
diff --git a/net/netfilter/nf_nat_proto_common.c 
b/net/netfilter/nf_nat_proto_common.c
index 5d849d835561..851517cdfbd7 100644
--- a/net/netfilter/nf_nat_proto_common.c
+++ b/net/netfilter/nf_nat_proto_common.c
@@ -91,7 +91,7 @@ void nf_nat_l4proto_unique_tuple(const struct nf_nat_l3proto 
*l3proto,
 
        for (i = 0; ; ++off) {
                *portptr = htons(min + off % range_size);
-               if (++i != range_size && nf_nat_used_tuple(tuple, ct))
+               if (++i != range_size && nf_nat_used_tuple(tuple, ct, false))
                        continue;
                if (!(range->flags & (NF_NAT_RANGE_PROTO_RANDOM_ALL|
                                        NF_NAT_RANGE_PROTO_OFFSET)))
-- 
2.18.0

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to [email protected]
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to