The branch main has been updated by thj:

URL: 
https://cgit.FreeBSD.org/src/commit/?id=390dc369efaaeca2802baf168ddbd7a40e3afcc8

commit 390dc369efaaeca2802baf168ddbd7a40e3afcc8
Author:     Tom Jones <[email protected]>
AuthorDate: 2024-09-06 11:59:09 +0000
Commit:     Tom Jones <[email protected]>
CommitDate: 2024-09-06 12:48:04 +0000

    pf: Add support for endpoint independent NAT bindings for UDP
    
    With Endpoint Independent NAT bindings for UDP, flows from a NATed source
    address are always mapped to the same ip:port pair on the NAT router.
    This allows a client to connect to multiple external servers while
    appearing as the same host and enables NAT traversal without requiring
    the client to use a middlebox traversal protocol such as STUN or TURN.
    
    Introduce the 'endpoint-independent' option to NAT rules to allow
    configuration of endpoint independent NAT without affecting existing
    deployments.
    
    This change satisfies REQ 1 and 3 of RFC 4787, also known as 'full cone'
    NAT.
    
    Using Endpoint Independent NAT changes NAT exhaustion behaviour, but it
    does not introduce any additional security considerations compared to
    other forms of NAT.
    
    PR:             219803
    Co-authored-by: Damjan Jovanovic <[email protected]>
    Co-authored-by: Naman Sood <[email protected]>
    Reviewed-by:    kp
    Sponsored-by:   Tailscale
    Sponsored-by:   The FreeBSD Foundation
    Differential Revision: https://reviews.freebsd.org/D11137
---
 sbin/pfctl/parse.y               |  12 ++-
 sbin/pfctl/pfctl_parser.c        |   2 +
 sbin/pfctl/tests/files/pf1021.in |   1 +
 sbin/pfctl/tests/files/pf1021.ok |   1 +
 share/man/man4/pf.4              |   6 +-
 share/man/man5/pf.conf.5         |  12 ++-
 sys/net/pfvar.h                  |  49 +++++++++-
 sys/netpfil/pf/pf.c              | 195 ++++++++++++++++++++++++++++++++++++++-
 sys/netpfil/pf/pf.h              |   1 +
 sys/netpfil/pf/pf_lb.c           | 104 +++++++++++++++++----
 tests/sys/netpfil/pf/nat.sh      | 134 +++++++++++++++++++++++++++
 11 files changed, 489 insertions(+), 28 deletions(-)

diff --git a/sbin/pfctl/parse.y b/sbin/pfctl/parse.y
index 724ffefcd7d9..f54f24a14a7c 100644
--- a/sbin/pfctl/parse.y
+++ b/sbin/pfctl/parse.y
@@ -326,6 +326,7 @@ static struct pool_opts {
        int                      marker;
 #define POM_TYPE               0x01
 #define POM_STICKYADDRESS      0x02
+#define POM_ENDPI              0x04
        u_int8_t                 opts;
        int                      type;
        int                      staticport;
@@ -512,7 +513,7 @@ int parseport(char *, struct range *r, int);
 %token UPPERLIMIT QUEUE PRIORITY QLIMIT HOGS BUCKETS RTABLE TARGET INTERVAL
 %token DNPIPE DNQUEUE RIDENTIFIER
 %token LOAD RULESET_OPTIMIZATION PRIO
-%token STICKYADDRESS MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE
+%token STICKYADDRESS ENDPI MAXSRCSTATES MAXSRCNODES SOURCETRACK GLOBAL RULE
 %token MAXSRCCONN MAXSRCCONNRATE OVERLOAD FLUSH SLOPPY PFLOW
 %token TAGGED TAG IFBOUND FLOATING STATEPOLICY STATEDEFAULTS ROUTE SETTOS
 %token DIVERTTO DIVERTREPLY BRIDGE_TO
@@ -4593,6 +4594,14 @@ pool_opt : BITMASK       {
                        pool_opts.marker |= POM_STICKYADDRESS;
                        pool_opts.opts |= PF_POOL_STICKYADDR;
                }
+               | ENDPI {
+                       if (pool_opts.marker & POM_ENDPI) {
+                               yyerror("endpoint-independent cannot be 
redefined");
+                               YYERROR;
+                       }
+                       pool_opts.marker |= POM_ENDPI;
+                       pool_opts.opts |= PF_POOL_ENDPI;
+               }
                | MAPEPORTSET number '/' number '/' number {
                        if (pool_opts.mape.offset) {
                                yyerror("map-e-portset cannot be redefined");
@@ -6299,6 +6308,7 @@ lookup(char *s)
                { "dnqueue",            DNQUEUE},
                { "drop",               DROP},
                { "dup-to",             DUPTO},
+               { "endpoint-independent", ENDPI},
                { "ether",              ETHER},
                { "fail-policy",        FAILPOLICY},
                { "fairq",              FAIRQ},
diff --git a/sbin/pfctl/pfctl_parser.c b/sbin/pfctl/pfctl_parser.c
index e71b7b160495..a9416534626b 100644
--- a/sbin/pfctl/pfctl_parser.c
+++ b/sbin/pfctl/pfctl_parser.c
@@ -488,6 +488,8 @@ print_pool(struct pfctl_pool *pool, u_int16_t p1, u_int16_t 
p2,
        }
        if (pool->opts & PF_POOL_STICKYADDR)
                printf(" sticky-address");
+       if (pool->opts & PF_POOL_ENDPI)
+               printf(" endpoint-independent");
        if (id == PF_NAT && p1 == 0 && p2 == 0)
                printf(" static-port");
        if (pool->mape.offset > 0)
diff --git a/sbin/pfctl/tests/files/pf1021.in b/sbin/pfctl/tests/files/pf1021.in
new file mode 100644
index 000000000000..841b024157c6
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1021.in
@@ -0,0 +1 @@
+nat on vtnet1 inet from ! (vtnet1) to any -> (vtnet1) endpoint-independent
diff --git a/sbin/pfctl/tests/files/pf1021.ok b/sbin/pfctl/tests/files/pf1021.ok
new file mode 100644
index 000000000000..3b5b84e2e11b
--- /dev/null
+++ b/sbin/pfctl/tests/files/pf1021.ok
@@ -0,0 +1 @@
+nat on vtnet1 inet from ! (vtnet1) to any -> (vtnet1) round-robin 
endpoint-independent
diff --git a/share/man/man4/pf.4 b/share/man/man4/pf.4
index 9bfc75cb490d..3855d07faead 100644
--- a/share/man/man4/pf.4
+++ b/share/man/man4/pf.4
@@ -26,7 +26,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd September 2, 2024
+.Dd September 6, 2024
 .Dt PF 4
 .Os
 .Sh NAME
@@ -89,6 +89,10 @@ Should be power of 2.
 Default value is 32768.
 .It Va net.pf.rule_tag_hashsize
 Size of the hash table that stores tags.
+.It Va net.pf.udpendpoint_hashsize
+Size of the hash table that stores UDP endpoint mappings.
+Should be power of 2.
+Default value is 32768.
 .It Va net.pf.default_to_drop
 This value overrides
 .Cd "options PF_DEFAULT_TO_DROP"
diff --git a/share/man/man5/pf.conf.5 b/share/man/man5/pf.conf.5
index f04b0799741e..5aa936d509ed 100644
--- a/share/man/man5/pf.conf.5
+++ b/share/man/man5/pf.conf.5
@@ -27,7 +27,7 @@
 .\" ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 .\" POSSIBILITY OF SUCH DAMAGE.
 .\"
-.Dd June 24, 2024
+.Dd September 4, 2024
 .Dt PF.CONF 5
 .Os
 .Sh NAME
@@ -2278,6 +2278,16 @@ from modifying the source port on TCP and UDP packets.
 With
 .Ar nat
 rules, the
+.It Ar endpoint-independent
+With
+.Ar nat
+rules, the
+.Ar endpoint-independent
+option causes
+.Xr pf 4
+to always map connections from a UDP source address and port to the same
+NAT address and port.
+This feature implements "full-cone" NAT behavior.
 .Ar map-e-portset
 option enables the source port translation of MAP-E (RFC 7597) Customer Edge.
 In order to make the host act as a MAP-E Customer Edge, setting up a tunneling
diff --git a/sys/net/pfvar.h b/sys/net/pfvar.h
index 7b3c1c49696a..c123da37d2cb 100644
--- a/sys/net/pfvar.h
+++ b/sys/net/pfvar.h
@@ -940,6 +940,29 @@ struct pf_state_peer {
        u_int8_t        pad[1];
 };
 
+/* Keep synced with struct pf_udp_endpoint. */
+struct pf_udp_endpoint_cmp {
+       struct pf_addr  addr;
+       uint16_t        port;
+       sa_family_t     af;
+       uint8_t         pad[1];
+};
+
+struct pf_udp_endpoint {
+       struct pf_addr  addr;
+       uint16_t        port;
+       sa_family_t     af;
+       uint8_t         pad[1];
+
+       struct pf_udp_mapping *mapping;
+       LIST_ENTRY(pf_udp_endpoint) entry;
+};
+
+struct pf_udp_mapping {
+       struct pf_udp_endpoint endpoints[2];
+       u_int refs;
+};
+
 /* Keep synced with struct pf_state_key. */
 struct pf_state_key_cmp {
        struct pf_addr   addr[2];
@@ -1069,6 +1092,7 @@ struct pf_kstate {
        union pf_krule_ptr       nat_rule;
        struct pf_addr           rt_addr;
        struct pf_state_key     *key[2];        /* addresses stack and wire  */
+       struct pf_udp_mapping   *udp_mapping;
        struct pfi_kkif         *kif;
        struct pfi_kkif         *orig_kif;      /* The real kif, even if we're 
a floating state (i.e. if == V_pfi_all). */
        struct pfi_kkif         *rt_kif;
@@ -2124,17 +2148,28 @@ struct pf_idhash {
        struct mtx                      lock;
 };
 
+struct pf_udpendpointhash {
+       LIST_HEAD(, pf_udp_endpoint)    endpoints;
+       /* refcount is synchronized on the source endpoint's row lock */
+       struct mtx                      lock;
+};
+
 extern u_long          pf_ioctl_maxcount;
 VNET_DECLARE(u_long, pf_hashmask);
 #define V_pf_hashmask  VNET(pf_hashmask)
 VNET_DECLARE(u_long, pf_srchashmask);
 #define V_pf_srchashmask       VNET(pf_srchashmask)
+VNET_DECLARE(u_long, pf_udpendpointhashmask);
+#define V_pf_udpendpointhashmask       VNET(pf_udpendpointhashmask)
 #define        PF_HASHSIZ      (131072)
 #define        PF_SRCHASHSIZ   (PF_HASHSIZ/4)
+#define        PF_UDPENDHASHSIZ        (PF_HASHSIZ/4)
 VNET_DECLARE(struct pf_keyhash *, pf_keyhash);
 VNET_DECLARE(struct pf_idhash *, pf_idhash);
+VNET_DECLARE(struct pf_udpendpointhash *, pf_udpendpointhash);
 #define V_pf_keyhash   VNET(pf_keyhash)
 #define        V_pf_idhash     VNET(pf_idhash)
+#define        V_pf_udpendpointhash    VNET(pf_udpendpointhash)
 VNET_DECLARE(struct pf_srchash *, pf_srchash);
 #define        V_pf_srchash    VNET(pf_srchash)
 
@@ -2209,6 +2244,8 @@ VNET_DECLARE(uma_zone_t,   pf_state_z);
 #define        V_pf_state_z             VNET(pf_state_z)
 VNET_DECLARE(uma_zone_t,        pf_state_key_z);
 #define        V_pf_state_key_z         VNET(pf_state_key_z)
+VNET_DECLARE(uma_zone_t,        pf_udp_mapping_z);
+#define        V_pf_udp_mapping_z       VNET(pf_udp_mapping_z)
 VNET_DECLARE(uma_zone_t,        pf_state_scrub_z);
 #define        V_pf_state_scrub_z       VNET(pf_state_scrub_z)
 
@@ -2281,6 +2318,15 @@ extern struct pf_kstate          *pf_find_state_all(
 extern bool                    pf_find_state_all_exists(
                                    const struct pf_state_key_cmp *,
                                    u_int);
+extern struct pf_udp_mapping   *pf_udp_mapping_find(struct pf_udp_endpoint_cmp
+                                   *endpoint);
+extern struct pf_udp_mapping   *pf_udp_mapping_create(sa_family_t af,
+                                   struct pf_addr *src_addr, uint16_t src_port,
+                                   struct pf_addr *nat_addr, uint16_t 
nat_port);
+extern int                      pf_udp_mapping_insert(struct pf_udp_mapping
+                                   *mapping);
+extern void                     pf_udp_mapping_release(struct pf_udp_mapping
+                                   *mapping);
 extern struct pf_ksrc_node     *pf_find_src_node(struct pf_addr *,
                                    struct pf_krule *, sa_family_t,
                                    struct pf_srchash **, bool);
@@ -2574,7 +2620,8 @@ u_short                    pf_get_translation(struct 
pf_pdesc *, struct mbuf *,
                            struct pf_state_key **, struct pf_state_key **,
                            struct pf_addr *, struct pf_addr *,
                            uint16_t, uint16_t, struct pf_kanchor_stackframe *,
-                           struct pf_krule **);
+                           struct pf_krule **,
+                           struct pf_udp_mapping **udp_mapping);
 
 struct pf_state_key    *pf_state_key_setup(struct pf_pdesc *, struct mbuf *, 
int,
                            struct pf_addr *, struct pf_addr *, u_int16_t, 
u_int16_t);
diff --git a/sys/netpfil/pf/pf.c b/sys/netpfil/pf/pf.c
index f7fe75184efd..70220dda935e 100644
--- a/sys/netpfil/pf/pf.c
+++ b/sys/netpfil/pf/pf.c
@@ -283,6 +283,7 @@ VNET_DEFINE_STATIC(uma_zone_t,      pf_sources_z);
 uma_zone_t             pf_mtag_z;
 VNET_DEFINE(uma_zone_t,         pf_state_z);
 VNET_DEFINE(uma_zone_t,         pf_state_key_z);
+VNET_DEFINE(uma_zone_t,         pf_udp_mapping_z);
 
 VNET_DEFINE(struct unrhdr64, pf_stateid);
 
@@ -330,7 +331,7 @@ static int           pf_create_state(struct pf_krule *, 
struct pf_krule *,
                            struct pf_state_key *, struct mbuf *, int,
                            u_int16_t, u_int16_t, int *, struct pfi_kkif *,
                            struct pf_kstate **, int, u_int16_t, u_int16_t,
-                           int, struct pf_krule_slist *);
+                           int, struct pf_krule_slist *, struct pf_udp_mapping 
*);
 static int              pf_state_key_addr_setup(struct pf_pdesc *, struct mbuf 
*,
                            int, struct pf_state_key_cmp *, int, struct pf_addr 
*,
                            int, struct pf_addr *, int);
@@ -493,22 +494,29 @@ MALLOC_DEFINE(M_PF_RULE_ITEM, "pf_krule_item", "pf(4) 
rule items");
 VNET_DEFINE(struct pf_keyhash *, pf_keyhash);
 VNET_DEFINE(struct pf_idhash *, pf_idhash);
 VNET_DEFINE(struct pf_srchash *, pf_srchash);
+VNET_DEFINE(struct pf_udpendpointhash *, pf_udpendpointhash);
+VNET_DEFINE(struct pf_udpendpointmapping *, pf_udpendpointmapping);
 
 SYSCTL_NODE(_net, OID_AUTO, pf, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
     "pf(4)");
 
 VNET_DEFINE(u_long, pf_hashmask);
 VNET_DEFINE(u_long, pf_srchashmask);
+VNET_DEFINE(u_long, pf_udpendpointhashmask);
 VNET_DEFINE_STATIC(u_long, pf_hashsize);
 #define V_pf_hashsize  VNET(pf_hashsize)
 VNET_DEFINE_STATIC(u_long, pf_srchashsize);
 #define V_pf_srchashsize       VNET(pf_srchashsize)
+VNET_DEFINE_STATIC(u_long, pf_udpendpointhashsize);
+#define V_pf_udpendpointhashsize       VNET(pf_udpendpointhashsize)
 u_long pf_ioctl_maxcount = 65535;
 
 SYSCTL_ULONG(_net_pf, OID_AUTO, states_hashsize, CTLFLAG_VNET | CTLFLAG_RDTUN,
     &VNET_NAME(pf_hashsize), 0, "Size of pf(4) states hashtable");
 SYSCTL_ULONG(_net_pf, OID_AUTO, source_nodes_hashsize, CTLFLAG_VNET | 
CTLFLAG_RDTUN,
     &VNET_NAME(pf_srchashsize), 0, "Size of pf(4) source nodes hashtable");
+SYSCTL_ULONG(_net_pf, OID_AUTO, udpendpoint_hashsize, CTLFLAG_VNET | 
CTLFLAG_RDTUN,
+    &VNET_NAME(pf_udpendpointhashsize), 0, "Size of pf(4) endpoint hashtable");
 SYSCTL_ULONG(_net_pf, OID_AUTO, request_maxcount, CTLFLAG_RWTUN,
     &pf_ioctl_maxcount, 0, "Maximum number of tables, addresses, ... in a 
single ioctl() call");
 
@@ -699,6 +707,17 @@ pf_hashsrc(struct pf_addr *addr, sa_family_t af)
        return (h & V_pf_srchashmask);
 }
 
+static inline uint32_t
+pf_hashudpendpoint(struct pf_udp_endpoint *endpoint)
+{
+       uint32_t h;
+
+       h = murmur3_32_hash32((uint32_t *)endpoint,
+           sizeof(struct pf_udp_endpoint_cmp)/sizeof(uint32_t),
+           V_pf_hashseed);
+       return (h & V_pf_udpendpointhashmask);
+}
+
 #ifdef ALTQ
 static int
 pf_state_hash(struct pf_kstate *s)
@@ -1086,12 +1105,15 @@ pf_initialize(void)
        struct pf_keyhash       *kh;
        struct pf_idhash        *ih;
        struct pf_srchash       *sh;
+       struct pf_udpendpointhash       *uh;
        u_int i;
 
        if (V_pf_hashsize == 0 || !powerof2(V_pf_hashsize))
                V_pf_hashsize = PF_HASHSIZ;
        if (V_pf_srchashsize == 0 || !powerof2(V_pf_srchashsize))
                V_pf_srchashsize = PF_SRCHASHSIZ;
+       if (V_pf_udpendpointhashsize == 0 || 
!powerof2(V_pf_udpendpointhashsize))
+               V_pf_udpendpointhashsize = PF_UDPENDHASHSIZ;
 
        V_pf_hashseed = arc4random();
 
@@ -1154,6 +1176,30 @@ pf_initialize(void)
        for (i = 0, sh = V_pf_srchash; i <= V_pf_srchashmask; i++, sh++)
                mtx_init(&sh->lock, "pf_srchash", NULL, MTX_DEF);
 
+
+       /* UDP endpoint mappings. */
+       V_pf_udp_mapping_z = uma_zcreate("pf UDP mappings",
+           sizeof(struct pf_udp_mapping), NULL, NULL, NULL, NULL,
+           UMA_ALIGN_PTR, 0);
+       V_pf_udpendpointhash = mallocarray(V_pf_udpendpointhashsize,
+           sizeof(struct pf_udpendpointhash), M_PFHASH, M_NOWAIT | M_ZERO);
+       if (V_pf_udpendpointhash == NULL) {
+               printf("pf: Unable to allocate memory for "
+                   "udpendpoint_hashsize %lu.\n", V_pf_udpendpointhashsize);
+
+               V_pf_udpendpointhashsize = PF_UDPENDHASHSIZ;
+               V_pf_udpendpointhash = mallocarray(V_pf_udpendpointhashsize,
+                   sizeof(struct pf_udpendpointhash), M_PFHASH, M_WAITOK | 
M_ZERO);
+       }
+
+       V_pf_udpendpointhashmask = V_pf_udpendpointhashsize - 1;
+       for (i = 0, uh = V_pf_udpendpointhash;
+           i <= V_pf_udpendpointhashmask;
+           i++, uh++) {
+               mtx_init(&uh->lock, "pf_udpendpointhash", NULL,
+                   MTX_DEF | MTX_DUPOK);
+       }
+
        /* ALTQ */
        TAILQ_INIT(&V_pf_altqs[0]);
        TAILQ_INIT(&V_pf_altqs[1]);
@@ -1187,10 +1233,12 @@ pf_cleanup(void)
        struct pf_keyhash       *kh;
        struct pf_idhash        *ih;
        struct pf_srchash       *sh;
+       struct pf_udpendpointhash       *uh;
        struct pf_send_entry    *pfse, *next;
        u_int i;
 
-       for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash; i <= V_pf_hashmask;
+       for (i = 0, kh = V_pf_keyhash, ih = V_pf_idhash;
+           i <= V_pf_hashmask;
            i++, kh++, ih++) {
                KASSERT(LIST_EMPTY(&kh->keys), ("%s: key hash not empty",
                    __func__));
@@ -1209,6 +1257,15 @@ pf_cleanup(void)
        }
        free(V_pf_srchash, M_PFHASH);
 
+       for (i = 0, uh = V_pf_udpendpointhash;
+           i <= V_pf_udpendpointhashmask;
+           i++, uh++) {
+               KASSERT(LIST_EMPTY(&uh->endpoints),
+                   ("%s: udp endpoint hash not empty", __func__));
+               mtx_destroy(&uh->lock);
+       }
+       free(V_pf_udpendpointhash, M_PFHASH);
+
        STAILQ_FOREACH_SAFE(pfse, &V_pf_sendqueue, pfse_next, next) {
                m_freem(pfse->pfse_m);
                free(pfse, M_PFTEMP);
@@ -1218,6 +1275,7 @@ pf_cleanup(void)
        uma_zdestroy(V_pf_sources_z);
        uma_zdestroy(V_pf_state_z);
        uma_zdestroy(V_pf_state_key_z);
+       uma_zdestroy(V_pf_udp_mapping_z);
 }
 
 static int
@@ -1807,6 +1865,123 @@ pf_find_state_all_exists(const struct pf_state_key_cmp 
*key, u_int dir)
        return (false);
 }
 
+struct pf_udp_mapping *
+pf_udp_mapping_create(sa_family_t af, struct pf_addr *src_addr, uint16_t 
src_port,
+    struct pf_addr *nat_addr, uint16_t nat_port)
+{
+       struct pf_udp_mapping *mapping;
+
+       mapping = uma_zalloc(V_pf_udp_mapping_z, M_NOWAIT | M_ZERO);
+       if (mapping == NULL)
+               return (NULL);
+       PF_ACPY(&mapping->endpoints[0].addr, src_addr, af);
+       mapping->endpoints[0].port = src_port;
+       mapping->endpoints[0].af = af;
+       mapping->endpoints[0].mapping = mapping;
+       PF_ACPY(&mapping->endpoints[1].addr, nat_addr, af);
+       mapping->endpoints[1].port = nat_port;
+       mapping->endpoints[1].af = af;
+       mapping->endpoints[1].mapping = mapping;
+       refcount_init(&mapping->refs, 1);
+       return (mapping);
+}
+
+int
+pf_udp_mapping_insert(struct pf_udp_mapping *mapping)
+{
+       struct pf_udpendpointhash *h0, *h1;
+       struct pf_udp_endpoint *endpoint;
+       int ret = EEXIST;
+
+       h0 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[0])];
+       h1 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[1])];
+       if (h0 == h1) {
+               PF_HASHROW_LOCK(h0);
+       } else if (h0 < h1) {
+               PF_HASHROW_LOCK(h0);
+               PF_HASHROW_LOCK(h1);
+       } else {
+               PF_HASHROW_LOCK(h1);
+               PF_HASHROW_LOCK(h0);
+       }
+
+       LIST_FOREACH(endpoint, &h0->endpoints, entry) {
+               if (bcmp(endpoint, &mapping->endpoints[0],
+                   sizeof(struct pf_udp_endpoint_cmp)) == 0)
+                       break;
+       }
+       if (endpoint != NULL)
+               goto cleanup;
+       LIST_FOREACH(endpoint, &h1->endpoints, entry) {
+               if (bcmp(endpoint, &mapping->endpoints[1],
+                   sizeof(struct pf_udp_endpoint_cmp)) == 0)
+                       break;
+       }
+       if (endpoint != NULL)
+               goto cleanup;
+       LIST_INSERT_HEAD(&h0->endpoints, &mapping->endpoints[0], entry);
+       LIST_INSERT_HEAD(&h1->endpoints, &mapping->endpoints[1], entry);
+       ret = 0;
+
+cleanup:
+       if (h0 != h1) {
+               PF_HASHROW_UNLOCK(h0);
+               PF_HASHROW_UNLOCK(h1);
+       } else {
+               PF_HASHROW_UNLOCK(h0);
+       }
+       return (ret);
+}
+
+void
+pf_udp_mapping_release(struct pf_udp_mapping *mapping)
+{
+       /* refcount is synchronized on the source endpoint's row lock */
+       struct pf_udpendpointhash *h0, *h1;
+
+       if (mapping == NULL)
+               return;
+
+       h0 = &V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[0])];
+       PF_HASHROW_LOCK(h0);
+       if (refcount_release(&mapping->refs)) {
+               LIST_REMOVE(&mapping->endpoints[0], entry);
+               PF_HASHROW_UNLOCK(h0);
+               h1 = 
&V_pf_udpendpointhash[pf_hashudpendpoint(&mapping->endpoints[1])];
+               PF_HASHROW_LOCK(h1);
+               LIST_REMOVE(&mapping->endpoints[1], entry);
+               PF_HASHROW_UNLOCK(h1);
+
+               uma_zfree(V_pf_udp_mapping_z, mapping);
+       } else {
+                       PF_HASHROW_UNLOCK(h0);
+       }
+}
+
+
+struct pf_udp_mapping *
+pf_udp_mapping_find(struct pf_udp_endpoint_cmp *key)
+{
+       struct pf_udpendpointhash *uh;
+       struct pf_udp_endpoint *endpoint;
+
+       uh = &V_pf_udpendpointhash[pf_hashudpendpoint((struct 
pf_udp_endpoint*)key)];
+
+       PF_HASHROW_LOCK(uh);
+       LIST_FOREACH(endpoint, &uh->endpoints, entry) {
+               if (bcmp(endpoint, key, sizeof(struct pf_udp_endpoint_cmp)) == 
0 &&
+                       bcmp(endpoint, &endpoint->mapping->endpoints[0],
+                           sizeof(struct pf_udp_endpoint_cmp)) == 0)
+                       break;
+       }
+       if (endpoint == NULL) {
+               PF_HASHROW_UNLOCK(uh);
+               return (NULL);
+       }
+       refcount_acquire(&endpoint->mapping->refs);
+       PF_HASHROW_UNLOCK(uh);
+       return (endpoint->mapping);
+}
 /* END state table stuff */
 
 static void
@@ -2423,6 +2598,9 @@ pf_unlink_state(struct pf_kstate *s)
        PF_HASHROW_UNLOCK(ih);
 
        pf_detach_state(s);
+
+       pf_udp_mapping_release(s->udp_mapping);
+
        /* pf_state_insert() initialises refs to 2 */
        return (pf_release_staten(s, 2));
 }
@@ -4686,6 +4864,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, 
struct pfi_kkif *kif,
        u_int16_t                bproto_sum = 0, bip_sum = 0;
        u_int8_t                 icmptype = 0, icmpcode = 0;
        struct pf_kanchor_stackframe    anchor_stack[PF_ANCHOR_STACKSIZE];
+       struct pf_udp_mapping   *udp_mapping = NULL;
 
        PF_RULES_RASSERT();
 
@@ -4760,7 +4939,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, 
struct pfi_kkif *kif,
 
        /* check packet for BINAT/NAT/RDR */
        transerror = pf_get_translation(pd, m, off, kif, &nsn, &sk,
-           &nk, saddr, daddr, sport, dport, anchor_stack, &nr);
+           &nk, saddr, daddr, sport, dport, anchor_stack, &nr, &udp_mapping);
        switch (transerror) {
        default:
                /* A translation error occurred. */
@@ -5058,8 +5237,9 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, 
struct pfi_kkif *kif,
                int action;
                action = pf_create_state(r, nr, a, pd, nsn, nk, sk, m, off,
                    sport, dport, &rewrite, kif, sm, tag, bproto_sum, bip_sum,
-                   hdrlen, &match_rules);
+                   hdrlen, &match_rules, udp_mapping);
                if (action != PF_PASS) {
+                       pf_udp_mapping_release(udp_mapping);
                        if (action == PF_DROP &&
                            (r->rule_flag & PFRULE_RETURN))
                                pf_return(r, nr, pd, sk, off, m, th, kif,
@@ -5075,6 +5255,7 @@ pf_test_rule(struct pf_krule **rm, struct pf_kstate **sm, 
struct pfi_kkif *kif,
 
                uma_zfree(V_pf_state_key_z, sk);
                uma_zfree(V_pf_state_key_z, nk);
+               pf_udp_mapping_release(udp_mapping);
        }
 
        /* copy back packet headers if we performed NAT operations */
@@ -5102,6 +5283,8 @@ cleanup:
 
        uma_zfree(V_pf_state_key_z, sk);
        uma_zfree(V_pf_state_key_z, nk);
+       pf_udp_mapping_release(udp_mapping);
+
        return (PF_DROP);
 }
 
@@ -5111,7 +5294,7 @@ pf_create_state(struct pf_krule *r, struct pf_krule *nr, 
struct pf_krule *a,
     struct pf_state_key *sk, struct mbuf *m, int off, u_int16_t sport,
     u_int16_t dport, int *rewrite, struct pfi_kkif *kif, struct pf_kstate **sm,
     int tag, u_int16_t bproto_sum, u_int16_t bip_sum, int hdrlen,
-    struct pf_krule_slist *match_rules)
+    struct pf_krule_slist *match_rules, struct pf_udp_mapping *udp_mapping)
 {
        struct pf_kstate        *s = NULL;
        struct pf_ksrc_node     *sn = NULL;
@@ -5328,6 +5511,8 @@ pf_create_state(struct pf_krule *r, struct pf_krule *nr, 
struct pf_krule *a,
                return (PF_SYNPROXY_DROP);
        }
 
+       s->udp_mapping = udp_mapping;
+
        return (PF_PASS);
 
 csfailed:
diff --git a/sys/netpfil/pf/pf.h b/sys/netpfil/pf/pf.h
index d5ab4f03a96d..6370164cb291 100644
--- a/sys/netpfil/pf/pf.h
+++ b/sys/netpfil/pf/pf.h
@@ -129,6 +129,7 @@ enum        { PF_ADDR_ADDRMASK, PF_ADDR_NOROUTE, 
PF_ADDR_DYNIFTL,
          PF_ADDR_RANGE };
 #define PF_POOL_TYPEMASK       0x0f
 #define PF_POOL_STICKYADDR     0x20
+#define PF_POOL_ENDPI          0x40
 #define        PF_WSCALE_FLAG          0x80
 #define        PF_WSCALE_MASK          0x0f
 
diff --git a/sys/netpfil/pf/pf_lb.c b/sys/netpfil/pf/pf_lb.c
index 6b0b95e9ce01..cdd68aaf5dab 100644
--- a/sys/netpfil/pf/pf_lb.c
+++ b/sys/netpfil/pf/pf_lb.c
@@ -62,7 +62,8 @@ static struct pf_krule        *pf_match_translation(struct 
pf_pdesc *, struct mbuf *,
                            uint16_t, int, struct pf_kanchor_stackframe *);
 static int pf_get_sport(sa_family_t, uint8_t, struct pf_krule *,
     struct pf_addr *, uint16_t, struct pf_addr *, uint16_t, struct pf_addr *,
-    uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **);
+    uint16_t *, uint16_t, uint16_t, struct pf_ksrc_node **,
+    struct pf_udp_mapping **);
 
 #define mix(a,b,c) \
        do {                                    \
@@ -216,14 +217,47 @@ static int
 pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r,
     struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr,
     uint16_t dport, struct pf_addr *naddr, uint16_t *nport, uint16_t low,
-    uint16_t high, struct pf_ksrc_node **sn)
+    uint16_t high, struct pf_ksrc_node **sn,
+    struct pf_udp_mapping **udp_mapping)
 {
        struct pf_state_key_cmp key;
        struct pf_addr          init_addr;
+       struct pf_srchash       *sh = NULL;
 
        bzero(&init_addr, sizeof(init_addr));
+
+       MPASS(*udp_mapping == NULL);
+
+       /*
+        * If we are UDP and have an existing mapping we can get source port
+        * from the mapping. In this case we have to look up the src_node as
+        * pf_map_addr would.
+        */
+       if (proto == IPPROTO_UDP && (r->rpool.opts & PF_POOL_ENDPI)) {
+               struct pf_udp_endpoint_cmp udp_source;
+
+               bzero(&udp_source, sizeof(udp_source));
+               udp_source.af = af;
+               PF_ACPY(&udp_source.addr, saddr, af);
+               udp_source.port = sport;
+               *udp_mapping = pf_udp_mapping_find(&udp_source);
+               if (*udp_mapping) {
+                       PF_ACPY(naddr, &(*udp_mapping)->endpoints[1].addr, af);
+                       *nport = (*udp_mapping)->endpoints[1].port;
+                       /* Try to find a src_node as per pf_map_addr(). */
+                       if (*sn == NULL && r->rpool.opts & PF_POOL_STICKYADDR &&
+                           (r->rpool.opts & PF_POOL_TYPEMASK) != PF_POOL_NONE)
+                               *sn = pf_find_src_node(saddr, r, af, &sh, 0);
+                       return (0);
+               } else {
+                       *udp_mapping = pf_udp_mapping_create(af, saddr, sport, 
&init_addr, 0);
+                       if (*udp_mapping == NULL)
+                               return (1);
+               }
+       }
+
        if (pf_map_addr(af, r, saddr, naddr, NULL, &init_addr, sn))
-               return (1);
+               goto failed;
 
        if (proto == IPPROTO_ICMP) {
                if (*nport == htons(ICMP_ECHO)) {
@@ -250,6 +284,8 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct 
pf_krule *r,
 
        do {
                PF_ACPY(&key.addr[1], naddr, key.af);
+               if (*udp_mapping)
+                       PF_ACPY(&(*udp_mapping)->endpoints[1].addr, naddr, af);
 
                /*
                 * port search; start random, step;
@@ -277,8 +313,16 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct 
pf_krule *r,
                } else if (low == high) {
                        key.port[1] = htons(low);
                        if (!pf_find_state_all_exists(&key, PF_IN)) {
-                               *nport = htons(low);
-                               return (0);
+                               if (*udp_mapping != NULL) {
+                                       (*udp_mapping)->endpoints[1].port = 
htons(low);
+                                       if (pf_udp_mapping_insert(*udp_mapping) 
== 0) {
+                                               *nport = htons(low);
+                                               return (0);
+                                       }
+                               } else {
+                                       *nport = htons(low);
+                                       return (0);
+                               }
                        }
                } else {
                        uint32_t tmp;
@@ -293,18 +337,35 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct 
pf_krule *r,
                        cut = arc4random() % (1 + high - low) + low;
                        /* low <= cut <= high */
                        for (tmp = cut; tmp <= high && tmp <= 0xffff; ++tmp) {
-                               key.port[1] = htons(tmp);
-                               if (!pf_find_state_all_exists(&key, PF_IN)) {
-                                       *nport = htons(tmp);
-                                       return (0);
+                               if (*udp_mapping != NULL) {
+                                       (*udp_mapping)->endpoints[1].port = 
htons(tmp);
+                                       if (pf_udp_mapping_insert(*udp_mapping) 
== 0) {
+                                               *nport = htons(tmp);
+                                               return (0);
+                                       }
+                               } else {
+                                       key.port[1] = htons(tmp);
+                                       if (!pf_find_state_all_exists(&key, 
PF_IN)) {
+                                               *nport = htons(tmp);
+                                               return (0);
+                                       }
                                }
                        }
                        tmp = cut;
                        for (tmp -= 1; tmp >= low && tmp <= 0xffff; --tmp) {
-                               key.port[1] = htons(tmp);
-                               if (!pf_find_state_all_exists(&key, PF_IN)) {
-                                       *nport = htons(tmp);
-                                       return (0);
+                               if (proto == IPPROTO_UDP &&
+                                   (r->rpool.opts & PF_POOL_ENDPI)) {
+                                       (*udp_mapping)->endpoints[1].port = 
htons(tmp);
+                                       if (pf_udp_mapping_insert(*udp_mapping) 
== 0) {
+                                               *nport = htons(tmp);
+                                               return (0);
+                                       }
+                               } else {
+                                       key.port[1] = htons(tmp);
+                                       if (!pf_find_state_all_exists(&key, 
PF_IN)) {
+                                               *nport = htons(tmp);
+                                               return (0);
+                                       }
                                }
                        }
                }
@@ -326,6 +387,10 @@ pf_get_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r,
                        return (1);
                }
        } while (! PF_AEQ(&init_addr, naddr, af) );
+
+failed:
+       uma_zfree(V_pf_udp_mapping_z, *udp_mapping);
+       *udp_mapping = NULL;
        return (1);                                     /* none available */
 }
 
@@ -333,7 +398,7 @@ static int
 pf_get_mape_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r,
     struct pf_addr *saddr, uint16_t sport, struct pf_addr *daddr,
     uint16_t dport, struct pf_addr *naddr, uint16_t *nport,
-    struct pf_ksrc_node **sn)
+    struct pf_ksrc_node **sn, struct pf_udp_mapping **udp_mapping)
 {
        uint16_t psmask, low, highmask;
        uint16_t i, ahigh, cut;
@@ -353,13 +418,13 @@ pf_get_mape_sport(sa_family_t af, u_int8_t proto, struct pf_krule *r,
        for (i = cut; i <= ahigh; i++) {
                low = (i << ashift) | psmask;
                if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport,
-                   naddr, nport, low, low | highmask, sn))
+                   naddr, nport, low, low | highmask, sn, udp_mapping))
                        return (0);
        }
        for (i = cut - 1; i > 0; i--) {
                low = (i << ashift) | psmask;
                if (!pf_get_sport(af, proto, r, saddr, sport, daddr, dport,
-                   naddr, nport, low, low | highmask, sn))
+                   naddr, nport, low, low | highmask, sn, udp_mapping))
                        return (0);
        }
        return (1);
@@ -597,7 +662,8 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
     struct pf_state_key **skp, struct pf_state_key **nkp,
     struct pf_addr *saddr, struct pf_addr *daddr,
     uint16_t sport, uint16_t dport, struct pf_kanchor_stackframe *anchor_stack,
-    struct pf_krule **rp)
+    struct pf_krule **rp,
+    struct pf_udp_mapping **udp_mapping)
 {
        struct pf_krule *r = NULL;
        struct pf_addr  *naddr;
@@ -661,7 +727,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
                }
                if (r->rpool.mape.offset > 0) {
                        if (pf_get_mape_sport(pd->af, pd->proto, r, saddr,
-                           sport, daddr, dport, naddr, nportp, sn)) {
+                           sport, daddr, dport, naddr, nportp, sn, 
udp_mapping)) {
                                DPFPRINTF(PF_DEBUG_MISC,
                                    ("pf: MAP-E port allocation (%u/%u/%u)"
                                    " failed\n",
@@ -672,7 +738,7 @@ pf_get_translation(struct pf_pdesc *pd, struct mbuf *m, int off,
                                goto notrans;
                        }
                } else if (pf_get_sport(pd->af, pd->proto, r, saddr, sport,
-                   daddr, dport, naddr, nportp, low, high, sn)) {
+                   daddr, dport, naddr, nportp, low, high, sn, udp_mapping)) {
                        DPFPRINTF(PF_DEBUG_MISC,
                            ("pf: NAT proxy port allocation (%u-%u) failed\n",
                            r->rpool.proxy_port[0], r->rpool.proxy_port[1]));
diff --git a/tests/sys/netpfil/pf/nat.sh b/tests/sys/netpfil/pf/nat.sh
index 513abfa5e040..aaa49805c772 100644
--- a/tests/sys/netpfil/pf/nat.sh
+++ b/tests/sys/netpfil/pf/nat.sh
@@ -112,6 +112,139 @@ nested_anchor_body()
 
 }
 
+atf_test_case "endpoint_independent" "cleanup"
+endpoint_independent_head()
+{
+       atf_set descr 'Test that a client behind NAT gets the same external IP:port for different servers'
+       atf_set require.user root
+}
+
+endpoint_independent_body()
+{
+       pft_init
+       filter="udp and dst port 1234"  # capture only the UDP probes sent below
+
+       epair_client=$(vnet_mkepair)
+       epair_nat=$(vnet_mkepair)
+       epair_server1=$(vnet_mkepair)
+       epair_server2=$(vnet_mkepair)
+       bridge=$(vnet_mkbridge)
+
+       vnet_mkjail nat ${epair_client}b ${epair_nat}a
+       vnet_mkjail client ${epair_client}a
+       vnet_mkjail server1 ${epair_server1}a
+       vnet_mkjail server2 ${epair_server2}a
+
+       ifconfig ${epair_server1}b up
+       ifconfig ${epair_server2}b up
+       ifconfig ${epair_nat}b up
+       ifconfig ${bridge} \
+               addm ${epair_server1}b \
+               addm ${epair_server2}b \
+               addm ${epair_nat}b \
+               up
+
+       jexec nat ifconfig ${epair_client}b 192.0.2.1/24 up
+       jexec nat ifconfig ${epair_nat}a 198.51.100.42/24 up
+       jexec nat sysctl net.inet.ip.forwarding=1
+
+       jexec client ifconfig ${epair_client}a 192.0.2.2/24 up
+       jexec client route add default 192.0.2.1
+
+       jexec server1 ifconfig ${epair_server1}a 198.51.100.32/24 up
+       jexec server2 ifconfig ${epair_server2}a 198.51.100.22/24 up
+
+       # Enable pf in the nat jail
+       jexec nat pfctl -e
+
+       # Phase 1 (default NAT rule): the two servers must see different tuples
+       pft_set_rules nat \
+               "nat on ${epair_nat}a inet from ! (${epair_nat}a) to any -> (${epair_nat}a)"
+
+       jexec server1 tcpdump -i ${epair_server1}a -w ${PWD}/server1.pcap \
+               --immediate-mode $filter &
+       server1tcppid="$!"
+       jexec server2 tcpdump -i ${epair_server2}a -w ${PWD}/server2.pcap \
+               --immediate-mode $filter &
+       server2tcppid="$!"
+
+       # send several probes from client port 4242 to each server
+       for i in $(seq 1 10); do
+               echo "ping" | jexec client nc -u 198.51.100.32 1234 -p 4242 -w 0
+               echo "ping" | jexec client nc -u 198.51.100.22 1234 -p 4242 -w 0
+       done
+
+       kill $server1tcppid
+       kill $server2tcppid
+
+       tuple_server1=$(tcpdump -r ${PWD}/server1.pcap | awk '{addr=$3} END {print addr}')
+       tuple_server2=$(tcpdump -r ${PWD}/server2.pcap | awk '{addr=$3} END {print addr}')
+
+       if [ -z "$tuple_server1" ]      # empty: no packet was captured
+       then
+               atf_fail "server1 did not receive connection from client (default)"
+       fi
+
+       if [ -z "$tuple_server2" ]      # empty: no packet was captured
+       then
+               atf_fail "server2 did not receive connection from client (default)"
+       fi
+
+       if [ "$tuple_server1" = "$tuple_server2" ]
+       then
+               echo "server1 tcpdump: $tuple_server1"
+               echo "server2 tcpdump: $tuple_server2"
+               atf_fail "Received same IP:port on server1 and server2 (default)"
+       fi
+
+       # Phase 2 (endpoint-independent rule): both servers must see the same tuple
+       pft_set_rules nat \
+               "nat on ${epair_nat}a inet from ! (${epair_nat}a) to any -> (${epair_nat}a) endpoint-independent"
+
+       jexec server1 tcpdump -i ${epair_server1}a -w ${PWD}/server1.pcap \
+               --immediate-mode $filter &
+       server1tcppid="$!"
+       jexec server2 tcpdump -i ${epair_server2}a -w ${PWD}/server2.pcap \
+               --immediate-mode $filter &
+       server2tcppid="$!"
+
+       # send several probes; sometimes one fails to go through
+       for i in $(seq 1 10); do
+               echo "ping" | jexec client nc -u 198.51.100.32 1234 -p 4242 -w 0
+               echo "ping" | jexec client nc -u 198.51.100.22 1234 -p 4242 -w 0
+       done
+
+       kill $server1tcppid
+       kill $server2tcppid
+
+       tuple_server1=$(tcpdump -r ${PWD}/server1.pcap | awk '{addr=$3} END {print addr}')
+       tuple_server2=$(tcpdump -r ${PWD}/server2.pcap | awk '{addr=$3} END {print addr}')
+
+       if [ -z "$tuple_server1" ]      # empty: no packet was captured
+       then
+               atf_fail "server1 did not receive connection from client (endpoint-independent)"
+       fi
+
+       if [ -z "$tuple_server2" ]      # empty: no packet was captured
+       then
+               atf_fail "server2 did not receive connection from client (endpoint-independent)"
+       fi
+
+       if [ "$tuple_server1" != "$tuple_server2" ]
+       then
+               echo "server1 tcpdump: $tuple_server1"
+               echo "server2 tcpdump: $tuple_server2"
+               atf_fail "Received different IP:port on server1 than server2 (endpoint-independent)"
+       fi
+}
+
+endpoint_independent_cleanup()
+{
+       pft_cleanup
+       rm -f server1.pcap      # capture file written by the test body (tcpdump -w)
+       rm -f server2.pcap      # capture file written by the test body (tcpdump -w)
+}
+
 nested_anchor_cleanup()
*** 8 LINES SKIPPED ***

Reply via email to