Hello,
A patch which fixes the problem is below. The issue was, in a way, introduced
by my earlier commit [1] to fix another problem reported by giovanni@.
It has turned out my earlier change was incomplete and actually
made the reported problem worse.
the original problem reported by giovanni@ shows symptoms as follows:
the nat rule:
pass out from 192.168.1.0/24 to any nat-to { 49.0.0.1 } round-robin
translates matching packets to:
0.0.1.243, 0.0.1.244, ...
which is completely wrong. My commit [2] fixes that so we get expected
result: all matching packets get translated to 49.0.0.1
Unfortunately the earlier change [2] makes things worse for the rule reported
by Hrvoje:
pass out from 192.168.1.0/24 to any nat-to { 49/27 } round-robin
To understand how things end up so badly we have to take a look
at how the 'nat-to' argument is handled in pf_lb.c. There are five possible
ways the 'nat-to' argument is interpreted:
a) radix table
if 'nat-to' refers to interface: 'nat-to em0'
b) dynamic table
if 'nat-to' refers to interface like that: 'nat-to (em0)'
c) single IP address
if 'nat-to' is written as follows: 'nat-to { 1.2.3.4 }'
d) table of IP addresses:
'nat-to {1.2.3.4, 10.20.30.40}'
e) single IP address which represents network:
'nat-to {49/27}'
All forms above are implemented as 'struct pf_pool' in pf(4).
Variants 'nat-to {1.2.3.4}' and 'nat-to 1.2.3.4' are equal: the
resulting pf_pool instance is the same. Variants a, b and d are
not interesting for us, because the bug occurs in variants c and e.
to keep c and e working those three actors:
pf_pool.addr.v.a.addr
pf_pool.addr.v.a.mask
pf_pool.counter
must play well together when the 'round-robin' option is specified. Here
the pf_pool.counter remembers the last address 'retrieved' from the pool,
so pf(4) can select a new address for the next packet.
So in case 49/27 we are supposed to be selecting addresses:
49.0.0.1, 49.0.0.2, ..., 49.0.0.30, 49.0.0.1
we need to make sure selection mechanism skips network
address (49.0.0.0) and network broadcast address (49.0.0.31).
With my earlier commit [2] in tree this fails to happen.
As soon as we load the 'nat-to {49/27}' rule into the kernel, the
pf_pool.counter value is 0; this triggers the condition
I've introduced [2]:
488 case PF_POOL_ROUNDROBIN:
...
501 } else if (PF_AZERO(&rpool->counter, af)) {
502 /*
503 * fall back to POOL_NONE if there are no addresses
in
504 * pool
505 */
506 pf_addrcpy(naddr, raddr, af);
507 break;
The fallback action translates the packet to 49.0.0.0 without updating
pf_pool.counter. The code above needs more attention:
we need to fall back if and only if the pool comes with a
single /32 (/128) IP address.
I still don't know why things go that wild on the net_lock
when pf(4) starts to translate packets to 49.0.0.0.
I have not investigated that far yet.
The glitch for {49/27} with 'random' is kind of similar. The current
code reads as follows:
408 case PF_POOL_RANDOM:
409 if (rpool->addr.type == PF_ADDR_TABLE ||
410 rpool->addr.type == PF_ADDR_DYNIFTL) {
...
427 pf_addrcpy(naddr, &rpool->counter, af);
428 } else if (init_addr != NULL && PF_AZERO(init_addr, af)) {
429 switch (af) {
430 case AF_INET:
431 rpool->counter.addr32[0] = arc4random();
432 break;
433 #ifdef INET6
434 case AF_INET6:
...
454 pf_poolmask(naddr, raddr, rmask, &rpool->counter,
af);
455 pf_addrcpy(init_addr, naddr, af);
at line 431 we use arc4random() to generate 32bit integer. at line 454 we
combine the random number with network mask (/27 in our case).
Tests were conducted on UDP streams where 60k/sec unique UDP packets were
sent. On Hrvoje's network there were 370232 UDP packets with unique source
addresses sent in 5 secs. 11610 of them ended up being translated to 49.0.0.0,
i.e. ~3% of the packets we saw were translated to an invalid address. Why?
Because at line 431 we select a random number from the entire integer range;
however, we must do the random selection on a significantly smaller range:
<1, 30> in the case of a /27 prefix.
That's the issue I've described above. It's still kind of a mystery why the
net_lock becomes contended when we attempt to transmit to 49.0.0.0.
The diff below addresses all the issues I've described above.
OK to commit?
thanks and
regards
sashan
[1] https://marc.info/?l=openbsd-cvs&m=164500117319660&w=2
[2] https://marc.info/?l=openbsd-bugs&m=164458056429812&w=2
--------8<---------------8<---------------8<------------------8<--------
diff --git a/sys/net/pf_lb.c b/sys/net/pf_lb.c
index d106073d372..5aa7f252fa3 100644
--- a/sys/net/pf_lb.c
+++ b/sys/net/pf_lb.c
@@ -344,6 +344,44 @@ pf_map_addr_sticky(sa_family_t af, struct pf_rule *r,
struct pf_addr *saddr,
return (0);
}
+uint32_t
+pf_rand_addr(uint32_t mask)
+{
+ uint32_t addr;
+
+ if (mask == 0)
+ return (arc4random());
+
+ mask = ~ntohl(mask);
+ do {
+ addr = arc4random_uniform(mask + 1);
+ } while ((addr == 0) || (addr == mask));
+
+ return (htonl(addr));
+}
+
+int
+pf_is_broadcast_addr(struct pf_addr *a, struct pf_addr *m, sa_family_t af)
+{
+ int rv;
+
+ switch (af) {
+ case AF_INET:
+ rv = (a->addr32[0] == ~m->addr32[0]);
+ break;
+ case AF_INET6:
+ rv = ((a->addr32[0] == ~m->addr32[0]) &&
+ (a->addr32[1] == ~m->addr32[1]) &&
+ (a->addr32[2] == ~m->addr32[2]) &&
+ (a->addr32[3] == ~m->addr32[3]));
+ break;
+ default:
+ unhandled_af(af);
+ }
+
+ return (rv);
+}
+
int
pf_map_addr(sa_family_t af, struct pf_rule *r, struct pf_addr *saddr,
struct pf_addr *naddr, struct pf_addr *init_addr, struct pf_src_node **sns,
@@ -428,24 +466,29 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct
pf_addr *saddr,
} else if (init_addr != NULL && PF_AZERO(init_addr, af)) {
switch (af) {
case AF_INET:
- rpool->counter.addr32[0] = arc4random();
+ rpool->counter.addr32[0] = pf_rand_addr(
+ rmask->addr32[0]);
break;
#ifdef INET6
case AF_INET6:
if (rmask->addr32[3] != 0xffffffff)
- rpool->counter.addr32[3] = arc4random();
+ rpool->counter.addr32[3] = pf_rand_addr(
+ rmask->addr32[3]);
else
break;
if (rmask->addr32[2] != 0xffffffff)
- rpool->counter.addr32[2] = arc4random();
+ rpool->counter.addr32[2] = pf_rand_addr(
+ rmask->addr32[2]);
else
break;
if (rmask->addr32[1] != 0xffffffff)
- rpool->counter.addr32[1] = arc4random();
+ rpool->counter.addr32[1] = pf_rand_addr(
+ rmask->addr32[1]);
else
break;
if (rmask->addr32[0] != 0xffffffff)
- rpool->counter.addr32[0] = arc4random();
+ rpool->counter.addr32[0] = pf_rand_addr(
+ rmask->addr32[0]);
break;
#endif /* INET6 */
default:
@@ -500,11 +543,16 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct
pf_addr *saddr,
}
} else if (PF_AZERO(&rpool->counter, af)) {
/*
- * fall back to POOL_NONE if there are no addresses in
- * pool
+ * fall back to POOL_NONE if there is a single host
+ * address in pool.
*/
- pf_addrcpy(naddr, raddr, af);
- break;
+ if ((af == AF_INET &&
+ rmask->addr32[0] == INADDR_BROADCAST) ||
+ (af == AF_INET6 &&
+ IN6_ARE_ADDR_EQUAL(&rmask->v6, &in6mask128))) {
+ pf_addrcpy(naddr, raddr, af);
+ break;
+ }
} else if (pf_match_addr(0, raddr, rmask, &rpool->counter, af))
return (1);
@@ -532,10 +580,15 @@ pf_map_addr(sa_family_t af, struct pf_rule *r, struct
pf_addr *saddr,
weight = rpool->weight;
}
- pf_addrcpy(naddr, &rpool->counter, af);
- if (init_addr != NULL && PF_AZERO(init_addr, af))
- pf_addrcpy(init_addr, naddr, af);
pf_addr_inc(&rpool->counter, af);
+ if (init_addr != NULL && PF_AZERO(init_addr, af))
+ pf_addrcpy(init_addr, &rpool->counter, af);
+
+ /* skip network broadcast addresses */
+ if (pf_is_broadcast_addr(&rpool->counter, rmask, af))
+ pf_addr_inc(&rpool->counter, af);
+
+ pf_poolmask(naddr, raddr, rmask, &rpool->counter, af);
break;
case PF_POOL_LEASTSTATES:
/* retrieve an address first */