Hi, when bgpd changes a lot of routes (and runs with fib-updates yes), the routing socket of ospfd (and ospf6d, ripd, eigrpd) will get a lot of routing messages. When the kernel cannot write all the messages into the recipient's route socket, it will set the desync flag, and the daemon has to read the full routing table and reevaluate the routes. That comes with a cost.
Claudio and I are investigating various approaches to minimize this. One way is to increase the buffer size on the route socket. That alone does not help; the receiver simply does not get a chance to read the data fast enough. So here I try to give it a chance: I add the socket option SO_RDAEMON. A receiver sets it on a route socket. When the socket's receive buffer reaches 1/4 of the current maximum size, the kernel will yield() at the end of route_input(). With this I can shrink the buffer size to 1/2 of the current setting (8 times what it was until yesterday) and not get a DESYNC on coupling a full table from bgpd. To get this working I had to disable the sb_mbmax calculation in sbreserve() because that triggers a lot earlier. There is another mail from Claudio asking for advice on how to change sbreserve(). So right now this is a proof of concept. Opinions? diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c index 2f7ecfd2371..e3d24843a1c 100644 --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -1600,6 +1600,7 @@ sosetopt(struct socket *so, int level, int optname, struct mbuf *m) case SO_OOBINLINE: case SO_TIMESTAMP: case SO_ZEROIZE: + case SO_RDAEMON: if (m == NULL || m->m_len < sizeof (int)) return (EINVAL); if (*mtod(m, int *)) @@ -1770,6 +1771,7 @@ sogetopt(struct socket *so, int level, int optname, struct mbuf *m) case SO_OOBINLINE: case SO_TIMESTAMP: case SO_ZEROIZE: + case SO_RDAEMON: *mtod(m, int *) = so->so_options & optname; break; diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c index a7c6ba1f576..e60fc8d8256 100644 --- sys/kern/uipc_socket2.c +++ sys/kern/uipc_socket2.c @@ -464,8 +464,11 @@ sbreserve(struct socket *so, struct sockbuf *sb, u_long cc) if (cc == 0 || cc > sb_max) return (1); sb->sb_hiwat = cc; +#if 0 sb->sb_mbmax = max(3 * MAXMCLBYTES, min(cc * 2, sb_max + (sb_max / MCLBYTES) * MSIZE)); +#endif + sb->sb_mbmax = ULONG_MAX/4; if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (0); diff --git sys/net/rtsock.c 
sys/net/rtsock.c index 6a516bf00cd..dc918e1aa93 100644 --- sys/net/rtsock.c +++ sys/net/rtsock.c @@ -388,6 +388,10 @@ route_input(struct mbuf *m0, struct socket *so, sa_family_t sa_family) int sockets = 0; struct socket *last = NULL; struct sockaddr *sosrc, *sodst; + int yield_cnt = 0; +#if 0 + static int yield_logged = 0; +#endif KERNEL_ASSERT_LOCKED(); @@ -457,7 +461,21 @@ route_input(struct mbuf *m0, struct socket *so, sa_family_t sa_family) if (last) { struct mbuf *n; + if ((last->so_options & SO_RDAEMON) && + (last->so_rcv.sb_cc > (sb_max/4))) + yield_cnt++; if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { +#if 0 + if (sbspace(last, &last->so_rcv) < (2*MSIZE)) { + printf("sbspace 1 hiwat %lu cc " + "%lu mbmax %lu mbcnt %lu f %hd\n", + last->so_rcv.sb_hiwat, + last->so_rcv.sb_cc, + last->so_rcv.sb_mbmax, + last->so_rcv.sb_mbcnt, + last->so_options); + } +#endif if (sbspace(last, &last->so_rcv) < (2*MSIZE) || sbappendaddr(last, &last->so_rcv, sosrc, n, (struct mbuf *)NULL) == 0) { @@ -479,6 +497,20 @@ route_input(struct mbuf *m0, struct socket *so, sa_family_t sa_family) last = rp->rcb_socket; } if (last) { + if ((last->so_options & SO_RDAEMON) && + (last->so_rcv.sb_cc > (sb_max/4))) + yield_cnt++; +#if 0 + if (sbspace(last, &last->so_rcv) < (2*MSIZE)) { + printf("sbspace 2 hiwat %lu cc " + "%lu mbmax %lu mbcnt %lu f %hd\n", + last->so_rcv.sb_hiwat, + last->so_rcv.sb_cc, + last->so_rcv.sb_mbmax, + last->so_rcv.sb_mbcnt, + last->so_options); + } +#endif if (sbspace(last, &last->so_rcv) < (2 * MSIZE) || sbappendaddr(last, &last->so_rcv, sosrc, m, (struct mbuf *)NULL) == 0) { @@ -493,6 +525,24 @@ route_input(struct mbuf *m0, struct socket *so, sa_family_t sa_family) } } else m_freem(m); + + if (yield_cnt && so != NULL) { +#if 0 + if ((yield_logged % 2000) == 0) { + printf("%s: yield hiwat %lu cc " + "%lu mbmax %lu mbcnt %lu f %hd\n", + __func__, + last->so_rcv.sb_hiwat, + last->so_rcv.sb_cc, + last->so_rcv.sb_mbmax, + last->so_rcv.sb_mbcnt, + 
last->so_options); + } + yield_logged++; +#endif + NET_ASSERT_UNLOCKED(); + yield(); + } } struct rt_msghdr * diff --git sys/sys/socket.h sys/sys/socket.h index a1ff94653f8..fc78c14296c 100644 --- sys/sys/socket.h +++ sys/sys/socket.h @@ -97,6 +97,7 @@ typedef __sa_family_t sa_family_t; /* sockaddr address family type */ #define SO_TIMESTAMP 0x0800 /* timestamp received dgram traffic */ #define SO_BINDANY 0x1000 /* allow bind to any address */ #define SO_ZEROIZE 0x2000 /* zero out all mbufs sent over socket */ +#define SO_RDAEMON 0x4000 /* optimize bulk routing msg */ /* * Additional options, not kept in so_options. diff --git usr.sbin/ospfd/kroute.c usr.sbin/ospfd/kroute.c index 17febefbdcb..caa1c9fe591 100644 --- usr.sbin/ospfd/kroute.c +++ usr.sbin/ospfd/kroute.c @@ -129,7 +129,7 @@ kif_init(void) int kr_init(int fs, u_int rdomain) { - int opt = 0, rcvbuf, default_rcvbuf; + int opt = 0, rcvbuf, default_rcvbuf, rdaemon_flag = 1; socklen_t optlen; kr_state.fib_sync = fs; @@ -159,6 +159,11 @@ kr_init(int fs, u_int rdomain) rcvbuf /= 2) ; /* nothing */ + if (setsockopt(kr_state.fd, SOL_SOCKET, SO_RDAEMON, + &rdaemon_flag, sizeof(rdaemon_flag)) == -1) + log_info("%s: setsockopt SO_RDAEMON failed", + __func__); + kr_state.pid = getpid(); kr_state.rtseq = 1; diff --git usr.sbin/ospfd/ospfd.h usr.sbin/ospfd/ospfd.h index af082b1079c..f9f52cf3ea4 100644 --- usr.sbin/ospfd/ospfd.h +++ usr.sbin/ospfd/ospfd.h @@ -46,7 +46,7 @@ #define READ_BUF_SIZE 65535 #define PKG_DEF_SIZE 512 /* compromise */ #define RT_BUF_SIZE 16384 -#define MAX_RTSOCK_BUF (2 * 1024 * 1024) +#define MAX_RTSOCK_BUF (1 * 1024 * 1024) #define OSPFD_FLAG_NO_FIB_UPDATE 0x0001 #define OSPFD_FLAG_STUB_ROUTER 0x0002