Hi,

when bgpd changes a lot of routes (and runs with fib-updates yes), the
routing socket of ospfd (and ospf6d, ripd, eigrpd) will get a lot of routing
messages. When the kernel cannot write all the messages into the recipient's
route socket, it will set the desync flag, and the daemon has to read the
full routing table and reevaluate the routes. That comes with a cost.

claudio and i are investigating various approaches to minimize this.

One way is to increase the buffer size on the route socket. That alone does
not help: the receiver simply does not get a chance to read the data fast
enough. So here i try to give it a chance:

I add the socket option SO_RDAEMON. A receiver sets it on a route socket.
When the socket's receive buffer fills beyond 1/4 of the current maximum
size (sb_max), the kernel will yield() at the end of route_input().

With this i can shrink the buffer size to 1/2 of the current setting (8
times what it was until yesterday) and not get a DESYNC on coupling a full
table from bgpd.

To get this working i had to disable the sb_mbmax calculation in sbreserve()
because that limit triggers a lot earlier. There is another mail from claudio
asking for advice on how to change sbreserve().

So right now this is a proof of concept.
Opinions?

diff --git sys/kern/uipc_socket.c sys/kern/uipc_socket.c
index 2f7ecfd2371..e3d24843a1c 100644
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1600,6 +1600,7 @@ sosetopt(struct socket *so, int level, int optname, 
struct mbuf *m)
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_ZEROIZE:
+               case SO_RDAEMON:
                        if (m == NULL || m->m_len < sizeof (int))
                                return (EINVAL);
                        if (*mtod(m, int *))
@@ -1770,6 +1771,7 @@ sogetopt(struct socket *so, int level, int optname, 
struct mbuf *m)
                case SO_OOBINLINE:
                case SO_TIMESTAMP:
                case SO_ZEROIZE:
+               case SO_RDAEMON:
                        *mtod(m, int *) = so->so_options & optname;
                        break;
 
diff --git sys/kern/uipc_socket2.c sys/kern/uipc_socket2.c
index a7c6ba1f576..e60fc8d8256 100644
--- sys/kern/uipc_socket2.c
+++ sys/kern/uipc_socket2.c
@@ -464,8 +464,11 @@ sbreserve(struct socket *so, struct sockbuf *sb, u_long cc)
        if (cc == 0 || cc > sb_max)
                return (1);
        sb->sb_hiwat = cc;
+#if 0
        sb->sb_mbmax = max(3 * MAXMCLBYTES,
            min(cc * 2, sb_max + (sb_max / MCLBYTES) * MSIZE));
+#endif
+       sb->sb_mbmax = ULONG_MAX/4;
        if (sb->sb_lowat > sb->sb_hiwat)
                sb->sb_lowat = sb->sb_hiwat;
        return (0);
diff --git sys/net/rtsock.c sys/net/rtsock.c
index 6a516bf00cd..dc918e1aa93 100644
--- sys/net/rtsock.c
+++ sys/net/rtsock.c
@@ -388,6 +388,10 @@ route_input(struct mbuf *m0, struct socket *so, 
sa_family_t sa_family)
        int sockets = 0;
        struct socket *last = NULL;
        struct sockaddr *sosrc, *sodst;
+       int yield_cnt = 0;
+#if 0
+       static int yield_logged = 0;
+#endif
 
        KERNEL_ASSERT_LOCKED();
 
@@ -457,7 +461,21 @@ route_input(struct mbuf *m0, struct socket *so, 
sa_family_t sa_family)
 
                if (last) {
                        struct mbuf *n;
+                       if ((last->so_options & SO_RDAEMON) &&
+                           (last->so_rcv.sb_cc > (sb_max/4)))
+                               yield_cnt++;
                        if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) {
+#if 0
+                               if (sbspace(last, &last->so_rcv) < (2*MSIZE)) {
+                                       printf("sbspace 1 hiwat %lu cc "
+                                           "%lu mbmax %lu mbcnt %lu f %hd\n",
+                                           last->so_rcv.sb_hiwat,
+                                           last->so_rcv.sb_cc,
+                                           last->so_rcv.sb_mbmax,
+                                           last->so_rcv.sb_mbcnt,
+                                           last->so_options);
+                               }
+#endif
                                if (sbspace(last, &last->so_rcv) < (2*MSIZE) ||
                                    sbappendaddr(last, &last->so_rcv, sosrc,
                                    n, (struct mbuf *)NULL) == 0) {
@@ -479,6 +497,20 @@ route_input(struct mbuf *m0, struct socket *so, 
sa_family_t sa_family)
                last = rp->rcb_socket;
        }
        if (last) {
+               if ((last->so_options & SO_RDAEMON) &&
+                   (last->so_rcv.sb_cc > (sb_max/4)))
+                       yield_cnt++;
+#if 0
+               if (sbspace(last, &last->so_rcv) < (2*MSIZE)) {
+                       printf("sbspace 2 hiwat %lu cc "
+                           "%lu mbmax %lu mbcnt %lu f %hd\n",
+                           last->so_rcv.sb_hiwat,
+                           last->so_rcv.sb_cc,
+                           last->so_rcv.sb_mbmax,
+                           last->so_rcv.sb_mbcnt,
+                           last->so_options);
+               }
+#endif
                if (sbspace(last, &last->so_rcv) < (2 * MSIZE) ||
                    sbappendaddr(last, &last->so_rcv, sosrc,
                    m, (struct mbuf *)NULL) == 0) {
@@ -493,6 +525,24 @@ route_input(struct mbuf *m0, struct socket *so, 
sa_family_t sa_family)
                }
        } else
                m_freem(m);
+
+       if (yield_cnt && so != NULL) {
+#if 0
+               if ((yield_logged % 2000) == 0) {
+                       printf("%s: yield hiwat %lu cc "
+                           "%lu mbmax %lu mbcnt %lu f %hd\n",
+                           __func__,
+                           last->so_rcv.sb_hiwat,
+                           last->so_rcv.sb_cc,
+                           last->so_rcv.sb_mbmax,
+                           last->so_rcv.sb_mbcnt,
+                           last->so_options);
+               }
+               yield_logged++;
+#endif
+               NET_ASSERT_UNLOCKED();
+               yield();
+       }
 }
 
 struct rt_msghdr *
diff --git sys/sys/socket.h sys/sys/socket.h
index a1ff94653f8..fc78c14296c 100644
--- sys/sys/socket.h
+++ sys/sys/socket.h
@@ -97,6 +97,7 @@ typedef       __sa_family_t   sa_family_t;    /* sockaddr 
address family type */
 #define SO_TIMESTAMP   0x0800          /* timestamp received dgram traffic */
 #define SO_BINDANY     0x1000          /* allow bind to any address */
 #define SO_ZEROIZE     0x2000          /* zero out all mbufs sent over socket 
*/
+#define        SO_RDAEMON      0x4000          /* optimize bulk routing msg */
 
 /*
  * Additional options, not kept in so_options.
diff --git usr.sbin/ospfd/kroute.c usr.sbin/ospfd/kroute.c
index 17febefbdcb..caa1c9fe591 100644
--- usr.sbin/ospfd/kroute.c
+++ usr.sbin/ospfd/kroute.c
@@ -129,7 +129,7 @@ kif_init(void)
 int
 kr_init(int fs, u_int rdomain)
 {
-       int             opt = 0, rcvbuf, default_rcvbuf;
+       int             opt = 0, rcvbuf, default_rcvbuf, rdaemon_flag = 1;
        socklen_t       optlen;
 
        kr_state.fib_sync = fs;
@@ -159,6 +159,11 @@ kr_init(int fs, u_int rdomain)
                    rcvbuf /= 2)
                        ;       /* nothing */
 
+       if (setsockopt(kr_state.fd, SOL_SOCKET, SO_RDAEMON,
+           &rdaemon_flag, sizeof(rdaemon_flag)) == -1)
+               log_info("%s: setsockopt SO_RDAEMON failed",
+                   __func__);
+
        kr_state.pid = getpid();
        kr_state.rtseq = 1;
 
diff --git usr.sbin/ospfd/ospfd.h usr.sbin/ospfd/ospfd.h
index af082b1079c..f9f52cf3ea4 100644
--- usr.sbin/ospfd/ospfd.h
+++ usr.sbin/ospfd/ospfd.h
@@ -46,7 +46,7 @@
 #define        READ_BUF_SIZE           65535
 #define        PKG_DEF_SIZE            512     /* compromise */
 #define        RT_BUF_SIZE             16384
-#define        MAX_RTSOCK_BUF          (2 * 1024 * 1024)
+#define        MAX_RTSOCK_BUF          (1 * 1024 * 1024)
 
 #define        OSPFD_FLAG_NO_FIB_UPDATE        0x0001
 #define        OSPFD_FLAG_STUB_ROUTER          0x0002

Reply via email to