I never got any negative test results back about this change. I assume
that either nobody is interested or that it just works. Since this is
getting in my way of working on bgpd I would like to commit this. So
unless people scream in the next few days it will go in and I will handle
the fallout when it happens.

-- 
:wq Claudio


----- Forwarded message from Claudio Jeker <clau...@openbsd.org> -----

Date: Mon, 9 Jul 2012 15:39:17 +0200
From: Claudio Jeker <clau...@openbsd.org>
To: tech@openbsd.org
Subject: graceful restart diff
Mail-Followup-To: tech@openbsd.org

OK, this is a fixed version of the graceful restart diff I have had brewing
for a long, long time. It seems to behave in my bgpd test network, but
I would like some wider testing (esp. against routers that do GR).

This implements only the "Restarting Client" bits of the RFC (RFC 4724) -- in
other words bgpd will keep the FIB when the client restarts but it will not do
GR when restarting itself. The capability is still off by default (you need
"announce restart yes" to enable it), and it is best to test first without
GR and, if there are no regressions, retest with GR enabled.

Make sure to run this on current -current since this needs some of the
recent bgpd commits.
-- 
:wq Claudio

Index: bgpctl/bgpctl.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpctl/bgpctl.c,v
retrieving revision 1.164
diff -u -p -r1.164 bgpctl.c
--- bgpctl/bgpctl.c     27 May 2012 18:53:50 -0000      1.164
+++ bgpctl/bgpctl.c     8 Jul 2012 10:48:01 -0000
@@ -52,6 +52,7 @@ int            show_summary_terse_msg(struct imsg
 int             show_neighbor_terse(struct imsg *);
 int             show_neighbor_msg(struct imsg *, enum neighbor_views);
 void            print_neighbor_capa_mp(struct peer *);
+void            print_neighbor_capa_restart(struct peer *);
 void            print_neighbor_msgstats(struct peer *);
 void            print_timer(const char *, time_t);
 static char    *fmt_timeframe(time_t t);
@@ -168,7 +169,7 @@ main(int argc, char *argv[])
        case NONE:
        case IRRFILTER:
                usage();
-               /* not reached */
+               /* NOTREACHED */
        case SHOW:
        case SHOW_SUMMARY:
                imsg_compose(ibuf, IMSG_CTL_SHOW_NEIGHBOR, 0, 0, -1, NULL, 0);
@@ -648,7 +649,7 @@ show_neighbor_msg(struct imsg *imsg, enu
                        if (p->capa.peer.mp[i])
                                hascapamp = 1;
                if (hascapamp || p->capa.peer.refresh ||
-                   p->capa.peer.restart || p->capa.peer.as4byte) {
+                   p->capa.peer.grestart.restart || p->capa.peer.as4byte) {
                        printf("  Neighbor capabilities:\n");
                        if (hascapamp) {
                                printf("    Multiprotocol extensions: ");
@@ -657,8 +658,11 @@ show_neighbor_msg(struct imsg *imsg, enu
                        }
                        if (p->capa.peer.refresh)
                                printf("    Route Refresh\n");
-                       if (p->capa.peer.restart)
-                               printf("    Graceful Restart\n");
+                       if (p->capa.peer.grestart.restart) {
+                               printf("    Graceful Restart");
+                               print_neighbor_capa_restart(p);
+                               printf("\n");
+                       }
                        if (p->capa.peer.as4byte)
                                printf("    4-byte AS numbers\n");
                }
@@ -726,6 +730,28 @@ print_neighbor_capa_mp(struct peer *p)
 }
 
 void
+print_neighbor_capa_restart(struct peer *p)
+{
+       int             comma;
+       u_int8_t        i;
+
+       if (p->capa.peer.grestart.timeout)
+               printf(": Timeout: %d, ", p->capa.peer.grestart.timeout);
+       for (i = 0, comma = 0; i < AID_MAX; i++)
+               if (p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT) {
+                       if (!comma &&
+                           p->capa.peer.grestart.flags[i] & CAPA_GR_RESTART)
+                               printf("restarted, ");
+                       if (comma)
+                               printf(", ");
+                       printf("%s", aid2str(i));
+                       if (p->capa.peer.grestart.flags[i] & CAPA_GR_FORWARD)
+                               printf(" (preserved)");
+                       comma = 1;
+               }
+}
+
+void
 print_neighbor_msgstats(struct peer *p)
 {
        printf("  Message statistics:\n");
@@ -753,6 +779,8 @@ print_neighbor_msgstats(struct peer *p)
            p->stats.prefix_sent_update, p->stats.prefix_rcvd_update);
        printf("  %-15s %10llu %10llu\n", "Withdraws",
            p->stats.prefix_sent_withdraw, p->stats.prefix_rcvd_withdraw);
+       printf("  %-15s %10llu %10llu\n", "End-of-Rib",
+           p->stats.prefix_sent_eor, p->stats.prefix_rcvd_eor);
 }
 
 void
@@ -1107,8 +1135,8 @@ show_interface_msg(struct imsg *imsg)
 void
 show_rib_summary_head(void)
 {
-       printf(
-           "flags: * = Valid, > = Selected, I = via IBGP, A = Announced\n");
+       printf("flags: * = Valid, > = Selected, I = via IBGP, A = Announced, "
+           "S = Stale\n");
        printf("origin: i = IGP, e = EGP, ? = Incomplete\n\n");
        printf("%-5s %-20s %-15s  %5s %5s %s\n", "flags", "destination",
            "gateway", "lpref", "med", "aspath origin");
@@ -1152,6 +1180,8 @@ print_flags(u_int8_t flags, int sum)
                        *p++ = 'A';
                if (flags & F_PREF_INTERNAL)
                        *p++ = 'I';
+               if (flags & F_PREF_STALE)
+                       *p++ = 'S';
                if (flags & F_PREF_ELIGIBLE)
                        *p++ = '*';
                if (flags & F_PREF_ACTIVE)
@@ -1163,6 +1193,8 @@ print_flags(u_int8_t flags, int sum)
                        printf("internal");
                else
                        printf("external");
+               if (flags & F_PREF_STALE)
+                       printf(", stale");
                if (flags & F_PREF_ELIGIBLE)
                        printf(", valid");
                if (flags & F_PREF_ACTIVE)
Index: bgpd/bgpd.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/bgpd.h,v
retrieving revision 1.270
diff -u -p -r1.270 bgpd.h
--- bgpd/bgpd.h 27 May 2012 18:52:07 -0000      1.270
+++ bgpd/bgpd.h 8 Jul 2012 10:42:33 -0000
@@ -132,6 +132,7 @@ extern const struct aid aid_vals[];
 #define        AID_INET6       2
 #define        AID_VPN_IPv4    3
 #define        AID_MAX         4
+#define        AID_MIN         1       /* skip AID_UNSPEC since that is a dummy */
 
 #define AID_VALS       {                                       \
        /* afi, af, safii, name */                              \
@@ -253,11 +254,24 @@ struct peer_auth {
 };
 
 struct capabilities {
-       int8_t  mp[AID_MAX];    /* multiprotocol extensions, RFC 4760 */
-       int8_t  refresh;        /* route refresh, RFC 2918 */
-       int8_t  restart;        /* graceful restart, RFC 4724 */
-       int8_t  as4byte;        /* draft-ietf-idr-as4bytes-13 */
-};
+       struct {
+               int16_t timeout;        /* graceful restart timeout */
+               int8_t  flags[AID_MAX]; /* graceful restart per AID flags */
+               int8_t  restart;        /* graceful restart, RFC 4724 */
+       }       grestart;
+       int8_t  mp[AID_MAX];            /* multiprotocol extensions, RFC 4760 */
+       int8_t  refresh;                /* route refresh, RFC 2918 */
+       int8_t  as4byte;                /* 4-byte ASnum, RFC 4893 */
+};
+
+#define        CAPA_GR_PRESENT         0x01
+#define        CAPA_GR_RESTART         0x02
+#define        CAPA_GR_FORWARD         0x04
+#define        CAPA_GR_RESTARTING      0x08
+
+#define        CAPA_GR_TIMEMASK        0x0fff
+#define        CAPA_GR_R_FLAG          0x8000
+#define        CAPA_GR_F_FLAG          0x80
 
 struct peer_config {
        struct bgpd_addr         remote_addr;
@@ -373,6 +387,9 @@ enum imsg_type {
        IMSG_SESSION_ADD,
        IMSG_SESSION_UP,
        IMSG_SESSION_DOWN,
+       IMSG_SESSION_STALE,
+       IMSG_SESSION_FLUSH,
+       IMSG_SESSION_RESTARTED,
        IMSG_MRT_OPEN,
        IMSG_MRT_REOPEN,
        IMSG_MRT_CLOSE,
@@ -550,6 +567,7 @@ struct ctl_neighbor {
 #define        F_PREF_ACTIVE   0x02
 #define        F_PREF_INTERNAL 0x04
 #define        F_PREF_ANNOUNCE 0x08
+#define        F_PREF_STALE    0x10
 
 struct ctl_show_rib {
        struct bgpd_addr        true_nexthop;
Index: bgpd/parse.y
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/parse.y,v
retrieving revision 1.261
diff -u -p -r1.261 parse.y
--- bgpd/parse.y        12 Apr 2012 17:31:05 -0000      1.261
+++ bgpd/parse.y        2 Jun 2012 10:09:09 -0000
@@ -1038,7 +1038,7 @@ peeropts  : REMOTEAS as4number    {
                        curpeer->conf.capabilities.refresh = $3;
                }
                | ANNOUNCE RESTART yesno {
-                       curpeer->conf.capabilities.restart = $3;
+                       curpeer->conf.capabilities.grestart.restart = $3;
                }
                | ANNOUNCE AS4BYTE yesno {
                        curpeer->conf.capabilities.as4byte = $3;
@@ -3019,7 +3019,7 @@ alloc_peer(void)
        for (i = 0; i < AID_MAX; i++)
                p->conf.capabilities.mp[i] = -1;
        p->conf.capabilities.refresh = 1;
-       p->conf.capabilities.restart = 0;
+       p->conf.capabilities.grestart.restart = 0;
        p->conf.capabilities.as4byte = 1;
        p->conf.local_as = conf->as;
        p->conf.local_short_as = conf->short_as;
Index: bgpd/printconf.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/printconf.c,v
retrieving revision 1.86
diff -u -p -r1.86 printconf.c
--- bgpd/printconf.c    17 Sep 2011 16:29:44 -0000      1.86
+++ bgpd/printconf.c    2 Jun 2012 10:09:09 -0000
@@ -377,7 +377,7 @@ print_peer(struct peer_config *p, struct
                printf("%s\tannounce capabilities no\n", c);
        if (p->capabilities.refresh == 0)
                printf("%s\tannounce refresh no\n", c);
-       if (p->capabilities.restart == 1)
+       if (p->capabilities.grestart.restart == 1)
                printf("%s\tannounce restart yes\n", c);
        if (p->capabilities.as4byte == 0)
                printf("%s\tannounce as4byte no\n", c);
Index: bgpd/rde.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v
retrieving revision 1.316
diff -u -p -r1.316 rde.c
--- bgpd/rde.c  27 May 2012 18:52:07 -0000      1.316
+++ bgpd/rde.c  9 Jul 2012 12:44:26 -0000
@@ -101,6 +101,9 @@ struct rde_peer     *peer_add(u_int32_t, str
 struct rde_peer        *peer_get(u_int32_t);
 void            peer_up(u_int32_t, struct session_up *);
 void            peer_down(u_int32_t);
+void            peer_flush(struct rde_peer *, u_int8_t);
+void            peer_stale(u_int32_t, u_int8_t);
+void            peer_recv_eor(struct rde_peer *, u_int8_t);
 void            peer_dump(u_int32_t, u_int8_t);
 void            peer_send_eor(struct rde_peer *, u_int8_t);
 
@@ -407,6 +410,47 @@ rde_dispatch_imsg_session(struct imsgbuf
                case IMSG_SESSION_DOWN:
                        peer_down(imsg.hdr.peerid);
                        break;
+               case IMSG_SESSION_STALE:
+                       if (imsg.hdr.len - IMSG_HEADER_SIZE != sizeof(aid)) {
+                               log_warnx("rde_dispatch: wrong imsg len");
+                               break;
+                       }
+                       memcpy(&aid, imsg.data, sizeof(aid));
+                       if (aid >= AID_MAX)
+                               fatalx("IMSG_SESSION_STALE: bad AID");
+                       peer_stale(imsg.hdr.peerid, aid);
+                       break;
+               case IMSG_SESSION_FLUSH:
+                       if (imsg.hdr.len - IMSG_HEADER_SIZE != sizeof(aid)) {
+                               log_warnx("rde_dispatch: wrong imsg len");
+                               break;
+                       }
+                       memcpy(&aid, imsg.data, sizeof(aid));
+                       if (aid >= AID_MAX)
+                               fatalx("IMSG_SESSION_FLUSH: bad AID");
+                       if ((peer = peer_get(imsg.hdr.peerid)) == NULL) {
+                               log_warnx("rde_dispatch: unknown peer id %d",
+                                   imsg.hdr.peerid);
+                               break;
+                       }
+                       peer_flush(peer, aid);
+                       break;
+               case IMSG_SESSION_RESTARTED:
+                       if (imsg.hdr.len - IMSG_HEADER_SIZE != sizeof(aid)) {
+                               log_warnx("rde_dispatch: wrong imsg len");
+                               break;
+                       }
+                       memcpy(&aid, imsg.data, sizeof(aid));
+                       if (aid >= AID_MAX)
+                               fatalx("IMSG_SESSION_RESTARTED: bad AID");
+                       if ((peer = peer_get(imsg.hdr.peerid)) == NULL) {
+                               log_warnx("rde_dispatch: unknown peer id %d",
+                                   imsg.hdr.peerid);
+                               break;
+                       }
+                       if (peer->staletime[aid])
+                               peer_flush(peer, aid);
+                       break;
                case IMSG_REFRESH:
                        if (imsg.hdr.len - IMSG_HEADER_SIZE != sizeof(aid)) {
                                log_warnx("rde_dispatch: wrong imsg len");
@@ -559,10 +603,14 @@ badnet:
                                    peer->prefix_rcvd_update;
                                p.stats.prefix_rcvd_withdraw =
                                    peer->prefix_rcvd_withdraw;
+                               p.stats.prefix_rcvd_eor =
+                                   peer->prefix_rcvd_eor;
                                p.stats.prefix_sent_update =
                                    peer->prefix_sent_update;
                                p.stats.prefix_sent_withdraw =
                                    peer->prefix_sent_withdraw;
+                               p.stats.prefix_sent_eor =
+                                   peer->prefix_sent_eor;
                        }
                        imsg_compose(ibuf_se_ctl, IMSG_CTL_SHOW_NEIGHBOR, 0,
                            imsg.hdr.pid, -1, &p, sizeof(struct peer));
@@ -1020,6 +1068,10 @@ rde_update_dispatch(struct imsg *imsg)
                            ERR_UPD_ATTRLIST, NULL, 0);
                        return (-1);
                }
+               if (withdrawn_len == 0) {
+                       /* EoR marker */
+                       peer_recv_eor(peer, AID_INET);
+               }
                return (0);
        }
 
@@ -1050,6 +1102,11 @@ rde_update_dispatch(struct imsg *imsg)
                        goto done;
                }
 
+               if ((asp->flags & ~F_ATTR_MP_UNREACH) == 0 && mplen == 0) {
+                       /* EoR marker */
+                       peer_recv_eor(peer, aid);
+               }
+
                switch (aid) {
                case AID_INET6:
                        while (mplen > 0) {
@@ -2165,6 +2222,7 @@ rde_dump_rib_as(struct prefix *p, struct
        struct ibuf             *wbuf;
        struct attr             *a;
        void                    *bp;
+       time_t                   staletime;;
        u_int8_t                 l;
 
        bzero(&rib, sizeof(rib));
@@ -2201,6 +2259,9 @@ rde_dump_rib_as(struct prefix *p, struct
                rib.flags |= F_PREF_ELIGIBLE;
        if (asp->flags & F_ATTR_LOOP)
                rib.flags &= ~F_PREF_ELIGIBLE;
+       staletime = asp->peer->staletime[p->prefix->aid];
+       if (staletime && p->lastchange <= staletime)
+               rib.flags |= F_PREF_STALE;
        rib.aspath_len = aspath_length(asp->aspath);
 
        if ((wbuf = imsg_create(ibuf_se_ctl, IMSG_CTL_SHOW_RIB, 0, pid,
@@ -3163,7 +3224,8 @@ peer_up(u_int32_t id, struct session_up 
                return;
        }
 
-       if (peer->state != PEER_DOWN && peer->state != PEER_NONE)
+       if (peer->state != PEER_DOWN && peer->state != PEER_NONE &&
+           peer->state != PEER_UP)
                fatalx("peer_up: bad state");
        peer->remote_bgpid = ntohl(sup->remote_bgpid);
        peer->short_as = sup->short_as;
@@ -3220,6 +3282,50 @@ peer_down(u_int32_t id)
        free(peer);
 }
 
+/*
+ * Flush all routes older than staletime. If staletime is 0 all routes will
+ * be flushed.
+ */
+void
+peer_flush(struct rde_peer *peer, u_int8_t aid)
+{
+       struct rde_aspath       *asp, *nasp;
+
+       /* walk through per peer RIB list and remove all stale prefixes. */
+       for (asp = LIST_FIRST(&peer->path_h); asp != NULL; asp = nasp) {
+               nasp = LIST_NEXT(asp, peer_l);
+               path_remove_stale(asp, aid);
+       }
+
+       /* Deletions are performed in path_remove() */
+       rde_send_pftable_commit();
+
+       /* flushed no need to keep staletime */
+       peer->staletime[aid] = 0;
+}
+
+void
+peer_stale(u_int32_t id, u_int8_t aid)
+{
+       struct rde_peer         *peer;
+       time_t                   now;
+
+       peer = peer_get(id);
+       if (peer == NULL) {
+               log_warnx("peer_stale: unknown peer id %d", id);
+               return;
+       }
+
+       if (peer->staletime[aid])
+               peer_flush(peer, aid);
+       peer->staletime[aid] = now = time(NULL);
+
+       /* make sure new prefixes start on a higher timestamp */
+       do {
+               sleep(1);
+       } while (now >= time(NULL));
+}
+
 void
 peer_dump(u_int32_t id, u_int8_t aid)
 {
@@ -3235,16 +3341,29 @@ peer_dump(u_int32_t id, u_int8_t aid)
                up_generate_default(rules_l, peer, aid);
        else
                rib_dump(&ribs[peer->ribid], rde_up_dump_upcall, peer, aid);
-       if (peer->capa.restart)
+       if (peer->capa.grestart.restart)
                up_generate_marker(peer, aid);
 }
 
 /* End-of-RIB marker, RFC 4724 */
 void
+peer_recv_eor(struct rde_peer *peer, u_int8_t aid)
+{
+       peer->prefix_rcvd_eor++;
+
+       /* First notify SE to remove possible race with the timeout. */
+       if (imsg_compose(ibuf_se, IMSG_SESSION_RESTARTED, peer->conf.id,
+           0, -1, &aid, sizeof(aid)) == -1)
+               fatal("imsg_compose error");
+}
+
+void
 peer_send_eor(struct rde_peer *peer, u_int8_t aid)
 {
        u_int16_t       afi;
        u_int8_t        safi;
+
+       peer->prefix_sent_eor++;
 
        if (aid == AID_INET) {
                u_char null[4];
Index: bgpd/rde.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v
retrieving revision 1.142
diff -u -p -r1.142 rde.h
--- bgpd/rde.h  21 Sep 2011 08:59:01 -0000      1.142
+++ bgpd/rde.h  8 Jul 2012 08:49:07 -0000
@@ -59,10 +59,13 @@ struct rde_peer {
        struct uplist_attr               updates[AID_MAX];
        struct uplist_prefix             withdraws[AID_MAX];
        struct capabilities              capa;
+       time_t                           staletime[AID_MAX];
        u_int64_t                        prefix_rcvd_update;
        u_int64_t                        prefix_rcvd_withdraw;
+       u_int64_t                        prefix_rcvd_eor;
        u_int64_t                        prefix_sent_update;
        u_int64_t                        prefix_sent_withdraw;
+       u_int64_t                        prefix_sent_eor;
        u_int32_t                        prefix_cnt; /* # of prefixes */
        u_int32_t                        remote_bgpid; /* host byte order! */
        u_int32_t                        up_pcnt;
@@ -427,6 +430,7 @@ int          path_update(struct rib *, struct r
 int             path_compare(struct rde_aspath *, struct rde_aspath *);
 struct rde_aspath *path_lookup(struct rde_aspath *, struct rde_peer *);
 void            path_remove(struct rde_aspath *);
+void            path_remove_stale(struct rde_aspath *, u_int8_t);
 void            path_destroy(struct rde_aspath *);
 int             path_empty(struct rde_aspath *);
 struct rde_aspath *path_copy(struct rde_aspath *);
Index: bgpd/rde_rib.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v
retrieving revision 1.133
diff -u -p -r1.133 rde_rib.c
--- bgpd/rde_rib.c      1 Jul 2012 11:55:13 -0000       1.133
+++ bgpd/rde_rib.c      7 Jul 2012 12:48:40 -0000
@@ -505,6 +505,36 @@ path_remove(struct rde_aspath *asp)
        }
 }
 
+/* remove all stale routes or if staletime is 0 remove all routes for
+   a specified AID. */
+void
+path_remove_stale(struct rde_aspath *asp, u_int8_t aid)
+{
+       struct prefix   *p, *np;
+       time_t           staletime;
+
+       staletime = asp->peer->staletime[aid];
+       for (p = LIST_FIRST(&asp->prefix_h); p != NULL; p = np) {
+               np = LIST_NEXT(p, path_l);
+               if (p->prefix->aid != aid)
+                       continue;
+
+               if (staletime && p->lastchange > staletime)
+                       continue;
+
+               if (asp->pftableid) {
+                       struct bgpd_addr addr;
+
+                       pt_getaddr(p->prefix, &addr);
+                       /* Commit is done in peer_flush() */
+                       rde_send_pftable(p->aspath->pftableid, &addr,
+                           p->prefix->prefixlen, 1);
+               }
+               prefix_destroy(p);
+       }
+}
+
+
 /* this function is only called by prefix_remove and path_remove */
 void
 path_destroy(struct rde_aspath *asp)
Index: bgpd/session.c
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/session.c,v
retrieving revision 1.322
diff -u -p -r1.322 session.c
--- bgpd/session.c      9 Jul 2012 11:11:07 -0000       1.322
+++ bgpd/session.c      9 Jul 2012 13:14:41 -0000
@@ -69,6 +69,7 @@ void  session_tcp_established(struct peer
 void   session_capa_ann_none(struct peer *);
 int    session_capa_add(struct ibuf *, u_int8_t, u_int8_t);
 int    session_capa_add_mp(struct ibuf *, u_int8_t);
+int    session_capa_add_gr(struct peer *, struct ibuf *, u_int8_t);
 struct bgp_msg *session_newmsg(enum msg_type, u_int16_t);
 int    session_sendmsg(struct bgp_msg *, struct peer *);
 void   session_open(struct peer *);
@@ -77,6 +78,9 @@ void  session_update(u_int32_t, void *, s
 void   session_notification(struct peer *, u_int8_t, u_int8_t, void *,
            ssize_t);
 void   session_rrefresh(struct peer *, u_int8_t);
+int    session_graceful_restart(struct peer *);
+int    session_graceful_is_restarting(struct peer *);
+int    session_graceful_stop(struct peer *);
 int    session_dispatch_msg(struct pollfd *, struct peer *);
 int    session_process_msg(struct peer *);
 int    parse_header(struct peer *, u_char *, u_int16_t *, u_int8_t *);
@@ -437,6 +441,10 @@ session_main(int pipe_m2s[2], int pipe_s
                                            p->state == STATE_ESTABLISHED)
                                                session_demote(p, -1);
                                        break;
+                               case Timer_RestartTimeout:
+                                       timer_stop(p, Timer_RestartTimeout);
+                                       session_graceful_stop(p);
+                                       break;
                                default:
                                        fatalx("King Bula lost in time");
                                }
@@ -941,14 +949,24 @@ change_state(struct peer *peer, enum ses
                free(peer->rbuf);
                peer->rbuf = NULL;
                bzero(&peer->capa.peer, sizeof(peer->capa.peer));
-               if (peer->state == STATE_ESTABLISHED)
-                       session_down(peer);
+
                if (event != EVNT_STOP) {
                        timer_set(peer, Timer_IdleHold, peer->IdleHoldTime);
                        if (event != EVNT_NONE &&
                            peer->IdleHoldTime < MAX_IDLE_HOLD/2)
                                peer->IdleHoldTime *= 2;
                }
+               if (peer->state == STATE_ESTABLISHED) {
+                       if (peer->capa.neg.grestart.restart == 2 &&
+                           (event == EVNT_CON_CLOSED ||
+                           event == EVNT_CON_FATAL)) {
+                               /* don't punish graceful restart */
+                               timer_set(peer, Timer_IdleHold, 0);
+                               peer->IdleHoldTime /= 2;
+                               session_graceful_restart(peer);
+                       } else
+                               session_down(peer);
+               }
                if (peer->state == STATE_NONE ||
                    peer->state == STATE_ESTABLISHED) {
                        /* initialize capability negotiation structures */
@@ -959,6 +977,20 @@ change_state(struct peer *peer, enum ses
                }
                break;
        case STATE_CONNECT:
+               if (peer->state == STATE_ESTABLISHED &&
+                   peer->capa.neg.grestart.restart == 2) {
+                       /* do the graceful restart dance */
+                       session_graceful_restart(peer);
+                       peer->holdtime = INTERVAL_HOLD_INITIAL;
+                       timer_stop(peer, Timer_ConnectRetry);
+                       timer_stop(peer, Timer_Keepalive);
+                       timer_stop(peer, Timer_Hold);
+                       timer_stop(peer, Timer_IdleHold);
+                       timer_stop(peer, Timer_IdleHoldReset);
+                       session_close_connection(peer);
+                       msgbuf_clear(&peer->wbuf);
+                       bzero(&peer->capa.peer, sizeof(peer->capa.peer));
+               }
                break;
        case STATE_ACTIVE:
                break;
@@ -1032,6 +1064,7 @@ session_accept(int listenfd)
                        }
                }
 
+open:
                if (p->conf.auth.method != AUTH_NONE && sysdep.no_pfkey) {
                        log_peer_warnx(&p->conf,
                            "ipsec or md5sig configured but not available");
@@ -1064,6 +1097,13 @@ session_accept(int listenfd)
                }
                session_socket_blockmode(connfd, BM_NONBLOCK);
                bgp_fsm(p, EVNT_CON_OPEN);
+               return;
+       } else if (p != NULL && p->state == STATE_ESTABLISHED &&
+           p->capa.neg.grestart.restart == 2) {
+               /* first do the graceful restart dance */
+               change_state(p, STATE_CONNECT, EVNT_CON_CLOSED);
+               /* then do part of the open dance */
+               goto open;
        } else {
                log_conn_attempt(p, (struct sockaddr *)&cliaddr);
                close(connfd);
@@ -1288,6 +1328,30 @@ session_capa_add_mp(struct ibuf *buf, u_
        return (errs);
 }
 
+int
+session_capa_add_gr(struct peer *p, struct ibuf *b, u_int8_t aid)
+{
+       u_int           errs = 0;
+       u_int16_t       afi;
+       u_int8_t        flags, safi;
+
+       if (aid2afi(aid, &afi, &safi)) {
+               log_warn("session_capa_add_gr: bad AID");
+               return (1);
+       }
+       if (p->capa.neg.grestart.flags[aid] & CAPA_GR_RESTARTING)
+               flags = CAPA_GR_F_FLAG;
+       else
+               flags = 0;
+
+       afi = htons(afi);
+       errs += ibuf_add(b, &afi, sizeof(afi));
+       errs += ibuf_add(b, &safi, sizeof(safi));
+       errs += ibuf_add(b, &flags, sizeof(flags));
+
+       return (errs);
+}
+
 struct bgp_msg *
 session_newmsg(enum msg_type msgtype, u_int16_t len)
 {
@@ -1348,6 +1412,7 @@ session_open(struct peer *p)
        u_int16_t                len;
        u_int8_t                 i, op_type, optparamlen = 0;
        int                      errs = 0;
+       int                      mpcapa = 0;
 
 
        if ((opb = ibuf_dynamic(0, UCHAR_MAX - sizeof(op_type) -
@@ -1361,20 +1426,51 @@ session_open(struct peer *p)
                if (p->capa.ann.mp[i]) {        /* 4 bytes data */
                        errs += session_capa_add(opb, CAPA_MP, 4);
                        errs += session_capa_add_mp(opb, i);
+                       mpcapa++;
                }
 
        /* route refresh, RFC 2918 */
        if (p->capa.ann.refresh)        /* no data */
                errs += session_capa_add(opb, CAPA_REFRESH, 0);
 
-       /* End-of-RIB marker, RFC 4724 */
-       if (p->capa.ann.restart) {      /* 2 bytes data */
-               u_char          c[2];
-
-               c[0] = 0x80; /* we're always restarting */
-               c[1] = 0;
-               errs += session_capa_add(opb, CAPA_RESTART, 2);
-               errs += ibuf_add(opb, &c, 2);
+       /* graceful restart and End-of-RIB marker, RFC 4724 */
+       if (p->capa.ann.grestart.restart) {
+               int             rst = 0;
+               u_int16_t       hdr;
+               u_int8_t        grlen;
+
+               if (mpcapa) {
+                       grlen = 2 + 4 * mpcapa;
+                       for (i = 0; i < AID_MAX; i++) {
+                               if (p->capa.neg.grestart.flags[i] &
+                                   CAPA_GR_RESTARTING)
+                                       rst++;
+                       }
+               } else {        /* AID_INET */
+                       grlen = 2 + 4;
+                       if (p->capa.neg.grestart.flags[AID_INET] &
+                           CAPA_GR_RESTARTING)
+                               rst++;
+               }
+
+               hdr = conf->holdtime;           /* default timeout */
+               /* if client does graceful restart don't set R flag */
+               if (!rst)
+                       hdr |= CAPA_GR_R_FLAG;
+               hdr = htons(hdr);
+
+               errs += session_capa_add(opb, CAPA_RESTART, grlen);
+               errs += ibuf_add(opb, &hdr, sizeof(hdr));
+
+               if (mpcapa) {
+                       for (i = 0; i < AID_MAX; i++) {
+                               if (p->capa.ann.mp[i]) {
+                                       errs += session_capa_add_gr(p, opb, i);
+                               }
+                       }
+               } else {        /* AID_INET */
+                       errs += session_capa_add_gr(p, opb, AID_INET);
+               }
        }
 
        /* 4-bytes AS numbers, draft-ietf-idr-as4bytes-13 */
@@ -1581,6 +1677,69 @@ session_rrefresh(struct peer *p, u_int8_
 }
 
 int
+session_graceful_restart(struct peer *p)
+{
+       u_int8_t        i;
+
+       timer_set(p, Timer_RestartTimeout, p->capa.neg.grestart.timeout);
+
+       for (i = 0; i < AID_MAX; i++) {
+               if (p->capa.neg.grestart.flags[i] & CAPA_GR_PRESENT) {
+                       if (imsg_compose(ibuf_rde, IMSG_SESSION_STALE,
+                           p->conf.id, 0, -1, &i, sizeof(i)) == -1)
+                               return (-1);
+                       log_peer_warnx(&p->conf,
+                           "graceful restart of %s, keeping routes",
+                           aid2str(i));
+                       p->capa.neg.grestart.flags[i] |= CAPA_GR_RESTARTING;
+               } else if (p->capa.neg.mp[i]) {
+                       if (imsg_compose(ibuf_rde, IMSG_SESSION_FLUSH,
+                           p->conf.id, 0, -1, &i, sizeof(i)) == -1)
+                               return (-1);
+                       log_peer_warnx(&p->conf,
+                           "graceful restart of %s, flushing routes",
+                           aid2str(i));
+               }
+       }
+       return (0);
+}
+
+int
+session_graceful_is_restarting(struct peer *p)
+{
+       u_int8_t        i;
+
+       for (i = 0; i < AID_MAX; i++)
+               if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING)
+                       return (1);
+       return (0);
+}
+
+int
+session_graceful_stop(struct peer *p)
+{
+       u_int8_t        i;
+
+       for (i = 0; i < AID_MAX; i++) {
+               /*
+                * Only flush if the peer is restarting and the peer indicated
+                * it holds the forwarding state. In all other cases the
+                * session was already flushed when the session came up.
+                */
+               if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING &&
+                   p->capa.neg.grestart.flags[i] & CAPA_GR_FORWARD) {
+                       log_peer_warnx(&p->conf, "graceful restart of %s, "
+                           "time-out, flushing", aid2str(i));
+                       if (imsg_compose(ibuf_rde, IMSG_SESSION_FLUSH,
+                           p->conf.id, 0, -1, &i, sizeof(i)) == -1)
+                               return (-1);
+               }
+               p->capa.neg.grestart.flags[i] &= ~CAPA_GR_RESTARTING;
+       }
+       return (0);
+}
+
+int
 session_dispatch_msg(struct pollfd *pfd, struct peer *p)
 {
        ssize_t         n;
@@ -2154,7 +2313,7 @@ parse_notification(struct peer *peer)
                                    "disabling route refresh capability");
                                break;
                        case CAPA_RESTART:
-                               peer->capa.ann.restart = 0;
+                               peer->capa.ann.grestart.restart = 0;
                                log_peer_warnx(&peer->conf,
                                    "disabling restart capability");
                                break;
@@ -2192,10 +2351,13 @@ parse_capabilities(struct peer *peer, u_
        u_int32_t        remote_as;
        u_int16_t        len;
        u_int16_t        afi;
+       u_int16_t        gr_header;
        u_int8_t         safi;
        u_int8_t         aid;
+       u_int8_t         gr_flags;
        u_int8_t         capa_code;
        u_int8_t         capa_len;
+       u_int8_t         i;
 
        len = dlen;
        while (len > 0) {
@@ -2247,8 +2409,50 @@ parse_capabilities(struct peer *peer, u_
                        peer->capa.peer.refresh = 1;
                        break;
                case CAPA_RESTART:
-                       peer->capa.peer.restart = 1;
-                       /* we don't care about the further restart capas yet */
+                       if (capa_len == 2) {
+                               /* peer only supports EoR marker */
+                               peer->capa.peer.grestart.restart = 1;
+                               peer->capa.peer.grestart.timeout = 0;
+                               break;
+                       } else if (capa_len % 4 != 2) {
+                               log_peer_warnx(&peer->conf,
+                                   "parse_capabilities: "
+                                   "expect len 2 + x*4, len is %u", capa_len);
+                               return (-1);
+                       }
+
+                       memcpy(&gr_header, capa_val, sizeof(gr_header));
+                       gr_header = ntohs(gr_header);
+                       peer->capa.peer.grestart.timeout =
+                           gr_header & CAPA_GR_TIMEMASK;
+                       if (peer->capa.peer.grestart.timeout == 0) {
+                               log_peer_warnx(&peer->conf,
+                                   "graceful restart timeout is zero");
+                               return (-1);
+                       }
+
+                       for (i = 2; i <= capa_len - 4; i += 4) {
+                               memcpy(&afi, capa_val + i, sizeof(afi));
+                               afi = ntohs(afi);
+                               memcpy(&safi, capa_val + i + 2, sizeof(safi));
+                               if (afi2aid(afi, safi, &aid) == -1) {
+                                       log_peer_warnx(&peer->conf,
+                                           "parse_capabilities: restart: AFI "
+                                           "%u, safi %u unknown", afi, safi);
+                                       return (-1);
+                               }
+                               memcpy(&gr_flags, capa_val + i + 3,
+                                   sizeof(gr_flags));
+                               peer->capa.peer.grestart.flags[aid] |=
+                                   CAPA_GR_PRESENT;
+                               if (gr_flags & CAPA_GR_F_FLAG)
+                                       peer->capa.peer.grestart.flags[aid] |=
+                                           CAPA_GR_FORWARD;
+                               if (gr_header & CAPA_GR_R_FLAG)
+                                       peer->capa.peer.grestart.flags[aid] |=
+                                           CAPA_GR_RESTART;
+                               peer->capa.peer.grestart.restart = 2;
+                       }
                        break;
                case CAPA_AS4BYTE:
                        if (capa_len != 4) {
@@ -2291,11 +2495,40 @@ capa_neg_calc(struct peer *p)
                } else
                        p->capa.neg.mp[i] = 0;
        }
-       /* if no MP capability present for default IPv4 unicast mode */
+       /* if no MP capability present default to IPv4 unicast mode */
        if (!hasmp)
                p->capa.neg.mp[AID_INET] = 1;
 
-       p->capa.neg.restart = p->capa.peer.restart;
+       /*
+        * graceful restart: only the peer capabilities are of interest here.
+        * It is necessary to compare the new values with the previous ones
+        * and act accordingly. AFI/SAFI that are not part of the MP capability
+        * are treated as not being present.
+        */
+
+       for (i = 0; i < AID_MAX; i++) {
+               /* disable GR if the AFI/SAFI is not present */
+               if (p->capa.peer.grestart.flags[i] & CAPA_GR_PRESENT &&
+                   p->capa.neg.mp[i] == 0)
+                       p->capa.peer.grestart.flags[i] = 0;     /* disable */
+               /* look at current GR state and decide what to do */
+               if (p->capa.neg.grestart.flags[i] & CAPA_GR_RESTARTING) {
+                       if (!(p->capa.peer.grestart.flags[i] &
+                           CAPA_GR_FORWARD)) {
+                               if (imsg_compose(ibuf_rde, IMSG_SESSION_FLUSH,
+                                   p->conf.id, 0, -1, &i, sizeof(i)) == -1)
+                                       return (-1);
+                               log_peer_warnx(&p->conf, "graceful restart of "
+                                   "%s, not restarted, flushing", aid2str(i));
+                       }
+                       p->capa.neg.grestart.flags[i] =
+                           p->capa.peer.grestart.flags[i] | CAPA_GR_RESTARTING;
+               } else
+                       p->capa.neg.grestart.flags[i] =
+                           p->capa.peer.grestart.flags[i];
+       }
+       p->capa.neg.grestart.timeout = p->capa.peer.grestart.timeout;
+       p->capa.neg.grestart.restart = p->capa.peer.grestart.restart;
 
        return (0);
 }
@@ -2313,7 +2546,7 @@ session_dispatch_imsg(struct imsgbuf *ib
        u_char                  *data;
        enum reconf_action       reconf;
        int                      n, depend_ok, restricted;
-       u_int8_t                 errcode, subcode;
+       u_int8_t                 aid, errcode, subcode;
 
        if ((n = imsg_read(ibuf)) == -1)
                fatal("session_dispatch_imsg: imsg_read error");
@@ -2624,6 +2857,40 @@ session_dispatch_imsg(struct imsgbuf *ib
                                break;
                        }
                        break;
+               case IMSG_SESSION_RESTARTED:
+                       if (idx != PFD_PIPE_ROUTE)
+                               fatalx("update request not from RDE");
+                       if (imsg.hdr.len < IMSG_HEADER_SIZE + sizeof(aid)) {
+                               log_warnx("RDE sent invalid restart msg");
+                               break;
+                       }
+                       if ((p = getpeerbyid(imsg.hdr.peerid)) == NULL) {
+                               log_warnx("no such peer: id=%u",
+                                   imsg.hdr.peerid);
+                               break;
+                       }
+                       memcpy(&aid, imsg.data, sizeof(aid));
+                       if (aid >= AID_MAX)
+                               fatalx("IMSG_SESSION_RESTARTED: bad AID");
+                       if (p->capa.neg.grestart.flags[aid] &
+                           CAPA_GR_RESTARTING &&
+                           p->capa.neg.grestart.flags[aid] &
+                           CAPA_GR_FORWARD) {
+                               log_peer_warnx(&p->conf,
+                                   "graceful restart of %s finished",
+                                   aid2str(aid));
+                               p->capa.neg.grestart.flags[aid] &=
+                                   ~CAPA_GR_RESTARTING;
+                               timer_stop(p, Timer_RestartTimeout);
+
+                               /* signal back to RDE to cleanup stale routes */
+                               if (imsg_compose(ibuf_rde,
+                                   IMSG_SESSION_RESTARTED, imsg.hdr.peerid, 0,
+                                   -1, &aid, sizeof(aid)) == -1)
+                                       fatal("imsg_compose: "
+                                           "IMSG_SESSION_RESTARTED");
+                       }
+                       break;
                default:
                        break;
                }
@@ -2814,9 +3081,10 @@ session_up(struct peer *p)
 {
        struct session_up        sup;
 
-       if (imsg_compose(ibuf_rde, IMSG_SESSION_ADD, p->conf.id, 0, -1,
-           &p->conf, sizeof(p->conf)) == -1)
-               fatalx("imsg_compose error");
+       if (!session_graceful_is_restarting(p))
+               if (imsg_compose(ibuf_rde, IMSG_SESSION_ADD, p->conf.id, 0, -1,
+                   &p->conf, sizeof(p->conf)) == -1)
+                       fatalx("imsg_compose error");
 
        sa2addr((struct sockaddr *)&p->sa_local, &sup.local_addr);
        sa2addr((struct sockaddr *)&p->sa_remote, &sup.remote_addr);
Index: bgpd/session.h
===================================================================
RCS file: /cvs/src/usr.sbin/bgpd/session.h,v
retrieving revision 1.113
diff -u -p -r1.113 session.h
--- bgpd/session.h      12 Apr 2012 17:26:09 -0000      1.113
+++ bgpd/session.h      8 Jul 2012 08:52:04 -0000
@@ -162,8 +162,10 @@ struct peer_stats {
        u_int64_t                msg_sent_rrefresh;
        u_int64_t                prefix_rcvd_update;
        u_int64_t                prefix_rcvd_withdraw;
+       u_int64_t                prefix_rcvd_eor;
        u_int64_t                prefix_sent_update;
        u_int64_t                prefix_sent_withdraw;
+       u_int64_t                prefix_sent_eor;
        time_t                   last_updown;
        time_t                   last_read;
        u_int32_t                prefix_cnt;
@@ -179,6 +181,7 @@ enum Timer {
        Timer_IdleHold,
        Timer_IdleHoldReset,
        Timer_CarpUndemote,
+       Timer_RestartTimeout,
        Timer_Max
 };
 

----- End forwarded message -----

Reply via email to