From: Pradosh Mohapatra <[email protected]>
ISSUE
BGP starts the routeadv timer (peer->t_routeadv) to expire in 1 sec
when a peer is established. From then on, the timer expires
periodically based on the configured MRAI value (default: 30sec for
EBGP, 5sec for IBGP). At the expiry, the write thread is triggered
that takes the routes from peer's sync FIFO (adj-rib-out) and sends
UPDATEs. This has a few drawbacks:
(1) Delay in new route announcement: Even when the last UPDATE message
was sent a while back, the next route change will necessarily have
to wait for routeadv expiry
(2) CPU usage: The timer is always armed. If the operator chooses to
configure a lower value of MRAI (zero second is a preferred choice
in many deployments) for better convergence, it leads to high CPU
usage for BGP process, even at the times of no network churn.
PATCH
Make the route advertisement event-driven - When routes are added to
peer's sync FIFO, check if the routeadv timer needs to be adjusted (or
started). Conversely, do not arm the routeadv timer unconditionally.
The patch also addresses route announcements during read-only mode
(update-delay). During read-only mode operation, the routeadv timer
is not started. When BGP comes out of read-only mode and all the
routes are processed, the timer is started for all peers with zero
expiry, so that the UPDATEs can be sent all at once. This leads to
(near-)optimal UPDATE packing.
Finally, the patch makes the "max # packets to write to peer socket at
a time" configurable. Currently it is hard-coded to 10. The command is
at the top router-bgp mode and is called "write-quanta <number>". It
is a useful convergence parameter to tweak.
Signed-off-by: Pradosh Mohapatra <[email protected]>
Reviewed-by: Daniel Walton <[email protected]>
Signed-off-by: Donald Sharp <[email protected]>
---
bgpd/bgp_advertise.c | 3 ++
bgpd/bgp_fsm.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++-----
bgpd/bgp_fsm.h | 18 +++++++
bgpd/bgp_packet.c | 13 +++--
bgpd/bgp_route.c | 64 ++++++++++++++++++++---
bgpd/bgp_route.h | 6 +++
bgpd/bgp_vty.c | 63 +++++++++++++++++++++--
bgpd/bgp_vty.h | 1 +
bgpd/bgpd.c | 5 ++
bgpd/bgpd.h | 3 ++
10 files changed, 292 insertions(+), 26 deletions(-)
diff --git a/bgpd/bgp_advertise.c b/bgpd/bgp_advertise.c
index bbdba22..be1b1f3 100644
--- a/bgpd/bgp_advertise.c
+++ b/bgpd/bgp_advertise.c
@@ -267,6 +267,9 @@ bgp_adj_out_set (struct bgp_node *rn, struct peer *peer,
struct prefix *p,
/* Add new advertisement to advertisement attribute list. */
bgp_advertise_add (adv->baa, adv);
+ if (FIFO_EMPTY(&peer->sync[afi][safi]->update))
+ bgp_adjust_routeadv(peer);
+
BGP_ADV_FIFO_ADD (&peer->sync[afi][safi]->update, &adv->fifo);
}
diff --git a/bgpd/bgp_fsm.c b/bgpd/bgp_fsm.c
index f1781e6..54afdcc 100644
--- a/bgpd/bgp_fsm.c
+++ b/bgpd/bgp_fsm.c
@@ -288,6 +288,22 @@ bgp_keepalive_timer (struct thread *thread)
}
static int
+bgp_routeq_empty (struct peer *peer)
+{
+ afi_t afi;
+ safi_t safi;
+
+ for (afi = AFI_IP; afi < AFI_MAX; afi++)
+ for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
+ {
+ if (!FIFO_EMPTY(&peer->sync[afi][safi]->withdraw) ||
+ !FIFO_EMPTY(&peer->sync[afi][safi]->update))
+ return 0;
+ }
+ return 1;
+}
+
+static int
bgp_routeadv_timer (struct thread *thread)
{
struct peer *peer;
@@ -304,8 +320,14 @@ bgp_routeadv_timer (struct thread *thread)
BGP_WRITE_ON (peer->t_write, bgp_write, peer->fd);
- BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer,
- peer->v_routeadv);
+ /*
+ * If there is no UPDATE to send, don't start the timer. We will start
+ * it when the queues go non-empty.
+ */
+ if (bgp_routeq_empty(peer))
+ return 0;
+
+ BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, peer->v_routeadv);
return 0;
}
@@ -441,6 +463,13 @@ bgp_update_delay_end (struct bgp *bgp)
quagga_timestamp(3, bgp->update_delay_end_time,
sizeof(bgp->update_delay_end_time));
+ /*
+ * Add an end-of-initial-update marker to the main process queues so that
+ * the route advertisement timer for the peers can be started.
+ */
+ bgp_add_eoiu_mark(bgp, BGP_TABLE_MAIN);
+ bgp_add_eoiu_mark(bgp, BGP_TABLE_RSCLIENT);
+
/* Route announcements were postponed for all the peers during read-only
mode,
send those now. */
for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer))
@@ -452,6 +481,92 @@ bgp_update_delay_end (struct bgp *bgp)
work_queue_unplug(bm->process_rsclient_queue);
}
+/**
+ * see bgp_fsm.h
+ */
+void
+bgp_start_routeadv (struct bgp *bgp)
+{
+ struct listnode *node, *nnode;
+ struct peer *peer;
+
+ for (ALL_LIST_ELEMENTS (bgp->peer, node, nnode, peer))
+ {
+ if (peer->status != Established)
+ continue;
+ BGP_TIMER_OFF(peer->t_routeadv);
+ BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, 0);
+ }
+}
+
+/**
+ * see bgp_fsm.h
+ */
+void
+bgp_adjust_routeadv (struct peer *peer)
+{
+ time_t nowtime = bgp_clock();
+ double diff;
+ unsigned long remain;
+
+ /*
+ * CASE I:
+ * If the last update was written more than MRAI back, expire the timer
+ * instantly so that we can send the update out sooner.
+ *
+ * <------- MRAI --------->
+ * |-----------------|-----------------------|
+ * <------------- m ------------>
+ * ^ ^ ^
+ * | | |
+ * | | current time
+ * | timer start
+ * last write
+ *
+ * m > MRAI
+ */
+ diff = difftime(nowtime, peer->last_write);
+ if (diff > (double) peer->v_routeadv)
+ {
+ BGP_TIMER_OFF(peer->t_routeadv);
+ BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, 0);
+ if (BGP_DEBUG (update, UPDATE_OUT))
+ zlog (peer->log, LOG_DEBUG, "%s: MRAI timer to expire instantly\n",
+ peer->host);
+ return;
+ }
+
+ /*
+ * CASE II:
+ * - Find when to expire the MRAI timer.
+ * If MRAI timer is not active, assume we can start it now.
+ *
+ * <------- MRAI --------->
+ * |------------|-----------------------|
+ * <-------- m ----------><----- r ----->
+ * ^ ^ ^
+ * | | |
+ * | | current time
+ * | timer start
+ * last write
+ *
+ * (MRAI - m) < r
+ */
+ if (peer->t_routeadv)
+ remain = thread_timer_remain_second(peer->t_routeadv);
+ else
+ remain = peer->v_routeadv;
+ diff = peer->v_routeadv - diff;
+ if (diff <= (double) remain)
+ {
+ BGP_TIMER_OFF(peer->t_routeadv);
+ BGP_TIMER_ON(peer->t_routeadv, bgp_routeadv_timer, diff);
+ if (BGP_DEBUG (update, UPDATE_OUT))
+ zlog (peer->log, LOG_DEBUG, "%s: MRAI timer to expire in %f secs\n",
+ peer->host, diff);
+ }
+}
+
/* The update delay timer expiry callback. */
static int
bgp_update_delay_timer (struct thread *thread)
@@ -928,16 +1043,12 @@ bgp_fsm_open (struct peer *peer)
static int
bgp_fsm_keepalive_expire (struct peer *peer)
{
- afi_t afi;
- safi_t safi;
-
- for (afi = AFI_IP; afi < AFI_MAX; afi++)
- for (safi = SAFI_UNICAST; safi < SAFI_MAX; safi++)
- {
- if (!FIFO_EMPTY(&peer->sync[afi][safi]->withdraw) ||
- !FIFO_EMPTY(&peer->sync[afi][safi]->update))
- return 0;
- }
+ /*
+ * If there are UPDATE messages to send, no need to send keepalive. The
+ * peer will note our progress through the UPDATEs.
+ */
+ if (!bgp_routeq_empty(peer))
+ return 0;
bgp_keepalive_send (peer);
return 0;
@@ -1070,7 +1181,12 @@ bgp_establish (struct peer *peer)
bgp_announce_route_all (peer);
- BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, 1);
+ /* Start the route advertisement timer to send updates to the peer - if BGP
+ * is not in read-only mode. If it is, the timer will be started at the end
+ * of read-only mode.
+ */
+ if (!bgp_update_delay_active(peer->bgp))
+ BGP_TIMER_ON (peer->t_routeadv, bgp_routeadv_timer, 0);
return 0;
}
diff --git a/bgpd/bgp_fsm.h b/bgpd/bgp_fsm.h
index 2144fd9..bc04c72 100644
--- a/bgpd/bgp_fsm.h
+++ b/bgpd/bgp_fsm.h
@@ -71,6 +71,8 @@ Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA
thread_cancel_event (bm->master, (P)); \
} while (0)
+#define BGP_MSEC_JITTER 10
+
/* Prototypes. */
extern int bgp_event (struct thread *);
extern int bgp_stop (struct peer *peer);
@@ -79,4 +81,20 @@ extern void bgp_fsm_change_status (struct peer *peer, int
status);
extern const char *peer_down_str[];
extern void bgp_update_delay_end (struct bgp *);
+/**
+ * Start the route advertisement timer (that honors MRAI) for all the
+ * peers. Typically called at the end of initial convergence, coming
+ * out of read-only mode.
+ */
+extern void bgp_start_routeadv (struct bgp *);
+
+/**
+ * See if the route advertisement timer needs to be adjusted for a
+ * peer. For example, if the last update was written to the peer a
+ * long while back, we don't need to wait for the periodic advertisement
+ * timer to expire to send the new set of prefixes. It should fire
+ * instantly and updates should go out sooner.
+ */
+extern void bgp_adjust_routeadv (struct peer *);
+
#endif /* _QUAGGA_BGP_FSM_H */
diff --git a/bgpd/bgp_packet.c b/bgpd/bgp_packet.c
index 3aa7735..16c355f 100644
--- a/bgpd/bgp_packet.c
+++ b/bgpd/bgp_packet.c
@@ -605,7 +605,7 @@ bgp_write_packet (struct peer *peer)
adv = BGP_ADV_FIFO_HEAD (&peer->sync[afi][safi]->update);
if (adv)
{
- if (adv->binfo && adv->binfo->uptime < peer->synctime)
+ if (adv->binfo && adv->binfo->uptime <= peer->synctime)
{
if (CHECK_FLAG (adv->binfo->peer->cap, PEER_CAP_RESTART_RCV)
&& CHECK_FLAG (adv->binfo->peer->cap, PEER_CAP_RESTART_ADV)
@@ -678,6 +678,7 @@ bgp_write (struct thread *thread)
struct stream *s;
int num;
unsigned int count = 0;
+ unsigned int oc = 0;
/* Yes first of all get peer pointer. */
peer = THREAD_ARG (thread);
@@ -696,6 +697,8 @@ bgp_write (struct thread *thread)
sockopt_cork (peer->fd, 1);
+ oc = peer->update_out;
+
/* Nonblocking write until TCP output buffer is full. */
do
{
@@ -763,13 +766,17 @@ bgp_write (struct thread *thread)
/* OK we send packet so delete it. */
bgp_packet_delete (peer);
}
- while (++count < BGP_WRITE_PACKET_MAX &&
+ while (++count < peer->bgp->wpkt_quanta &&
(s = bgp_write_packet (peer)) != NULL);
-
+
if (bgp_write_proceed (peer))
BGP_WRITE_ON (peer->t_write, bgp_write, peer->fd);
done:
+ /* Update the last write if some updates were written. */
+ if (peer->update_out > oc)
+ peer->last_write = bgp_clock ();
+
sockopt_cork (peer->fd, 0);
return 0;
}
diff --git a/bgpd/bgp_route.c b/bgpd/bgp_route.c
index 42ca41f..f72c72e 100644
--- a/bgpd/bgp_route.c
+++ b/bgpd/bgp_route.c
@@ -1551,8 +1551,17 @@ bgp_process_rsclient (struct work_queue *wq, void *data)
struct bgp_info *old_select;
struct bgp_info_pair old_and_new;
struct listnode *node, *nnode;
- struct peer *rsclient = bgp_node_table (rn)->owner;
-
+ struct peer *rsclient;
+
+ /* Is it end of initial update? (after startup) */
+ if (!rn)
+ {
+ bgp_start_routeadv(bgp);
+ return WQ_SUCCESS;
+ }
+
+ rsclient = bgp_node_table (rn)->owner;
+
/* Best path selection. */
bgp_best_selection (bgp, rn, &bgp->maxpaths[afi][safi], &old_and_new);
new_select = old_and_new.new;
@@ -1615,7 +1624,14 @@ bgp_process_main (struct work_queue *wq, void *data)
struct bgp_info_pair old_and_new;
struct listnode *node, *nnode;
struct peer *peer;
-
+
+ /* Is it end of initial update? (after startup) */
+ if (!rn)
+ {
+ bgp_start_routeadv(bgp);
+ return WQ_SUCCESS;
+ }
+
/* Best path selection. */
bgp_best_selection (bgp, rn, &bgp->maxpaths[afi][safi], &old_and_new);
old_select = old_and_new.old;
@@ -1682,11 +1698,15 @@ static void
bgp_processq_del (struct work_queue *wq, void *data)
{
struct bgp_process_queue *pq = data;
- struct bgp_table *table = bgp_node_table (pq->rn);
-
+ struct bgp_table *table;
+
bgp_unlock (pq->bgp);
- bgp_unlock_node (pq->rn);
- bgp_table_unlock (table);
+ if (pq->rn)
+ {
+ table = bgp_node_table (pq->rn);
+ bgp_unlock_node (pq->rn);
+ bgp_table_unlock (table);
+ }
XFREE (MTYPE_BGP_PROCESS_QUEUE, pq);
}
@@ -1755,6 +1775,36 @@ bgp_process (struct bgp *bgp, struct bgp_node *rn, afi_t
afi, safi_t safi)
return;
}
+void
+bgp_add_eoiu_mark (struct bgp *bgp, bgp_table_t type)
+{
+ struct bgp_process_queue *pqnode;
+
+ if ( (bm->process_main_queue == NULL) ||
+ (bm->process_rsclient_queue == NULL) )
+ bgp_process_queue_init ();
+
+ pqnode = XCALLOC (MTYPE_BGP_PROCESS_QUEUE,
+ sizeof (struct bgp_process_queue));
+ if (!pqnode)
+ return;
+
+ pqnode->rn = NULL;
+ pqnode->bgp = bgp;
+ bgp_lock (bgp);
+ switch (type)
+ {
+ case BGP_TABLE_MAIN:
+ work_queue_add (bm->process_main_queue, pqnode);
+ break;
+ case BGP_TABLE_RSCLIENT:
+ work_queue_add (bm->process_rsclient_queue, pqnode);
+ break;
+ }
+
+ return;
+}
+
static int
bgp_maximum_prefix_restart_timer (struct thread *thread)
{
diff --git a/bgpd/bgp_route.h b/bgpd/bgp_route.h
index e1eeb7e..7bd9ee2 100644
--- a/bgpd/bgp_route.h
+++ b/bgpd/bgp_route.h
@@ -237,6 +237,12 @@ extern int bgp_withdraw (struct peer *, struct prefix *,
struct attr *,
/* for bgp_nexthop and bgp_damp */
extern void bgp_process (struct bgp *, struct bgp_node *, afi_t, safi_t);
+
+/*
+ * Add an end-of-initial-update marker to the process queue. This is just a
+ * queue element with NULL bgp node.
+ */
+extern void bgp_add_eoiu_mark (struct bgp *, bgp_table_t);
extern int bgp_config_write_table_map (struct vty *, struct bgp *, afi_t,
safi_t,
int *);
extern int bgp_config_write_network (struct vty *, struct bgp *, afi_t,
safi_t, int *);
diff --git a/bgpd/bgp_vty.c b/bgpd/bgp_vty.c
index d322c53..0412a65 100644
--- a/bgpd/bgp_vty.c
+++ b/bgpd/bgp_vty.c
@@ -49,6 +49,7 @@ Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA
#include "bgpd/bgp_table.h"
#include "bgpd/bgp_vty.h"
#include "bgpd/bgp_mpath.h"
+#include "bgpd/bgp_packet.h"
extern struct in_addr router_id_zebra;
@@ -801,6 +802,53 @@ ALIAS (no_bgp_update_delay,
"Wait for peers to be established\n"
"Seconds\n")
+static int
+bgp_wpkt_quanta_config_vty (struct vty *vty, const char *num, char set)
+{
+ struct bgp *bgp;
+
+ bgp = vty->index;
+
+ if (set)
+ VTY_GET_INTEGER_RANGE ("write-quanta", bgp->wpkt_quanta, num,
+ 1, 4294967295);
+ else
+ bgp->wpkt_quanta = BGP_WRITE_PACKET_MAX;
+
+ return CMD_SUCCESS;
+}
+
+int
+bgp_config_write_wpkt_quanta (struct vty *vty, struct bgp *bgp)
+{
+ if (bgp->wpkt_quanta != BGP_WRITE_PACKET_MAX)
+ vty_out (vty, " write-quanta %d%s",
+ bgp->wpkt_quanta, VTY_NEWLINE);
+
+ return 0;
+}
+
+
+/* Update-delay configuration */
+DEFUN (bgp_wpkt_quanta,
+ bgp_wpkt_quanta_cmd,
+ "write-quanta <1-4294967295>",
+ "How many packets to write to peer socket per run\n"
+ "Number of packets\n")
+{
+ return bgp_wpkt_quanta_config_vty(vty, argv[0], 1);
+}
+
+/* Update-delay deconfiguration */
+DEFUN (no_bgp_wpkt_quanta,
+ no_bgp_wpkt_quanta_cmd,
+ "no write-quanta <1-4294967295>",
+ "How many packets to write to peer socket per run\n"
+ "Number of packets\n")
+{
+ return bgp_wpkt_quanta_config_vty(vty, argv[0], 0);
+}
+
/* Maximum-paths configuration */
DEFUN (bgp_maxpaths,
bgp_maxpaths_cmd,
@@ -7756,9 +7804,11 @@ bgp_show_peer (struct vty *vty, struct peer *p)
/* read timer */
vty_out (vty, " Last read %s", peer_uptime (p->readtime, timebuf,
BGP_UPTIME_LEN));
+ vty_out (vty, ", Last write %s%s",
+ peer_uptime (p->last_write, timebuf, BGP_UPTIME_LEN), VTY_NEWLINE);
/* Configured timer values. */
- vty_out (vty, ", hold time is %d, keepalive interval is %d seconds%s",
+ vty_out (vty, " Hold time is %d, keepalive interval is %d seconds%s",
p->v_holdtime, p->v_keepalive, VTY_NEWLINE);
if (CHECK_FLAG (p->config, PEER_CONFIG_TIMER))
{
@@ -8059,8 +8109,12 @@ bgp_show_peer (struct vty *vty, struct peer *p)
if (p->t_connect)
vty_out (vty, "Next connect timer due in %ld seconds%s",
thread_timer_remain_second (p->t_connect), VTY_NEWLINE);
-
- vty_out (vty, "Read thread: %s Write thread: %s%s",
+ if (p->t_routeadv)
+ vty_out (vty, "MRAI (interval %d) timer expires in %ld seconds%s",
+ p->v_routeadv, thread_timer_remain_second (p->t_routeadv),
+ VTY_NEWLINE);
+
+ vty_out (vty, "Read thread: %s Write thread: %s%s",
p->t_read ? "on" : "off",
p->t_write ? "on" : "off",
VTY_NEWLINE);
@@ -9309,6 +9363,9 @@ bgp_vty_init (void)
install_element (BGP_NODE, &bgp_update_delay_establish_wait_cmd);
install_element (BGP_NODE, &no_bgp_update_delay_establish_wait_cmd);
+ install_element (BGP_NODE, &bgp_wpkt_quanta_cmd);
+ install_element (BGP_NODE, &no_bgp_wpkt_quanta_cmd);
+
/* "maximum-paths" commands. */
install_element (BGP_NODE, &bgp_maxpaths_cmd);
install_element (BGP_NODE, &no_bgp_maxpaths_cmd);
diff --git a/bgpd/bgp_vty.h b/bgpd/bgp_vty.h
index 61c114e..96a6711 100644
--- a/bgpd/bgp_vty.h
+++ b/bgpd/bgp_vty.h
@@ -28,5 +28,6 @@ Software Foundation, Inc., 59 Temple Place - Suite 330,
Boston, MA
extern void bgp_vty_init (void);
extern const char *afi_safi_print (afi_t, safi_t);
extern int bgp_config_write_update_delay (struct vty *, struct bgp *);
+extern int bgp_config_write_wpkt_quanta(struct vty *vty, struct bgp *bgp);
#endif /* _QUAGGA_BGP_VTY_H */
diff --git a/bgpd/bgpd.c b/bgpd/bgpd.c
index 1719111..731185f 100644
--- a/bgpd/bgpd.c
+++ b/bgpd/bgpd.c
@@ -2044,6 +2044,8 @@ bgp_create (as_t *as, const char *name)
if (name)
bgp->name = strdup (name);
+ bgp->wpkt_quanta = BGP_WRITE_PACKET_MAX;
+
THREAD_TIMER_ON (bm->master, bgp->t_startup, bgp_startup_timer_expire,
bgp, bgp->restart_time);
@@ -5444,6 +5446,9 @@ bgp_config_write (struct vty *vty)
/* BGP update-delay. */
bgp_config_write_update_delay (vty, bgp);
+ /* write quanta */
+ bgp_config_write_wpkt_quanta (vty, bgp);
+
/* BGP graceful-restart. */
if (bgp->stalepath_time != BGP_DEFAULT_STALEPATH_TIME)
vty_out (vty, " bgp graceful-restart stalepath-time %d%s",
diff --git a/bgpd/bgpd.h b/bgpd/bgpd.h
index 7653e50..c0a3248 100644
--- a/bgpd/bgpd.h
+++ b/bgpd/bgpd.h
@@ -196,6 +196,8 @@ struct bgp
u_int16_t ibgp_flags;
#define BGP_FLAG_IBGP_MULTIPATH_SAME_CLUSTERLEN (1 << 0)
} maxpaths[AFI_MAX][SAFI_MAX];
+
+ u_int32_t wpkt_quanta; /* per peer packet quanta to write */
};
/* BGP peer-group support. */
@@ -534,6 +536,7 @@ struct peer
/* Syncronization list and time. */
struct bgp_synchronize *sync[AFI_MAX][SAFI_MAX];
time_t synctime;
+ time_t last_write; /* timestamp when the last UPDATE msg was written */
/* Send prefix count. */
unsigned long scount[AFI_MAX][SAFI_MAX];
--
1.9.1
_______________________________________________
Quagga-dev mailing list
[email protected]
https://lists.quagga.net/mailman/listinfo/quagga-dev