This is a major diff which replaces some old code of bgpd. Using the Adj-RIB-Out it is possible to replace the old code generating updates and queuing them in two RB trees with a RB tree of prefix elements. The big benefit of this is a lot less memory pressure during large operations (e.g. when starting the daemon).
I have been running this on some production system for a while with no issues but since this change is rather big I would prefer more testing and also review of the diff. The change from before is the introduction of a RB tree element in struct prefix which allows to link prefixes from the Adj-RIB-Out onto update or withdraw trees. The trees are sorted by AS path, nexthop, flags and finally prefix. This way the grouping of prefixes still works while actually not consuming more memory to get there. This is the reason why the spikes of memory usage are now lower. Additionally until now the malloc bucket for struct prefix was also used by the update trees and caused a lot of fragmentation in them which resulted in excessive memory usage (compared to the amount of show rib mem). The behaviour should be the same and it should hopefully also solve the issue with sending out all EoR records on MP sessions (IP + IPv6). -- :wq Claudio Index: mrt.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/mrt.c,v retrieving revision 1.88 diff -u -p -r1.88 mrt.c --- mrt.c 30 Dec 2018 13:53:07 -0000 1.88 +++ mrt.c 31 Dec 2018 15:20:03 -0000 @@ -510,6 +510,9 @@ mrt_dump_entry_v2(struct mrt *mrt, struc struct bgpd_addr *nh; struct ibuf *tbuf; + if (prefix_aspath(p) == NULL) + continue; + nexthop = prefix_nexthop(p); if (nexthop == NULL) { bzero(&addr, sizeof(struct bgpd_addr)); @@ -672,6 +675,8 @@ mrt_dump_upcall(struct rib_entry *re, vo * be dumped p should be set to p = pt->active. 
*/ LIST_FOREACH(p, &re->prefix_h, rib_l) { + if (prefix_aspath(p) == NULL) + continue; if (mrtbuf->type == MRT_TABLE_DUMP) mrt_dump_entry(mrtbuf, p, mrtbuf->seqnum++, prefix_peer(p)); Index: rde.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.c,v retrieving revision 1.458 diff -u -p -r1.458 rde.c --- rde.c 31 Dec 2018 08:53:09 -0000 1.458 +++ rde.c 31 Dec 2018 15:20:03 -0000 @@ -2260,6 +2260,8 @@ rde_dump_filter(struct prefix *p, struct return; asp = prefix_aspath(p); + if (asp == NULL) + return; if ((req->flags & F_CTL_ACTIVE) && p->re->active != p) return; if ((req->flags & F_CTL_INVALID) && @@ -2651,8 +2653,9 @@ rde_up_dump_done(void *ptr, u_int8_t aid { struct rde_peer *peer = ptr; + peer->throttled = 0; if (peer->capa.grestart.restart) - up_generate_marker(peer, aid); + prefix_add_eor(peer, aid); } u_char queue_buf[4096]; @@ -2674,8 +2677,8 @@ rde_update_queue_pending(void) if (peer->throttled) continue; for (aid = 0; aid < AID_MAX; aid++) { - if (!TAILQ_EMPTY(&peer->updates[aid]) || - !TAILQ_EMPTY(&peer->withdraws[aid])) + if (!RB_EMPTY(&peer->updates[aid]) || + !RB_EMPTY(&peer->withdraws[aid])) return 1; } } @@ -2687,7 +2690,7 @@ rde_update_queue_runner(void) { struct rde_peer *peer; int r, sent, max = RDE_RUNNER_ROUNDS, eor; - u_int16_t len, wd_len, wpos; + u_int16_t len, wpos; len = sizeof(queue_buf) - MSGSIZE_HEADER; do { @@ -2700,48 +2703,33 @@ rde_update_queue_runner(void) if (peer->throttled) continue; eor = 0; - /* first withdraws */ - wpos = 2; /* reserve space for the length field */ - r = up_dump_prefix(queue_buf + wpos, len - wpos - 2, - &peer->withdraws[AID_INET], peer, 1); - wd_len = r; - /* write withdraws length filed */ - wd_len = htons(wd_len); - memcpy(queue_buf, &wd_len, 2); + wpos = 0; + /* first withdraws, save 2 bytes for path attributes */ + if ((r = up_dump_withdraws(queue_buf, len - 2, peer, + AID_INET)) == -1) + continue; wpos += r; - /* now bgp path attributes */ - r = 
up_dump_attrnlri(queue_buf + wpos, len - wpos, - peer); - switch (r) { - case -1: + /* now bgp path attributes unless it is the EoR mark */ + if (up_is_eor(peer, AID_INET)) { eor = 1; - if (wd_len == 0) { - /* no withdraws queued just send EoR */ - peer_send_eor(peer, AID_INET); - continue; - } - break; - case 2: - if (wd_len == 0) { - /* - * No packet to send. No withdraws and - * no path attributes. Skip. - */ - continue; - } - /* FALLTHROUGH */ - default: + bzero(queue_buf + wpos, 2); + wpos += 2; + } else { + r = up_dump_attrnlri(queue_buf + wpos, + len - wpos, peer); wpos += r; - break; } /* finally send message to SE */ - if (imsg_compose(ibuf_se, IMSG_UPDATE, peer->conf.id, - 0, -1, queue_buf, wpos) == -1) - fatal("%s %d imsg_compose error", __func__, - __LINE__); - sent++; + if (wpos > 4) { + if (imsg_compose(ibuf_se, IMSG_UPDATE, + peer->conf.id, 0, -1, queue_buf, + wpos) == -1) + fatal("%s %d imsg_compose error", + __func__, __LINE__); + sent++; + } if (eor) peer_send_eor(peer, AID_INET); } @@ -2753,7 +2741,6 @@ void rde_update6_queue_runner(u_int8_t aid) { struct rde_peer *peer; - u_char *b; int r, sent, max = RDE_RUNNER_ROUNDS / 2; u_int16_t len; @@ -2768,13 +2755,12 @@ rde_update6_queue_runner(u_int8_t aid) if (peer->throttled) continue; len = sizeof(queue_buf) - MSGSIZE_HEADER; - b = up_dump_mp_unreach(queue_buf, &len, peer, aid); - - if (b == NULL) + r = up_dump_mp_unreach(queue_buf, len, peer, aid); + if (r == -1) continue; /* finally send message to SE */ if (imsg_compose(ibuf_se, IMSG_UPDATE, peer->conf.id, - 0, -1, b, len) == -1) + 0, -1, queue_buf, r) == -1) fatal("%s %d imsg_compose error", __func__, __LINE__); sent++; @@ -2794,21 +2780,17 @@ rde_update6_queue_runner(u_int8_t aid) if (peer->throttled) continue; len = sizeof(queue_buf) - MSGSIZE_HEADER; - r = up_dump_mp_reach(queue_buf, &len, peer, aid); - switch (r) { - case -2: - continue; - case -1: + if (up_is_eor(peer, aid)) { peer_send_eor(peer, aid); continue; - default: - b = queue_buf + 
r; - break; } + r = up_dump_mp_reach(queue_buf, len, peer, aid); + if (r == 0) + continue; /* finally send message to SE */ if (imsg_compose(ibuf_se, IMSG_UPDATE, peer->conf.id, - 0, -1, b, len) == -1) + 0, -1, queue_buf, r) == -1) fatal("%s %d imsg_compose error", __func__, __LINE__); sent++; @@ -3333,7 +3315,6 @@ peer_add(u_int32_t id, struct peer_confi if (peer->loc_rib_id == RIB_NOTFOUND) fatalx("King Bula's new peer met an unknown RIB"); peer->state = PEER_NONE; - up_init(peer); head = PEER_HASH(id); @@ -3396,6 +3377,20 @@ peer_localaddrs(struct rde_peer *peer, s return (0); } +static void +peer_adjout_flush_upcall(struct rib_entry *re, void *arg) +{ + struct rde_peer *peer = arg; + struct prefix *p, *np; + + LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) { + if (peer != prefix_peer(p)) + continue; + prefix_destroy(p); + break; /* optimization, only one match per peer possible */ + } +} + void peer_up(u_int32_t id, struct session_up *sup) { @@ -3414,8 +3409,10 @@ peer_up(u_int32_t id, struct session_up * There is a race condition when doing PEER_ERR -> PEER_DOWN. * So just do a full reset of the peer here. 
*/ + if (rib_dump_new(RIB_ADJ_OUT, AID_UNSPEC, 0, peer, + peer_adjout_flush_upcall, NULL, NULL) == -1) + fatal("%s: rib_dump_new", __func__); peer_flush(peer, AID_UNSPEC, 0); - up_down(peer); peer->prefix_cnt = 0; peer->state = PEER_DOWN; } @@ -3432,7 +3429,6 @@ peer_up(u_int32_t id, struct session_up } peer->state = PEER_UP; - up_init(peer); if (rde_noevaluate()) /* @@ -3447,20 +3443,6 @@ peer_up(u_int32_t id, struct session_up } } -static void -peer_adjout_flush_upcall(struct rib_entry *re, void *arg) -{ - struct rde_peer *peer = arg; - struct prefix *p, *np; - - LIST_FOREACH_SAFE(p, &re->prefix_h, rib_l, np) { - if (peer != prefix_peer(p)) - continue; - prefix_destroy(p); - break; /* optimization, only one match per peer possible */ - } -} - void peer_down(u_int32_t id) { @@ -3481,8 +3463,6 @@ peer_down(u_int32_t id) peer_adjout_flush_upcall, NULL, NULL) == -1) fatal("%s: rib_dump_new", __func__); - up_down(peer); - peer_flush(peer, AID_UNSPEC, 0); peer->prefix_cnt = 0; @@ -3602,15 +3582,16 @@ peer_dump(u_int32_t id, u_int8_t aid) if (peer->conf.export_type == EXPORT_NONE) { /* nothing to send apart from the marker */ if (peer->capa.grestart.restart) - up_generate_marker(peer, aid); + prefix_add_eor(peer, aid); } else if (peer->conf.export_type == EXPORT_DEFAULT_ROUTE) { up_generate_default(out_rules, peer, aid); if (peer->capa.grestart.restart) - up_generate_marker(peer, aid); + prefix_add_eor(peer, aid); } else { if (rib_dump_new(peer->loc_rib_id, aid, RDE_RUNNER_ROUNDS, peer, rde_up_dump_upcall, rde_up_dump_done, NULL) == -1) fatal("%s: rib_dump_new", __func__); + peer->throttled = 1; /* XXX throttle peer until dump is done */ } } Index: rde.h =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde.h,v retrieving revision 1.207 diff -u -p -r1.207 rde.h --- rde.h 30 Dec 2018 13:53:07 -0000 1.207 +++ rde.h 31 Dec 2018 15:20:03 -0000 @@ -77,11 +77,7 @@ LIST_HEAD(rde_peer_head, rde_peer); LIST_HEAD(aspath_list, 
aspath); LIST_HEAD(attr_list, attr); LIST_HEAD(aspath_head, rde_aspath); -RB_HEAD(uptree_prefix, update_prefix); -RB_HEAD(uptree_attr, update_attr); - -TAILQ_HEAD(uplist_prefix, update_prefix); -TAILQ_HEAD(uplist_attr, update_attr); +RB_HEAD(prefix_tree, prefix); struct rde_peer { LIST_ENTRY(rde_peer) hash_l; /* hash list over all peers */ @@ -90,12 +86,10 @@ struct rde_peer { struct bgpd_addr remote_addr; struct bgpd_addr local_v4_addr; struct bgpd_addr local_v6_addr; - struct uptree_prefix up_prefix; - struct uptree_attr up_attrs; - struct uplist_attr updates[AID_MAX]; - struct uplist_prefix withdraws[AID_MAX]; - time_t staletime[AID_MAX]; struct capabilities capa; + struct prefix_tree updates[AID_MAX]; + struct prefix_tree withdraws[AID_MAX]; + time_t staletime[AID_MAX]; u_int64_t prefix_rcvd_update; u_int64_t prefix_rcvd_withdraw; u_int64_t prefix_rcvd_eor; @@ -104,8 +98,6 @@ struct rde_peer { u_int64_t prefix_sent_eor; u_int32_t prefix_cnt; /* # of prefixes */ u_int32_t remote_bgpid; /* host byte order! 
*/ - u_int32_t up_pcnt; - u_int32_t up_acnt; u_int32_t up_nlricnt; u_int32_t up_wcnt; enum peer_state state; @@ -297,6 +289,7 @@ struct pt_entry_vpn6 { struct prefix { LIST_ENTRY(prefix) rib_l, nexthop_l; + RB_ENTRY(prefix) entry; struct rib_entry *re; struct rde_aspath *aspath; struct rde_peer *peer; @@ -304,6 +297,10 @@ struct prefix { time_t lastchange; u_int8_t validation_state; u_int8_t nhflags; + u_int8_t flags; + u_int8_t eor; +#define PREFIX_FLAG_WITHDRAW 0x01 +#define PREFIX_FLAG_UPDATE 0x02 }; #define NEXTHOP_SELF 0x01 @@ -499,6 +496,11 @@ struct prefix *prefix_get(struct rib *, struct bgpd_addr *, int); int prefix_remove(struct rib *, struct rde_peer *, struct bgpd_addr *, int); +void prefix_add_eor(struct rde_peer *, u_int8_t); +void prefix_update(struct rib *, struct rde_peer *, + struct bgpd_addr *, int); +int prefix_withdraw(struct rib *, struct rde_peer *, + struct bgpd_addr *, int); int prefix_write(u_char *, int, struct bgpd_addr *, u_int8_t, int); int prefix_writebuf(struct ibuf *, struct bgpd_addr *, u_int8_t); struct prefix *prefix_bypeer(struct rib_entry *, struct rde_peer *); @@ -507,6 +509,8 @@ void prefix_updateall(struct prefix *, void prefix_destroy(struct prefix *); void prefix_relink(struct prefix *, struct rde_aspath *, int); +RB_PROTOTYPE(prefix_tree, prefix, entry, prefix_cmp) + static inline struct rde_peer * prefix_peer(struct prefix *p) { @@ -551,25 +555,18 @@ int nexthop_compare(struct nexthop *, /* rde_update.c */ void up_init(struct rde_peer *); -void up_down(struct rde_peer *); int up_rib_remove(struct rde_peer *, struct rib_entry *); void up_rib_add(struct rde_peer *, struct rib_entry *); void up_withdraw_all(struct rde_peer *); -int up_test_update(struct rde_peer *, struct prefix *); -int up_generate(struct rde_peer *, struct filterstate *, - struct bgpd_addr *, u_int8_t); void up_generate_updates(struct filter_head *, struct rde_peer *, struct prefix *, struct prefix *); void up_generate_default(struct filter_head *, struct 
rde_peer *, u_int8_t); -int up_generate_marker(struct rde_peer *, u_int8_t); -int up_dump_prefix(u_char *, int, struct uplist_prefix *, - struct rde_peer *, int); +int up_is_eor(struct rde_peer *, u_int8_t); +int up_dump_withdraws(u_char *, int, struct rde_peer *, u_int8_t); +int up_dump_mp_unreach(u_char *, int, struct rde_peer *, u_int8_t); int up_dump_attrnlri(u_char *, int, struct rde_peer *); -u_char *up_dump_mp_unreach(u_char *, u_int16_t *, struct rde_peer *, - u_int8_t); -int up_dump_mp_reach(u_char *, u_int16_t *, struct rde_peer *, - u_int8_t); +int up_dump_mp_reach(u_char *, int, struct rde_peer *, u_int8_t); /* rde_trie.c */ int trie_add(struct trie_head *, struct bgpd_addr *, u_int8_t, u_int8_t, Index: rde_decide.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_decide.c,v retrieving revision 1.73 diff -u -p -r1.73 rde_decide.c --- rde_decide.c 4 Dec 2018 14:13:40 -0000 1.73 +++ rde_decide.c 17 Dec 2018 14:05:31 -0000 @@ -126,9 +126,9 @@ prefix_cmp(struct prefix *p1, struct pre peer2 = prefix_peer(p2); /* pathes with errors are not eligible */ - if (asp1->flags & F_ATTR_PARSE_ERR) + if (asp1 == NULL || asp1->flags & F_ATTR_PARSE_ERR) return (-1); - if (asp2->flags & F_ATTR_PARSE_ERR) + if (asp2 == NULL || asp2->flags & F_ATTR_PARSE_ERR) return (1); /* only loop free pathes are eligible */ @@ -271,9 +271,10 @@ prefix_evaluate(struct prefix *p, struct xp = LIST_FIRST(&re->prefix_h); if (xp != NULL) { struct rde_aspath *xasp = prefix_aspath(xp); - if (xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) || - (prefix_nexthop(xp) != NULL && - prefix_nexthop(xp)->state != NEXTHOP_REACH)) + if (xasp == NULL || + xasp->flags & (F_ATTR_LOOP|F_ATTR_PARSE_ERR) || + (prefix_nexthop(xp) != NULL && prefix_nexthop(xp)->state != + NEXTHOP_REACH)) /* xp is ineligible */ xp = NULL; } Index: rde_rib.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_rib.c,v retrieving 
revision 1.188 diff -u -p -r1.188 rde_rib.c --- rde_rib.c 30 Dec 2018 13:53:07 -0000 1.188 +++ rde_rib.c 3 Jan 2019 09:36:02 -0000 @@ -101,6 +101,13 @@ rib_tree(struct rib *rib) static inline int rib_compare(const struct rib_entry *a, const struct rib_entry *b) { + /* need to handle NULL entries because of EoR marker */ + if (a == NULL && b == NULL) + return (0); + else if (b == NULL) + return (1); + else if (a == NULL) + return (-1); return (pt_prefix_cmp(a->prefix, b->prefix)); } @@ -195,7 +202,7 @@ rib_free(struct rib *rib) while ((p = LIST_FIRST(&re->prefix_h))) { struct rde_aspath *asp = prefix_aspath(p); np = LIST_NEXT(p, rib_l); - if (asp->pftableid) { + if (asp && asp->pftableid) { struct bgpd_addr addr; pt_getaddr(p->re->prefix, &addr); @@ -494,7 +501,12 @@ SIPHASH_KEY pathtablekey; #define PATH_HASH(x) &pathtable.path_hashtbl[x & pathtable.path_hashmask] -#define path_empty(asp) ((asp)->refcnt <= 0) + +static inline int +path_empty(struct rde_aspath *asp) +{ + return (asp == NULL || asp->refcnt <= 0); +} static inline void path_ref(struct rde_aspath *asp) @@ -598,6 +610,15 @@ path_update(struct rib *rib, struct rde_ p->validation_state = vstate; return (2); } + if (p->flags) { + struct prefix_tree *prefix_head; + /* prefix is a pending update */ + prefix_head = p->flags & PREFIX_FLAG_UPDATE ? 
+ &peer->updates[prefix->aid] : + &peer->withdraws[prefix->aid]; + RB_REMOVE(prefix_tree, prefix_head, p); + p->flags = 0; + } } /* @@ -731,6 +752,9 @@ path_link(struct rde_aspath *asp) static void path_unlink(struct rde_aspath *asp) { + if (asp == NULL) + return; + /* make sure no reference is hold for this rde_aspath */ if (!path_empty(asp)) fatalx("%s: still has prefixes", __func__); @@ -822,12 +846,29 @@ path_put(struct rde_aspath *asp) /* prefix specific functions */ -static struct prefix *prefix_alloc(void); -static void prefix_free(struct prefix *); static void prefix_link(struct prefix *, struct rib_entry *, struct rde_peer *, struct rde_aspath *, struct filterstate *, u_int8_t); static void prefix_unlink(struct prefix *); +static struct prefix *prefix_alloc(void); +static void prefix_free(struct prefix *); + +/* RB tree comparison function */ +static inline int +prefix_cmp(struct prefix *a, struct prefix *b) +{ + if (a->eor != b->eor) + return a->eor - b->eor; + if (a->aspath != b->aspath) + return (a->aspath > b->aspath ? 1 : -1); + if (a->nexthop != b->nexthop) + return (a->nexthop > b->nexthop ? 1 : -1); + if (a->nhflags != b->nhflags) + return (a->nhflags > b->nhflags ? 1 : -1); + return rib_compare(a->re, b->re); +} + +RB_GENERATE(prefix_tree, prefix, entry, prefix_cmp) /* * search for specified prefix of a peer. Returns NULL if not found. 
@@ -920,9 +961,10 @@ prefix_move(struct prefix *p, struct rde prefix_evaluate(np, np->re); /* remove old prefix node */ - oasp = prefix_aspath(p); - path_unref(oasp); /* as before peer count needs no update because of move */ + oasp = p->aspath; + if (oasp) + path_unref(oasp); /* destroy all references to other objects and free the old prefix */ p->aspath = NULL; @@ -949,19 +991,14 @@ prefix_remove(struct rib *rib, struct rd int prefixlen) { struct prefix *p; - struct rib_entry *re; struct rde_aspath *asp; - re = rib_get(rib, prefix, prefixlen); - if (re == NULL) /* Got a dummy withdrawn request */ - return (0); - - p = prefix_bypeer(re, peer); + p = prefix_get(rib, peer, prefix, prefixlen); if (p == NULL) /* Got a dummy withdrawn request. */ return (0); asp = prefix_aspath(p); - if (asp->pftableid) { + if (asp && asp->pftableid) { /* only prefixes in the local RIB were pushed into pf */ rde_send_pftable(asp->pftableid, prefix, prefixlen, 1); rde_send_pftable_commit(); @@ -972,6 +1009,89 @@ prefix_remove(struct rib *rib, struct rd return (1); } +/* + * Insert an End-of-RIB marker into the update queue. + */ +void +prefix_add_eor(struct rde_peer *peer, u_int8_t aid) +{ + struct prefix *p; + + p = prefix_alloc(); + p->eor = 1; + p->flags = PREFIX_FLAG_UPDATE; + if (RB_INSERT(prefix_tree, &peer->updates[aid], p) != NULL) + /* no need to add if EoR marker already present */ + prefix_free(p); +} + +/* + * Put a prefix from the Adj-RIB-Out onto the update queue. + */ +void +prefix_update(struct rib *rib, struct rde_peer *peer, + struct bgpd_addr *prefix, int prefixlen) +{ + struct prefix *p; + + p = prefix_get(rib, peer, prefix, prefixlen); + if (p == NULL) /* Got a dummy withdrawn request. 
*/ + return; + + if (p->flags != 0) + fatalx("%s: bad flags %x", __func__, p->flags); + p->flags = PREFIX_FLAG_UPDATE; + if (RB_INSERT(prefix_tree, &peer->updates[prefix->aid], p) != NULL) + fatalx("%s: RB tree invariant violated", __func__); +} + +/* + * Withdraw a prefix from the Adj-RIB-Out, this unlinks the aspath but leaves + * the prefix in the RIB linked to the peer withdraw list. + */ +int +prefix_withdraw(struct rib *rib, struct rde_peer *peer, + struct bgpd_addr *prefix, int prefixlen) +{ + struct prefix *p; + struct rde_aspath *asp; + + p = prefix_get(rib, peer, prefix, prefixlen); + if (p == NULL) /* Got a dummy withdrawn request. */ + return (0); + + /* unlink from aspath ...*/ + asp = p->aspath; + if (asp != NULL) { + path_unref(asp); + p->aspath = NULL; + if (path_empty(asp)) + path_unlink(asp); + } + + /* ... and nexthop but keep the re link */ + nexthop_unlink(p); + nexthop_put(p->nexthop); + p->nexthop = NULL; + p->nhflags = 0; + /* re link still exists */ + + if (p->flags) { + struct prefix_tree *prefix_head; + /* p is a pending update or withdraw, remove first */ + prefix_head = p->flags & PREFIX_FLAG_UPDATE ? + &peer->updates[prefix->aid] : + &peer->withdraws[prefix->aid]; + RB_REMOVE(prefix_tree, prefix_head, p); + p->flags = 0; + } + p->flags = PREFIX_FLAG_WITHDRAW; + if (RB_INSERT(prefix_tree, &peer->withdraws[prefix->aid], p) != NULL) + fatalx("%s: RB tree invariant violated", __func__); + return (1); +} + + /* dump a prefix into specified buffer */ int prefix_write(u_char *buf, int len, struct bgpd_addr *prefix, u_int8_t plen, @@ -1139,7 +1259,6 @@ prefix_destroy(struct prefix *p) struct rde_aspath *asp; asp = prefix_aspath(p); - prefix_unlink(p); prefix_free(p); @@ -1151,48 +1270,52 @@ prefix_destroy(struct prefix *p) * Link a prefix into the different parent objects. 
*/ static void -prefix_link(struct prefix *pref, struct rib_entry *re, struct rde_peer *peer, +prefix_link(struct prefix *p, struct rib_entry *re, struct rde_peer *peer, struct rde_aspath *asp, struct filterstate *state, u_int8_t vstate) { path_ref(asp); - pref->aspath = asp; - pref->peer = peer; - pref->nexthop = nexthop_ref(state->nexthop); - nexthop_link(pref); - pref->re = re; - pref->lastchange = time(NULL); - pref->nhflags = state->nhflags; - pref->validation_state = vstate; + p->aspath = asp; + p->peer = peer; + p->nexthop = nexthop_ref(state->nexthop); + nexthop_link(p); + p->re = re; + p->lastchange = time(NULL); + p->nhflags = state->nhflags; + p->validation_state = vstate; /* make route decision */ - prefix_evaluate(pref, re); + prefix_evaluate(p, re); } /* * Unlink a prefix from the different parent objects. */ static void -prefix_unlink(struct prefix *pref) +prefix_unlink(struct prefix *p) { - struct rib_entry *re = pref->re; + struct rib_entry *re = p->re; + + if (p->eor) /* nothing to unlink for EoR markers */ + return; /* make route decision */ - LIST_REMOVE(pref, rib_l); + LIST_REMOVE(p, rib_l); prefix_evaluate(NULL, re); - path_unref(prefix_aspath(pref)); + if (p->aspath) + path_unref(p->aspath); if (rib_empty(re)) rib_remove(re); /* destroy all references to other objects */ - nexthop_unlink(pref); - nexthop_put(pref->nexthop); - pref->nexthop = NULL; - pref->aspath = NULL; - pref->peer = NULL; - pref->re = NULL; + nexthop_unlink(p); + nexthop_put(p->nexthop); + p->nexthop = NULL; + p->aspath = NULL; + p->peer = NULL; + p->re = NULL; /* * It's the caller's duty to do accounting and remove empty aspath @@ -1215,10 +1338,10 @@ prefix_alloc(void) /* free a unlinked entry */ static void -prefix_free(struct prefix *pref) +prefix_free(struct prefix *p) { rdemem.prefix_cnt--; - free(pref); + free(p); } /* Index: rde_update.c =================================================================== RCS file: /cvs/src/usr.sbin/bgpd/rde_update.c,v retrieving 
revision 1.107 diff -u -p -r1.107 rde_update.c --- rde_update.c 30 Dec 2018 13:53:07 -0000 1.107 +++ rde_update.c 3 Jan 2019 09:36:10 -0000 @@ -17,6 +17,7 @@ */ #include <sys/types.h> #include <sys/queue.h> +#include <sys/tree.h> #include <limits.h> #include <stdlib.h> @@ -27,45 +28,6 @@ #include "rde.h" #include "log.h" -in_addr_t up_get_nexthop(struct rde_peer *, struct filterstate *); -int up_generate_mp_reach(struct rde_peer *, struct update_attr *, - struct filterstate *, u_int8_t); -int up_generate_attr(struct rde_peer *, struct update_attr *, - struct filterstate *, u_int8_t); - -/* update stuff. */ -struct update_prefix { - TAILQ_ENTRY(update_prefix) prefix_l; - RB_ENTRY(update_prefix) entry; - struct uplist_prefix *prefix_h; - struct bgpd_addr prefix; - int prefixlen; -}; - -struct update_attr { - TAILQ_ENTRY(update_attr) attr_l; - RB_ENTRY(update_attr) entry; - struct uplist_prefix prefix_h; - u_char *attr; - u_char *mpattr; - u_int32_t attr_hash; - u_int16_t attr_len; - u_int16_t mpattr_len; -}; - -void up_clear(struct uplist_attr *, struct uplist_prefix *); -int up_prefix_cmp(struct update_prefix *, struct update_prefix *); -int up_attr_cmp(struct update_attr *, struct update_attr *); -int up_add(struct rde_peer *, struct update_prefix *, struct update_attr *); - -RB_PROTOTYPE(uptree_prefix, update_prefix, entry, up_prefix_cmp) -RB_GENERATE(uptree_prefix, update_prefix, entry, up_prefix_cmp) - -RB_PROTOTYPE(uptree_attr, update_attr, entry, up_attr_cmp) -RB_GENERATE(uptree_attr, update_attr, entry, up_attr_cmp) - -SIPHASH_KEY uptree_key; - static struct filter_community comm_no_advertise = { .type = COMMUNITY_TYPE_BASIC, .c.b.data1 = COMMUNITY_WELLKNOWN, @@ -82,233 +44,7 @@ static struct filter_community comm_no_e .c.b.data2 = COMMUNITY_NO_EXPSUBCONFED }; -void -up_init(struct rde_peer *peer) -{ - u_int8_t i; - - for (i = 0; i < AID_MAX; i++) { - TAILQ_INIT(&peer->updates[i]); - TAILQ_INIT(&peer->withdraws[i]); - } - RB_INIT(&peer->up_prefix); - 
RB_INIT(&peer->up_attrs); - peer->up_pcnt = 0; - peer->up_acnt = 0; - peer->up_nlricnt = 0; - peer->up_wcnt = 0; - arc4random_buf(&uptree_key, sizeof(uptree_key)); -} - -void -up_clear(struct uplist_attr *updates, struct uplist_prefix *withdraws) -{ - struct update_attr *ua; - struct update_prefix *up; - - while ((ua = TAILQ_FIRST(updates)) != NULL) { - TAILQ_REMOVE(updates, ua, attr_l); - while ((up = TAILQ_FIRST(&ua->prefix_h)) != NULL) { - TAILQ_REMOVE(&ua->prefix_h, up, prefix_l); - free(up); - } - free(ua->attr); - free(ua->mpattr); - free(ua); - } - - while ((up = TAILQ_FIRST(withdraws)) != NULL) { - TAILQ_REMOVE(withdraws, up, prefix_l); - free(up); - } -} - -void -up_down(struct rde_peer *peer) -{ - u_int8_t i; - - for (i = 0; i < AID_MAX; i++) - up_clear(&peer->updates[i], &peer->withdraws[i]); - - RB_INIT(&peer->up_prefix); - RB_INIT(&peer->up_attrs); - - peer->up_pcnt = 0; - peer->up_acnt = 0; - peer->up_nlricnt = 0; - peer->up_wcnt = 0; -} - -int -up_prefix_cmp(struct update_prefix *a, struct update_prefix *b) -{ - int i; - - if (a->prefix.aid < b->prefix.aid) - return (-1); - if (a->prefix.aid > b->prefix.aid) - return (1); - - switch (a->prefix.aid) { - case AID_INET: - if (ntohl(a->prefix.v4.s_addr) < ntohl(b->prefix.v4.s_addr)) - return (-1); - if (ntohl(a->prefix.v4.s_addr) > ntohl(b->prefix.v4.s_addr)) - return (1); - break; - case AID_INET6: - i = memcmp(&a->prefix.v6, &b->prefix.v6, - sizeof(struct in6_addr)); - if (i > 0) - return (1); - if (i < 0) - return (-1); - break; - case AID_VPN_IPv4: - if (betoh64(a->prefix.vpn4.rd) < betoh64(b->prefix.vpn4.rd)) - return (-1); - if (betoh64(a->prefix.vpn4.rd) > betoh64(b->prefix.vpn4.rd)) - return (1); - if (ntohl(a->prefix.v4.s_addr) < ntohl(b->prefix.v4.s_addr)) - return (-1); - if (ntohl(a->prefix.v4.s_addr) > ntohl(b->prefix.v4.s_addr)) - return (1); - if (a->prefixlen < b->prefixlen) - return (-1); - if (a->prefixlen > b->prefixlen) - return (1); - if (a->prefix.vpn4.labellen < 
b->prefix.vpn4.labellen) - return (-1); - if (a->prefix.vpn4.labellen > b->prefix.vpn4.labellen) - return (1); - return (memcmp(a->prefix.vpn4.labelstack, - b->prefix.vpn4.labelstack, a->prefix.vpn4.labellen)); - case AID_VPN_IPv6: - if (betoh64(a->prefix.vpn6.rd) < betoh64(b->prefix.vpn6.rd)) - return (-1); - if (betoh64(a->prefix.vpn6.rd) > betoh64(b->prefix.vpn6.rd)) - return (1); - i = memcmp(&a->prefix.vpn6.addr, &b->prefix.vpn6.addr, - sizeof(struct in6_addr)); - if (i > 0) - return (1); - if (i < 0) - return (-1); - if (a->prefixlen < b->prefixlen) - return (-1); - if (a->prefixlen > b->prefixlen) - return (1); - if (a->prefix.vpn6.labellen < b->prefix.vpn6.labellen) - return (-1); - if (a->prefix.vpn6.labellen > b->prefix.vpn6.labellen) - return (1); - return (memcmp(a->prefix.vpn6.labelstack, - b->prefix.vpn6.labelstack, a->prefix.vpn6.labellen)); - default: - fatalx("up_prefix_cmp: unknown af"); - } - if (a->prefixlen < b->prefixlen) - return (-1); - if (a->prefixlen > b->prefixlen) - return (1); - return (0); -} - -int -up_attr_cmp(struct update_attr *a, struct update_attr *b) -{ - int r; - - if ((r = a->attr_hash - b->attr_hash) != 0) - return (r); - if ((r = a->attr_len - b->attr_len) != 0) - return (r); - if ((r = a->mpattr_len - b->mpattr_len) != 0) - return (r); - if ((r = memcmp(a->mpattr, b->mpattr, a->mpattr_len)) != 0) - return (r); - return (memcmp(a->attr, b->attr, a->attr_len)); -} - -int -up_add(struct rde_peer *peer, struct update_prefix *p, struct update_attr *a) -{ - struct update_attr *na = NULL; - struct update_prefix *np; - struct uplist_attr *upl = NULL; - struct uplist_prefix *wdl = NULL; - - upl = &peer->updates[p->prefix.aid]; - wdl = &peer->withdraws[p->prefix.aid]; - - /* 1. 
search for attr */ - if (a != NULL && (na = RB_FIND(uptree_attr, &peer->up_attrs, a)) == - NULL) { - /* 1.1 if not found -> add */ - TAILQ_INIT(&a->prefix_h); - if (RB_INSERT(uptree_attr, &peer->up_attrs, a) != NULL) { - log_warnx("uptree_attr insert failed"); - /* cleanup */ - free(a->attr); - free(a->mpattr); - free(a); - free(p); - return (-1); - } - TAILQ_INSERT_TAIL(upl, a, attr_l); - peer->up_acnt++; - } else { - /* 1.2 if found -> use that, free a */ - if (a != NULL) { - free(a->attr); - free(a->mpattr); - free(a); - a = na; - /* move to end of update queue */ - TAILQ_REMOVE(upl, a, attr_l); - TAILQ_INSERT_TAIL(upl, a, attr_l); - } - } - - /* 2. search for prefix */ - if ((np = RB_FIND(uptree_prefix, &peer->up_prefix, p)) == NULL) { - /* 2.1 if not found -> add */ - if (RB_INSERT(uptree_prefix, &peer->up_prefix, p) != NULL) { - log_warnx("uptree_prefix insert failed"); - /* - * cleanup. But do not free a because it is already - * linked or NULL. up_dump_attrnlri() will remove and - * free the empty attribute later. - */ - free(p); - return (-1); - } - peer->up_pcnt++; - } else { - /* 2.2 if found -> use that and free p */ - TAILQ_REMOVE(np->prefix_h, np, prefix_l); - free(p); - p = np; - if (p->prefix_h == wdl) - peer->up_wcnt--; - else - peer->up_nlricnt--; - } - /* 3. 
link prefix to attr */ - if (a == NULL) { - TAILQ_INSERT_TAIL(wdl, p, prefix_l); - p->prefix_h = wdl; - peer->up_wcnt++; - } else { - TAILQ_INSERT_TAIL(&a->prefix_h, p, prefix_l); - p->prefix_h = &a->prefix_h; - peer->up_nlricnt++; - } - return (0); -} - -int +static int up_test_update(struct rde_peer *peer, struct prefix *p) { struct bgpd_addr addr; @@ -316,9 +52,6 @@ up_test_update(struct rde_peer *peer, st struct rde_peer *prefp; struct attr *attr; - if (peer->state != PEER_UP) - return (-1); - if (p == NULL) /* no prefix available */ return (0); @@ -330,7 +63,7 @@ up_test_update(struct rde_peer *peer, st /* Do not send routes back to sender */ return (0); - if (asp->flags & F_ATTR_PARSE_ERR) + if (asp == NULL || asp->flags & F_ATTR_PARSE_ERR) fatalx("try to send out a botched path"); if (asp->flags & F_ATTR_LOOP) fatalx("try to send out a looped path"); @@ -390,47 +123,6 @@ up_test_update(struct rde_peer *peer, st return (1); } -int -up_generate(struct rde_peer *peer, struct filterstate *state, - struct bgpd_addr *addr, u_int8_t prefixlen) -{ - struct update_attr *ua = NULL; - struct update_prefix *up; - SIPHASH_CTX ctx; - - if (state) { - ua = calloc(1, sizeof(struct update_attr)); - if (ua == NULL) - fatal("up_generate"); - - if (up_generate_attr(peer, ua, state, addr->aid) == -1) { - log_warnx("generation of bgp path attributes failed"); - free(ua); - return (-1); - } - /* - * use aspath_hash as attr_hash, this may be unoptimal - * but currently I don't care. 
- */ - SipHash24_Init(&ctx, &uptree_key); - SipHash24_Update(&ctx, ua->attr, ua->attr_len); - if (ua->mpattr) - SipHash24_Update(&ctx, ua->mpattr, ua->mpattr_len); - ua->attr_hash = SipHash24_End(&ctx); - } - - up = calloc(1, sizeof(struct update_prefix)); - if (up == NULL) - fatal("up_generate"); - up->prefix = *addr; - up->prefixlen = prefixlen; - - if (up_add(peer, up, ua) == -1) - return (-1); - - return (0); -} - void up_generate_updates(struct filter_head *rules, struct rde_peer *peer, struct prefix *new, struct prefix *old) @@ -444,16 +136,14 @@ up_generate_updates(struct filter_head * if (new == NULL) { withdraw: if (old == NULL) + /* no prefix to withdraw */ return; /* withdraw prefix */ pt_getaddr(old->re->prefix, &addr); - if (prefix_remove(&ribs[RIB_ADJ_OUT].rib, peer, &addr, - old->re->prefix->prefixlen) == 0) { - /* not in table, no need to send withdraw */ - return; - } - up_generate(peer, NULL, &addr, old->re->prefix->prefixlen); + if (prefix_withdraw(&ribs[RIB_ADJ_OUT].rib, peer, &addr, + old->re->prefix->prefixlen) == 1) + peer->up_wcnt++; } else { switch (up_test_update(peer, new)) { case 1: @@ -475,8 +165,9 @@ withdraw: if (path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr, new->re->prefix->prefixlen, prefix_vstate(new)) != 2) { /* only send update if path changed */ - up_generate(peer, &state, &addr, + prefix_update(&ribs[RIB_ADJ_OUT].rib, peer, &addr, new->re->prefix->prefixlen); + peer->up_nlricnt++; } rde_filterstate_clean(&state); @@ -501,7 +192,8 @@ up_generate_default(struct filter_head * if (peer->capa.mp[aid] == 0) return; - asp = path_get(); + rde_filterstate_prep(&state, NULL, NULL, 0); + asp = &state.aspath; asp->aspath = aspath_get(NULL, 0); asp->origin = ORIGIN_IGP; /* the other default values are OK, nexthop is once again NULL */ @@ -513,8 +205,8 @@ up_generate_default(struct filter_head * /* rde_apply_set(asp, set, af, NULL ???, DIR_IN); */ /* - * XXX this is ugly but it will get better once we have a proper - * 
Adj-RIB-Out. Since then this will be just inserted there. + * XXX this is ugly because we need to have a prefix for rde_filter() + * but it will be added after filtering. So fake it till we make it. */ bzero(&p, sizeof(p)); bzero(&addr, sizeof(addr)); @@ -524,66 +216,26 @@ up_generate_default(struct filter_head * re = rib_add(rib_byid(peer->loc_rib_id), &addr, 0); p.re = re; p.aspath = asp; - p.peer = peer; + p.peer = peer; /* XXX should be peerself */ /* filter as usual */ - rde_filterstate_prep(&state, asp, NULL, 0); if (rde_filter(rules, peer, &p, &state) == ACTION_DENY) { rde_filterstate_clean(&state); return; } - up_generate(peer, &state, &addr, 0); + path_update(&ribs[RIB_ADJ_OUT].rib, peer, &state, &addr, 0, + ROA_NOTFOUND); /* no longer needed */ rde_filterstate_clean(&state); - path_put(asp); if (rib_empty(re)) rib_remove(re); } -/* generate a EoR marker in the update list. This is a horrible hack. */ -int -up_generate_marker(struct rde_peer *peer, u_int8_t aid) -{ - struct update_attr *ua; - struct update_attr *na = NULL; - struct uplist_attr *upl = NULL; - - ua = calloc(1, sizeof(struct update_attr)); - if (ua == NULL) - fatal("up_generate_marker"); - - upl = &peer->updates[aid]; - - /* 1. 
search for attr */ - if ((na = RB_FIND(uptree_attr, &peer->up_attrs, ua)) == NULL) { - /* 1.1 if not found -> add */ - TAILQ_INIT(&ua->prefix_h); - if (RB_INSERT(uptree_attr, &peer->up_attrs, ua) != NULL) { - log_warnx("uptree_attr insert failed"); - /* cleanup */ - free(ua); - return (-1); - } - TAILQ_INSERT_TAIL(upl, ua, attr_l); - peer->up_acnt++; - } else { - /* 1.2 if found -> use that, free ua */ - free(ua); - ua = na; - /* move to end of update queue */ - TAILQ_REMOVE(upl, ua, attr_l); - TAILQ_INSERT_TAIL(upl, ua, attr_l); - } - return (0); -} - -u_char up_attr_buf[4096]; - /* only for IPv4 */ -in_addr_t +static in_addr_t up_get_nexthop(struct rde_peer *peer, struct filterstate *state) { in_addr_t mask; @@ -638,205 +290,8 @@ up_get_nexthop(struct rde_peer *peer, st return (peer->local_v4_addr.v4.s_addr); } -int -up_generate_mp_reach(struct rde_peer *peer, struct update_attr *upa, - struct filterstate *state, u_int8_t aid) -{ - u_int16_t tmp; - - switch (aid) { - case AID_INET6: - upa->mpattr_len = 21; /* AFI + SAFI + NH LEN + NH + Reserved */ - upa->mpattr = malloc(upa->mpattr_len); - if (upa->mpattr == NULL) - fatal("up_generate_mp_reach"); - if (aid2afi(aid, &tmp, &upa->mpattr[2])) - fatalx("up_generate_mp_reachi: bad AID"); - tmp = htons(tmp); - memcpy(upa->mpattr, &tmp, sizeof(tmp)); - upa->mpattr[3] = sizeof(struct in6_addr); - upa->mpattr[20] = 0; /* Reserved must be 0 */ - - /* nexthop dance see also up_get_nexthop() */ - if (state->nhflags & NEXTHOP_NOMODIFY) { - /* no modify flag set */ - if (state->nexthop == NULL) - memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - else - memcpy(&upa->mpattr[4], - &state->nexthop->exit_nexthop.v6, - sizeof(struct in6_addr)); - } else if (state->nhflags & NEXTHOP_SELF) - memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - else if (!peer->conf.ebgp) { - /* ibgp */ - if (state->nexthop == NULL || - (state->nexthop->exit_nexthop.aid == AID_INET6 && - 
!memcmp(&state->nexthop->exit_nexthop.v6, - &peer->remote_addr.v6, sizeof(struct in6_addr)))) - memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - else - memcpy(&upa->mpattr[4], - &state->nexthop->exit_nexthop.v6, - sizeof(struct in6_addr)); - } else if (peer->conf.distance == 1) { - /* ebgp directly connected */ - if (state->nexthop != NULL && - state->nexthop->flags & NEXTHOP_CONNECTED) - if (prefix_compare(&peer->remote_addr, - &state->nexthop->nexthop_net, - state->nexthop->nexthop_netlen) == 0) { - /* - * nexthop and peer are in the same - * subnet - */ - memcpy(&upa->mpattr[4], - &state->nexthop->exit_nexthop.v6, - sizeof(struct in6_addr)); - return (0); - } - memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - } else - /* ebgp multihop */ - memcpy(&upa->mpattr[4], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - return (0); - case AID_VPN_IPv4: - upa->mpattr_len = 17; /* AFI + SAFI + NH LEN + NH + Reserved */ - upa->mpattr = calloc(upa->mpattr_len, 1); - if (upa->mpattr == NULL) - fatal("up_generate_mp_reach"); - if (aid2afi(aid, &tmp, &upa->mpattr[2])) - fatalx("up_generate_mp_reachi: bad AID"); - tmp = htons(tmp); - memcpy(upa->mpattr, &tmp, sizeof(tmp)); - upa->mpattr[3] = sizeof(u_int64_t) + sizeof(struct in_addr); - - /* nexthop dance see also up_get_nexthop() */ - if (state->nhflags & NEXTHOP_NOMODIFY) { - /* no modify flag set */ - if (state->nexthop == NULL) - memcpy(&upa->mpattr[12], - &peer->local_v4_addr.v4, - sizeof(struct in_addr)); - else - /* nexthops are stored as IPv4 addrs */ - memcpy(&upa->mpattr[12], - &state->nexthop->exit_nexthop.v4, - sizeof(struct in_addr)); - } else if (state->nhflags & NEXTHOP_SELF) - memcpy(&upa->mpattr[12], &peer->local_v4_addr.v4, - sizeof(struct in_addr)); - else if (!peer->conf.ebgp) { - /* ibgp */ - if (state->nexthop == NULL || - (state->nexthop->exit_nexthop.aid == AID_INET && - !memcmp(&state->nexthop->exit_nexthop.v4, - &peer->remote_addr.v4, 
sizeof(struct in_addr)))) - memcpy(&upa->mpattr[12], - &peer->local_v4_addr.v4, - sizeof(struct in_addr)); - else - memcpy(&upa->mpattr[12], - &state->nexthop->exit_nexthop.v4, - sizeof(struct in_addr)); - } else if (peer->conf.distance == 1) { - /* ebgp directly connected */ - if (state->nexthop != NULL && - state->nexthop->flags & NEXTHOP_CONNECTED) - if (prefix_compare(&peer->remote_addr, - &state->nexthop->nexthop_net, - state->nexthop->nexthop_netlen) == 0) { - /* - * nexthop and peer are in the same - * subnet - */ - memcpy(&upa->mpattr[12], - &state->nexthop->exit_nexthop.v4, - sizeof(struct in_addr)); - return (0); - } - memcpy(&upa->mpattr[12], &peer->local_v4_addr.v4, - sizeof(struct in_addr)); - } else - /* ebgp multihop */ - memcpy(&upa->mpattr[12], &peer->local_v4_addr.v4, - sizeof(struct in_addr)); - return (0); - case AID_VPN_IPv6: - upa->mpattr_len = 29; /* AFI + SAFI + NH LEN + NH + Reserved */ - upa->mpattr = calloc(upa->mpattr_len, 1); - if (upa->mpattr == NULL) - fatal("up_generate_mp_reach"); - if (aid2afi(aid, &tmp, &upa->mpattr[2])) - fatalx("up_generate_mp_reachi: bad AID"); - tmp = htons(tmp); - memcpy(upa->mpattr, &tmp, sizeof(tmp)); - upa->mpattr[3] = sizeof(u_int64_t) + sizeof(struct in6_addr); - upa->mpattr[28] = 0; /* Reserved must be 0 */ - - /* nexthop dance see also up_get_nexthop() */ - if (state->nhflags & NEXTHOP_NOMODIFY) { - /* no modify flag set */ - if (state->nexthop == NULL) - memcpy(&upa->mpattr[12], - &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - else - memcpy(&upa->mpattr[12], - &state->nexthop->exit_nexthop.v6, - sizeof(struct in6_addr)); - } else if (state->nhflags & NEXTHOP_SELF) - memcpy(&upa->mpattr[12], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - else if (!peer->conf.ebgp) { - /* ibgp */ - if (state->nexthop == NULL || - (state->nexthop->exit_nexthop.aid == AID_INET6 && - !memcmp(&state->nexthop->exit_nexthop.v6, - &peer->remote_addr.v6, sizeof(struct in6_addr)))) - memcpy(&upa->mpattr[12], - 
&peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - else - memcpy(&upa->mpattr[12], - &state->nexthop->exit_nexthop.v6, - sizeof(struct in6_addr)); - } else if (peer->conf.distance == 1) { - /* ebgp directly connected */ - if (state->nexthop != NULL && - state->nexthop->flags & NEXTHOP_CONNECTED) - if (prefix_compare(&peer->remote_addr, - &state->nexthop->nexthop_net, - state->nexthop->nexthop_netlen) == 0) { - /* - * nexthop and peer are in the same - * subnet - */ - memcpy(&upa->mpattr[12], - &state->nexthop->exit_nexthop.v6, - sizeof(struct in6_addr)); - return (0); - } - memcpy(&upa->mpattr[12], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - } else - /* ebgp multihop */ - memcpy(&upa->mpattr[12], &peer->local_v6_addr.v6, - sizeof(struct in6_addr)); - return (0); - default: - break; - } - return (-1); -} - -int -up_generate_attr(struct rde_peer *peer, struct update_attr *upa, +static int +up_generate_attr(u_char *buf, int len, struct rde_peer *peer, struct filterstate *state, u_int8_t aid) { struct rde_aspath *asp = &state->aspath; @@ -844,14 +299,14 @@ up_generate_attr(struct rde_peer *peer, u_char *pdata; u_int32_t tmp32; in_addr_t nexthop; - int flags, r, ismp = 0, neednewpath = 0; - u_int16_t len = sizeof(up_attr_buf), wlen = 0, plen; + int flags, r, neednewpath = 0; + u_int16_t wlen = 0, plen; u_int8_t l; u_int16_t nlen = 0; u_char *ndata; /* origin */ - if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN, + if ((r = attr_write(buf + wlen, len, ATTR_WELL_KNOWN, ATTR_ORIGIN, &asp->origin, 1)) == -1) return (-1); wlen += r; len -= r; @@ -868,7 +323,7 @@ up_generate_attr(struct rde_peer *peer, if (!rde_as4byte(peer)) pdata = aspath_deflate(pdata, &plen, &neednewpath); - if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN, + if ((r = attr_write(buf + wlen, len, ATTR_WELL_KNOWN, ATTR_ASPATH, pdata, plen)) == -1) return (-1); wlen += r; len -= r; @@ -877,13 +332,12 @@ up_generate_attr(struct rde_peer *peer, switch (aid) { case 
AID_INET: nexthop = up_get_nexthop(peer, state); - if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN, + if ((r = attr_write(buf + wlen, len, ATTR_WELL_KNOWN, ATTR_NEXTHOP, &nexthop, 4)) == -1) return (-1); wlen += r; len -= r; break; default: - ismp = 1; break; } @@ -896,7 +350,7 @@ up_generate_attr(struct rde_peer *peer, asp->flags & F_ATTR_MED_ANNOUNCE || peer->conf.flags & PEERFLAG_TRANS_AS)) { tmp32 = htonl(asp->med); - if ((r = attr_write(up_attr_buf + wlen, len, ATTR_OPTIONAL, + if ((r = attr_write(buf + wlen, len, ATTR_OPTIONAL, ATTR_MED, &tmp32, 4)) == -1) return (-1); wlen += r; len -= r; @@ -905,7 +359,7 @@ up_generate_attr(struct rde_peer *peer, if (!peer->conf.ebgp) { /* local preference, only valid for ibgp */ tmp32 = htonl(asp->lpref); - if ((r = attr_write(up_attr_buf + wlen, len, ATTR_WELL_KNOWN, + if ((r = attr_write(buf + wlen, len, ATTR_WELL_KNOWN, ATTR_LOCALPREF, &tmp32, 4)) == -1) return (-1); wlen += r; len -= r; @@ -925,7 +379,7 @@ up_generate_attr(struct rde_peer *peer, break; switch (oa->type) { case ATTR_ATOMIC_AGGREGATE: - if ((r = attr_write(up_attr_buf + wlen, len, + if ((r = attr_write(buf + wlen, len, ATTR_WELL_KNOWN, ATTR_ATOMIC_AGGREGATE, NULL, 0)) == -1) return (-1); @@ -953,7 +407,7 @@ up_generate_attr(struct rde_peer *peer, memcpy(t + sizeof(tas), oa->data + sizeof(tmp32), oa->len - sizeof(tmp32)); - if ((r = attr_write(up_attr_buf + wlen, len, + if ((r = attr_write(buf + wlen, len, oa->flags, oa->type, &t, sizeof(t))) == -1) return (-1); break; @@ -968,7 +422,7 @@ up_generate_attr(struct rde_peer *peer, r = 0; break; } - if ((r = attr_write(up_attr_buf + wlen, len, + if ((r = attr_write(buf + wlen, len, oa->flags, oa->type, oa->data, oa->len)) == -1) return (-1); break; @@ -979,8 +433,8 @@ up_generate_attr(struct rde_peer *peer, oa->len, &nlen); if (nlen > 0) { - if ((r = attr_write(up_attr_buf + wlen, - len, oa->flags, oa->type, ndata, + if ((r = attr_write(buf + wlen, len, + oa->flags, oa->type, ndata, nlen)) == -1) 
{ free(ndata); return (-1); @@ -991,9 +445,8 @@ up_generate_attr(struct rde_peer *peer, r = 0; } } else { - if ((r = attr_write(up_attr_buf + wlen, - len, oa->flags, oa->type, oa->data, - oa->len)) == -1) + if ((r = attr_write(buf + wlen, len, oa->flags, + oa->type, oa->data, oa->len)) == -1) return (-1); } break; @@ -1009,7 +462,7 @@ up_generate_attr(struct rde_peer *peer, r = 0; break; } - if ((r = attr_write(up_attr_buf + wlen, len, + if ((r = attr_write(buf + wlen, len, oa->flags | ATTR_PARTIAL, oa->type, oa->data, oa->len)) == -1) return (-1); @@ -1032,7 +485,7 @@ up_generate_attr(struct rde_peer *peer, flags |= ATTR_PARTIAL; if (plen == 0) r = 0; - else if ((r = attr_write(up_attr_buf + wlen, len, flags, + else if ((r = attr_write(buf + wlen, len, flags, ATTR_AS4_PATH, pdata, plen)) == -1) return (-1); wlen += r; len -= r; @@ -1042,281 +495,456 @@ up_generate_attr(struct rde_peer *peer, flags = ATTR_OPTIONAL|ATTR_TRANSITIVE; if (!(asp->flags & F_PREFIX_ANNOUNCED)) flags |= ATTR_PARTIAL; - if ((r = attr_write(up_attr_buf + wlen, len, flags, + if ((r = attr_write(buf + wlen, len, flags, ATTR_AS4_AGGREGATOR, newaggr->data, newaggr->len)) == -1) return (-1); wlen += r; len -= r; } - /* write mp attribute to different buffer */ - if (ismp) - if (up_generate_mp_reach(peer, upa, state, aid) == -1) - return (-1); - - /* the bgp path attributes are now stored in the global buf */ - upa->attr = malloc(wlen); - if (upa->attr == NULL) - fatal("up_generate_attr"); - memcpy(upa->attr, up_attr_buf, wlen); - upa->attr_len = wlen; return (wlen); } -#define MIN_PREFIX_LEN 5 /* 1 byte prefix length + 4 bytes addr */ +/* + * Check if the pending element is an EoR marker. If so remove it from the + * tree and return 1.
+ */ int -up_dump_prefix(u_char *buf, int len, struct uplist_prefix *prefix_head, +up_is_eor(struct rde_peer *peer, u_int8_t aid) +{ + struct prefix *p; + + p = RB_MIN(prefix_tree, &peer->updates[aid]); + if (p != NULL && p->eor) { + RB_REMOVE(prefix_tree, &peer->updates[aid], p); + prefix_destroy(p); + return 1; + } + return 0; +} + +/* minimal buffer size > withdraw len + attr len + attr hdr + afi/safi */ +#define MIN_UPDATE_LEN 16 + +/* + * Write prefixes to buffer until either there is no more space or + * the next prefix no longer has the same ASPATH attributes. + */ +static int +up_dump_prefix(u_char *buf, int len, struct prefix_tree *prefix_head, struct rde_peer *peer, int withdraw) { - struct update_prefix *upp; - int r, wpos = 0; + struct prefix *p, *np; + struct bgpd_addr addr; + int r, wpos = 0, done = 0; - while ((upp = TAILQ_FIRST(prefix_head)) != NULL) { + RB_FOREACH_SAFE(p, prefix_tree, prefix_head, np) { + pt_getaddr(p->re->prefix, &addr); if ((r = prefix_write(buf + wpos, len - wpos, - &upp->prefix, upp->prefixlen, withdraw)) == -1) + &addr, p->re->prefix->prefixlen, withdraw)) == -1) break; wpos += r; - if (RB_REMOVE(uptree_prefix, &peer->up_prefix, upp) == NULL) - log_warnx("dequeuing update failed."); - TAILQ_REMOVE(upp->prefix_h, upp, prefix_l); - peer->up_pcnt--; + + /* make sure we only dump prefixes which belong together */ + if (np == NULL || np->aspath != p->aspath || + np->nexthop != p->nexthop || np->nhflags != p->nhflags || + np->eor) + done = 1; + + /* prefix sent, remove from list and clear flag */ + RB_REMOVE(prefix_tree, prefix_head, p); + p->flags = 0; + if (withdraw) { + /* prefix no longer needed, remove it */ + prefix_destroy(p); peer->up_wcnt--; peer->prefix_sent_withdraw++; } else { + /* prefix still in Adj-RIB-Out, keep it */ peer->up_nlricnt--; peer->prefix_sent_update++; } - free(upp); + if (done) + break; } return (wpos); } int +up_dump_withdraws(u_char *buf, int len, struct rde_peer *peer, u_int8_t aid) +{ + u_int16_t 
wpos, wd_len; + int r; + + if (len < MIN_UPDATE_LEN) + return (-1); + + /* reserve space for the length field */ + wpos = 2; + r = up_dump_prefix(buf + wpos, len - wpos, &peer->withdraws[aid], + peer, 1); + wd_len = htons(r); + memcpy(buf, &wd_len, 2); + + return (wpos + r); +} + +int +up_dump_mp_unreach(u_char *buf, int len, struct rde_peer *peer, u_int8_t aid) +{ + u_char *attrbuf; + int wpos, r; + u_int16_t attr_len, tmp; + + if (len < MIN_UPDATE_LEN || RB_EMPTY(&peer->withdraws[aid])) + return (-1); + + /* reserve space for withdraw len, attr len */ + wpos = 2 + 2; + attrbuf = buf + wpos; + + /* attribute header, defaulting to extended length one */ + attrbuf[0] = ATTR_OPTIONAL | ATTR_EXTLEN; + attrbuf[1] = ATTR_MP_UNREACH_NLRI; + wpos += 4; + + /* afi & safi */ + if (aid2afi(aid, &tmp, buf + wpos + 2)) + fatalx("up_dump_mp_unreach: bad AID"); + tmp = htons(tmp); + memcpy(buf + wpos, &tmp, sizeof(u_int16_t)); + wpos += 3; + + r = up_dump_prefix(buf + wpos, len - wpos, &peer->withdraws[aid], + peer, 1); + if (r == 0) + return (-1); + wpos += r; + attr_len = r + 3; /* prefixes + afi & safi */ + + /* attribute length */ + attr_len = htons(attr_len); + memcpy(attrbuf + 2, &attr_len, sizeof(attr_len)); + + /* write length fields */ + bzero(buf, sizeof(u_int16_t)); /* withdrawn routes len */ + attr_len = htons(wpos - 4); + memcpy(buf + 2, &attr_len, sizeof(attr_len)); + + return (wpos); +} + +int up_dump_attrnlri(u_char *buf, int len, struct rde_peer *peer) { - struct update_attr *upa; + struct filterstate state; + struct prefix *p; int r, wpos; u_int16_t attr_len; - /* - * It is possible that a queued path attribute has no nlri prefix. - * Ignore and remove those path attributes. 
- */ - while ((upa = TAILQ_FIRST(&peer->updates[AID_INET])) != NULL) - if (TAILQ_EMPTY(&upa->prefix_h)) { - attr_len = upa->attr_len; - if (RB_REMOVE(uptree_attr, &peer->up_attrs, - upa) == NULL) - log_warnx("dequeuing update failed."); - TAILQ_REMOVE(&peer->updates[AID_INET], upa, attr_l); - free(upa->attr); - free(upa->mpattr); - free(upa); - peer->up_acnt--; - /* XXX horrible hack, - * if attr_len is 0, it is a EoR marker */ - if (attr_len == 0) - return (-1); - } else - break; + if (len < 2) + fatalx("up_dump_attrnlri: buffer way too small"); + if (len < MIN_UPDATE_LEN) + goto done; + + p = RB_MIN(prefix_tree, &peer->updates[AID_INET]); + if (p == NULL) + goto done; - if (upa == NULL || upa->attr_len + MIN_PREFIX_LEN > len) { + rde_filterstate_prep(&state, prefix_aspath(p), prefix_nexthop(p), + prefix_nhflags(p)); + + r = up_generate_attr(buf + 2, len - 2, peer, &state, AID_INET); + rde_filterstate_clean(&state); + if (r == -1) { /* * either no packet or not enough space. * The length field needs to be set to zero else it would be * an invalid bgp update. 
*/ +done: bzero(buf, 2); return (2); } /* first dump the 2-byte path attribute length */ - attr_len = htons(upa->attr_len); + attr_len = htons(r); memcpy(buf, &attr_len, 2); wpos = 2; - - /* then the path attributes themselves */ - memcpy(buf + wpos, upa->attr, upa->attr_len); - wpos += upa->attr_len; + /* then skip over the already dumped path attributes themselves */ + wpos += r; /* last but not least dump the nlri */ - r = up_dump_prefix(buf + wpos, len - wpos, &upa->prefix_h, peer, 0); + r = up_dump_prefix(buf + wpos, len - wpos, &peer->updates[AID_INET], + peer, 0); wpos += r; - /* now check if all prefixes were written */ - if (TAILQ_EMPTY(&upa->prefix_h)) { - if (RB_REMOVE(uptree_attr, &peer->up_attrs, upa) == NULL) - log_warnx("dequeuing update failed."); - TAILQ_REMOVE(&peer->updates[AID_INET], upa, attr_l); - free(upa->attr); - free(upa->mpattr); - free(upa); - peer->up_acnt--; - } - return (wpos); } -u_char * -up_dump_mp_unreach(u_char *buf, u_int16_t *len, struct rde_peer *peer, - u_int8_t aid) +static int +up_generate_mp_reach(u_char *buf, int len, struct rde_peer *peer, + struct filterstate *state, u_int8_t aid) { - int wpos; - u_int16_t datalen, tmp; - u_int16_t attrlen = 2; /* attribute header (without len) */ - u_int8_t flags = ATTR_OPTIONAL, safi; + u_char *attrbuf; + int r; + int wpos, attrlen; + u_int16_t tmp; - /* - * reserve space for withdraw len, attr len, the attribute header - * and the mp attribute header - */ - wpos = 2 + 2 + 4 + 3; - - if (*len < wpos) - return (NULL); + if (len < 4) + return (-1); + /* attribute header, defaulting to extended length one */ + buf[0] = ATTR_OPTIONAL | ATTR_EXTLEN; + buf[1] = ATTR_MP_REACH_NLRI; + wpos = 4; + attrbuf = buf + wpos; - datalen = up_dump_prefix(buf + wpos, *len - wpos, - &peer->withdraws[aid], peer, 1); - if (datalen == 0) - return (NULL); + switch (aid) { + case AID_INET6: + attrlen = 21; /* AFI + SAFI + NH LEN + NH + Reserved */ + if (len < wpos + attrlen) + return (-1); + wpos += attrlen; 
+ if (aid2afi(aid, &tmp, &attrbuf[2])) + fatalx("up_generate_mp_reach: bad AID"); + tmp = htons(tmp); + memcpy(attrbuf, &tmp, sizeof(tmp)); + attrbuf[3] = sizeof(struct in6_addr); + attrbuf[20] = 0; /* Reserved must be 0 */ - datalen += 3; /* afi + safi */ + /* nexthop dance see also up_get_nexthop() */ + attrbuf += 4; + if (state->nhflags & NEXTHOP_NOMODIFY) { + /* no modify flag set */ + if (state->nexthop == NULL) + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + else + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v6, + sizeof(struct in6_addr)); + } else if (state->nhflags & NEXTHOP_SELF) + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + else if (!peer->conf.ebgp) { + /* ibgp */ + if (state->nexthop == NULL || + (state->nexthop->exit_nexthop.aid == AID_INET6 && + !memcmp(&state->nexthop->exit_nexthop.v6, + &peer->remote_addr.v6, sizeof(struct in6_addr)))) + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + else + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v6, + sizeof(struct in6_addr)); + } else if (peer->conf.distance == 1) { + /* ebgp directly connected */ + if (state->nexthop != NULL && + state->nexthop->flags & NEXTHOP_CONNECTED) + if (prefix_compare(&peer->remote_addr, + &state->nexthop->nexthop_net, + state->nexthop->nexthop_netlen) == 0) { + /* + * nexthop and peer are in the same + * subnet + */ + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v6, + sizeof(struct in6_addr)); + break; + } + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + } else + /* ebgp multihop */ + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + break; + case AID_VPN_IPv4: + attrlen = 17; /* AFI + SAFI + NH LEN + NH + Reserved */ + if (len < wpos + attrlen) + return (-1); + wpos += attrlen; + if (aid2afi(aid, &tmp, &attrbuf[2])) + fatalx("up_generate_mp_reachi: bad AID"); + tmp = htons(tmp); + memcpy(attrbuf, &tmp, sizeof(tmp)); + attrbuf[3] = sizeof(u_int64_t) + 
sizeof(struct in_addr); + bzero(attrbuf + 4, sizeof(u_int64_t)); + attrbuf[16] = 0; /* Reserved must be 0 */ - /* prepend header, need to do it reverse */ - /* safi & afi */ - if (aid2afi(aid, &tmp, &safi)) - fatalx("up_dump_mp_unreach: bad AID"); - buf[--wpos] = safi; - wpos -= sizeof(u_int16_t); - tmp = htons(tmp); - memcpy(buf + wpos, &tmp, sizeof(u_int16_t)); + /* nexthop dance see also up_get_nexthop() */ + attrbuf += 12; + if (state->nhflags & NEXTHOP_NOMODIFY) { + /* no modify flag set */ + if (state->nexthop == NULL) + memcpy(attrbuf, &peer->local_v4_addr.v4, + sizeof(struct in_addr)); + else + /* nexthops are stored as IPv4 addrs */ + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v4, + sizeof(struct in_addr)); + } else if (state->nhflags & NEXTHOP_SELF) { + memcpy(attrbuf, &peer->local_v4_addr.v4, + sizeof(struct in_addr)); + } else if (!peer->conf.ebgp) { + /* ibgp */ + if (state->nexthop == NULL || + (state->nexthop->exit_nexthop.aid == AID_INET && + !memcmp(&state->nexthop->exit_nexthop.v4, + &peer->remote_addr.v4, sizeof(struct in_addr)))) + memcpy(attrbuf, &peer->local_v4_addr.v4, + sizeof(struct in_addr)); + else + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v4, + sizeof(struct in_addr)); + } else if (peer->conf.distance == 1) { + /* ebgp directly connected */ + if (state->nexthop != NULL && + state->nexthop->flags & NEXTHOP_CONNECTED) + if (prefix_compare(&peer->remote_addr, + &state->nexthop->nexthop_net, + state->nexthop->nexthop_netlen) == 0) { + /* + * nexthop and peer are in the same + * subnet + */ + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v4, + sizeof(struct in_addr)); + break; + } + memcpy(attrbuf, &peer->local_v4_addr.v4, + sizeof(struct in_addr)); + } else + /* ebgp multihop */ + memcpy(attrbuf, &peer->local_v4_addr.v4, + sizeof(struct in_addr)); + break; + case AID_VPN_IPv6: + attrlen = 29; /* AFI + SAFI + NH LEN + NH + Reserved */ + if (len < wpos + attrlen) + return (-1); + wpos += attrlen; + if (aid2afi(aid, &tmp, 
&attrbuf[2])) + fatalx("up_generate_mp_reachi: bad AID"); + tmp = htons(tmp); + memcpy(attrbuf, &tmp, sizeof(tmp)); + attrbuf[3] = sizeof(u_int64_t) + sizeof(struct in6_addr); + bzero(attrbuf + 4, sizeof(u_int64_t)); + attrbuf[28] = 0; /* Reserved must be 0 */ - /* attribute length */ - if (datalen > 255) { - attrlen += 2 + datalen; - flags |= ATTR_EXTLEN; - wpos -= sizeof(u_int16_t); - tmp = htons(datalen); - memcpy(buf + wpos, &tmp, sizeof(u_int16_t)); - } else { - attrlen += 1 + datalen; - buf[--wpos] = (u_char)datalen; + /* nexthop dance see also up_get_nexthop() */ + attrbuf += 12; + if (state->nhflags & NEXTHOP_NOMODIFY) { + /* no modify flag set */ + if (state->nexthop == NULL) + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + else + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v6, + sizeof(struct in6_addr)); + } else if (state->nhflags & NEXTHOP_SELF) + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + else if (!peer->conf.ebgp) { + /* ibgp */ + if (state->nexthop == NULL || + (state->nexthop->exit_nexthop.aid == AID_INET6 && + !memcmp(&state->nexthop->exit_nexthop.v6, + &peer->remote_addr.v6, sizeof(struct in6_addr)))) + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + else + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v6, + sizeof(struct in6_addr)); + } else if (peer->conf.distance == 1) { + /* ebgp directly connected */ + if (state->nexthop != NULL && + state->nexthop->flags & NEXTHOP_CONNECTED) + if (prefix_compare(&peer->remote_addr, + &state->nexthop->nexthop_net, + state->nexthop->nexthop_netlen) == 0) { + /* + * nexthop and peer are in the same + * subnet + */ + memcpy(attrbuf, + &state->nexthop->exit_nexthop.v6, + sizeof(struct in6_addr)); + break; + } + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + } else + /* ebgp multihop */ + memcpy(attrbuf, &peer->local_v6_addr.v6, + sizeof(struct in6_addr)); + break; + default: + fatalx("up_generate_mp_reach: 
unknown AID"); } - /* mp attribute */ - buf[--wpos] = (u_char)ATTR_MP_UNREACH_NLRI; - buf[--wpos] = flags; - - /* attribute length */ - wpos -= sizeof(u_int16_t); + r = up_dump_prefix(buf + wpos, len - wpos, &peer->updates[aid], + peer, 0); + if (r == 0) { + /* no prefixes written ... */ + return (-1); + } + attrlen += r; + wpos += r; + /* update attribute length field */ tmp = htons(attrlen); - memcpy(buf + wpos, &tmp, sizeof(u_int16_t)); - - /* no IPv4 withdraws */ - wpos -= sizeof(u_int16_t); - bzero(buf + wpos, sizeof(u_int16_t)); - - if (wpos < 0) - fatalx("up_dump_mp_unreach: buffer underflow"); + memcpy(buf + 2, &tmp, sizeof(tmp)); - /* total length includes the two 2-bytes length fields. */ - *len = attrlen + 2 * sizeof(u_int16_t); - - return (buf + wpos); + return (wpos); } int -up_dump_mp_reach(u_char *buf, u_int16_t *len, struct rde_peer *peer, - u_int8_t aid) +up_dump_mp_reach(u_char *buf, int len, struct rde_peer *peer, u_int8_t aid) { - struct update_attr *upa; - int wpos; - u_int16_t attr_len, datalen, tmp; - u_int8_t flags = ATTR_OPTIONAL; + struct filterstate state; + struct prefix *p; + int r, wpos; + u_int16_t attr_len; - /* - * It is possible that a queued path attribute has no nlri prefix. - * Ignore and remove those path attributes. 
- */ - while ((upa = TAILQ_FIRST(&peer->updates[aid])) != NULL) - if (TAILQ_EMPTY(&upa->prefix_h)) { - attr_len = upa->attr_len; - if (RB_REMOVE(uptree_attr, &peer->up_attrs, - upa) == NULL) - log_warnx("dequeuing update failed."); - TAILQ_REMOVE(&peer->updates[aid], upa, attr_l); - free(upa->attr); - free(upa->mpattr); - free(upa); - peer->up_acnt--; - /* XXX horrible hack, - * if attr_len is 0, it is a EoR marker */ - if (attr_len == 0) - return (-1); - } else - break; + if (len < MIN_UPDATE_LEN) + return 0; - if (upa == NULL) - return (-2); + /* get starting point */ + p = RB_MIN(prefix_tree, &peer->updates[aid]); + if (p == NULL) + return 0; - /* - * reserve space for attr len, the attributes, the - * mp attribute and the attribute header - */ - wpos = 2 + 2 + upa->attr_len + 4 + upa->mpattr_len; - if (*len < wpos) - return (-2); - - datalen = up_dump_prefix(buf + wpos, *len - wpos, - &upa->prefix_h, peer, 0); - if (datalen == 0) - return (-2); - - if (upa->mpattr_len == 0 || upa->mpattr == NULL) - fatalx("multiprotocol update without MP attrs"); - - datalen += upa->mpattr_len; - wpos -= upa->mpattr_len; - memcpy(buf + wpos, upa->mpattr, upa->mpattr_len); - - if (datalen > 255) { - wpos -= 2; - tmp = htons(datalen); - memcpy(buf + wpos, &tmp, sizeof(tmp)); - datalen += 4; - flags |= ATTR_EXTLEN; - } else { - buf[--wpos] = (u_char)datalen; - datalen += 3; - } - buf[--wpos] = (u_char)ATTR_MP_REACH_NLRI; - buf[--wpos] = flags; + wpos = 4; /* reserve space for length fields */ - datalen += upa->attr_len; - wpos -= upa->attr_len; - memcpy(buf + wpos, upa->attr, upa->attr_len); - - if (wpos < 4) - fatalx("Grrr, mp_reach buffer fucked up"); - - wpos -= 2; - tmp = htons(datalen); - memcpy(buf + wpos, &tmp, sizeof(tmp)); - - wpos -= 2; - bzero(buf + wpos, 2); - - /* now check if all prefixes were written */ - if (TAILQ_EMPTY(&upa->prefix_h)) { - if (RB_REMOVE(uptree_attr, &peer->up_attrs, upa) == NULL) - log_warnx("dequeuing update failed."); - 
TAILQ_REMOVE(&peer->updates[aid], upa, attr_l); - free(upa->attr); - free(upa->mpattr); - free(upa); - peer->up_acnt--; + rde_filterstate_prep(&state, prefix_aspath(p), prefix_nexthop(p), + prefix_nhflags(p)); + + /* write regular path attributes */ + r = up_generate_attr(buf + wpos, len - wpos, peer, &state, aid); + if (r == -1) { + rde_filterstate_clean(&state); + return 0; } + wpos += r; + + /* write mp attribute */ + r = up_generate_mp_reach(buf + wpos, len - wpos, peer, &state, aid); + rde_filterstate_clean(&state); + if (r == -1) + return 0; + wpos += r; + + /* write length fields */ + bzero(buf, sizeof(u_int16_t)); /* withdrawn routes len */ + attr_len = htons(wpos - 4); + memcpy(buf + 2, &attr_len, sizeof(attr_len)); - *len = datalen + 4; return (wpos); }