Hello, Francois!
Regarding 2.0.12, you just need to wait, loading 1.2M routes from 50
peers running 6x roa_check each is simply slow on one CPU. You may
ignore the debug log, there is typically nothing useful. Also thank you
for noting that the "reload burst split" message is still there; I
forgot to convert it to a L_TRACE log.
Also you should probably use the roa_check in a switch-case syntax,
effectively reducing the number of calls from 6 to 2. It's the most
resource-demanding thing inside your filter.
Also you may want to postpone BGP startup by several seconds to let the
RPKI feed its table completely, avoiding otherwise necessary filter
recalculation. (The exact delay must be determined locally.)
The "bad lock order" bug is known in 3.0-alpha0 and has no simple
solution and we ended up rewriting lots of other things. There is no
newer version to test for now.
Thanks to our Support Subscribers, we can afford testing hardware good
enough to test these scenarios systematically. Thus 3.0-alpha1 will be
able to handle 60M routes fast and safely, and we're going to release it
soon.
Please consider subscribing to make it possible for us to test BIRD also
for your scenarios; for more information, contact us at
[email protected].
Have a nice day!
Maria
On 4/4/23 11:24, Francois Espinet wrote:
Hello,
I am currently trying bird out in a route collector scenario. We have
around 50 devices all sending around 1.2M routes.
I initially started with bird 2.0.12, but the CPU is stuck at 100%, and
the debug logs have a lot of "channel reload burst split (max_feed=-1)".
So I wanted to try bird 3.0, but I am getting the following logs (using
the -d flag), and the router crashes just after starting:
bird: Started
bird: Trying to lock in a bad order
Aborted
Any idea what could be the issue there ?
Here is my config:
# Use the long ISO 8601 timestamp format for every kind of time output.
timeformat base iso long;
timeformat log iso long;
timeformat protocol iso long;
timeformat route iso long;

# Router identity (address redacted in the original post).
router id X.X.X.X;
hostname "route-collector";

# Custom route attributes written by filter flag_rpki: one ROA validation
# result per RPKI source (1 = invalid, 2 = unknown, 3 = valid).
attribute int roa_status1;
attribute int roa_status2;

# ROA tables, one IPv4/IPv6 pair per RPKI protocol instance.
roa4 table roa4_1;
roa4 table roa4_2;
roa6 table roa6_1;
roa6 table roa6_2;

# Routing tables used by the BGP channels of protocol PB.
# (The original was missing the terminating semicolon after pb4.)
ipv4 table pb4;
ipv6 table pb6;
# Tag every imported route with its ROA validation state against both RPKI
# sources, then accept it. Nothing is rejected here; the result is only
# recorded in roa_status1 / roa_status2 (1 = invalid, 2 = unknown, 3 = valid).
filter flag_rpki {
	# Routes with an empty AS path, or originated by our own AS, are
	# accepted without validation.
	if bgp_path.len = 0 || bgp_path.last = 16276 then accept;

	# roa_check() is the most expensive operation in this filter, so call it
	# exactly once per ROA table and dispatch on the result with `case`
	# instead of re-evaluating it for every possible outcome.
	# The filter is shared by the ipv4 and ipv6 channels, so pick the ROA
	# table that matches the address family of the route.
	if net.type = NET_IP4 then {
		case roa_check(roa4_1, net, bgp_path.last) {
			ROA_INVALID: roa_status1 = 1;
			ROA_UNKNOWN: roa_status1 = 2;
			ROA_VALID: roa_status1 = 3;
		}
		case roa_check(roa4_2, net, bgp_path.last) {
			ROA_INVALID: roa_status2 = 1;
			ROA_UNKNOWN: roa_status2 = 2;
			ROA_VALID: roa_status2 = 3;
		}
	} else {
		case roa_check(roa6_1, net, bgp_path.last) {
			ROA_INVALID: roa_status1 = 1;
			ROA_UNKNOWN: roa_status1 = 2;
			ROA_VALID: roa_status1 = 3;
		}
		case roa_check(roa6_2, net, bgp_path.last) {
			ROA_INVALID: roa_status2 = 1;
			ROA_UNKNOWN: roa_status2 = 2;
			ROA_VALID: roa_status2 = 3;
		}
	}

	accept;
}
# Dynamic iBGP listener for the route collector: peers in AS 16276 connecting
# from any IPv4 address matched by the neighbor range get their own session.
protocol bgp PB {
	local X.X.X.X as 16276;
	neighbor range 0.0.0.0/0 as 16276;
	strict bind on;

	# Naming of the spawned per-peer sessions: "pb" prefix, 2-digit counter.
	dynamic name "pb";
	dynamic name digits 2;

	ipv4 {
		table pb4;
		# Collector only: tag routes on import, never announce anything.
		import filter flag_rpki;
		export none;
		import table yes;
		rpki reload on;
		add paths rx;
		next hop keep on;
	};

	ipv6 {
		table pb6;
		# Collector only: tag routes on import, never announce anything.
		import filter flag_rpki;
		export none;
		import table yes;
		rpki reload on;
		add paths rx;
		next hop keep on;
	};
}
# First RPKI-to-Router session; fills the roa4_1 / roa6_1 tables.
protocol rpki stack1 {
	remote X.X.X.Z port 323;
	transport tcp;

	roa4 {
		table roa4_1;
	};
	roa6 {
		table roa6_1;
	};

	# Session timers, in seconds.
	refresh 300;
	retry 300;
	expire 600;
}
# Second RPKI-to-Router session; fills the roa4_2 / roa6_2 tables.
protocol rpki stack2 {
	remote X.X.X.Y port 323;
	transport tcp;

	roa4 {
		table roa4_2;
	};
	roa6 {
		table roa6_2;
	};

	# Session timers, in seconds.
	refresh 300;
	retry 300;
	expire 600;
}
Best regards,
François