On Wednesday, October 19, 2022 10:39:55 AM EDT Eelco Chaudron wrote: > On 19 Oct 2022, at 12:14, Adrian Moreno wrote: > > Hi Kevin. > > > > Thanks for following up with this patch! > > > > On 10/3/22 22:07, Kevin Sprague wrote: > >> During normal operations, it is useful to understand when a particular > >> flow > >> gets removed from the system. This can be useful when debugging > >> performance > >> issues tied to ofproto flow changes, trying to determine deployed traffic > >> patterns, or while debugging dynamic systems where ports come and go. > >> > >> Prior to this change, there was a lack of visibility around flow > >> expiration. The existing debugging infrastructure could tell us when a > >> flow was added to the datapath, but not when it was removed or why. > >> > >> This change introduces a USDT probe at the point where the revalidator > >> determines that the flow should be removed. Additionally, we track the > >> reason for the flow eviction and provide that information as well. With > >> this change, we can track the complete flow lifecycle for the netlink > >> datapath by hooking the upcall tracepoint in kernel, the flow put USDT, > >> and the revaldiator USDT, letting us watch as flows are added and > >> removed from the kernel datapath. > >> > >> This change only enables this information via USDT probe, so it won't be > >> possible to access this information any other way (see: > >> Documentation/topics/usdt-probes.rst). > >> > >> Also included is a script (utilities/usdt-scripts/revalidate_monitor.py) > >> that serves as a demonstration of how the new USDT probe might be used > >> going forward. > >> > >> Signed-off-by: Kevin Sprague <[email protected]> > >> --- > >> > >> Documentation/topics/usdt-probes.rst | 23 + > >> ofproto/ofproto-dpif-upcall.c | 40 +- > >> utilities/automake.mk | 2 + > >> utilities/usdt-scripts/revalidate_monitor.py | 609 +++++++++++++++++++ > >> 4 files changed, 668 insertions(+), 6 deletions(-) > >> create mode 100755 utilities/usdt-scripts/revalidate_monitor.py > >> > >> diff --git a/Documentation/topics/usdt-probes.rst > >> b/Documentation/topics/usdt-probes.rst index 7ce19aaed..2cae8c4b4 100644 > >> --- a/Documentation/topics/usdt-probes.rst > >> +++ b/Documentation/topics/usdt-probes.rst > >> > >> @@ -214,6 +214,7 @@ Available probes in ``ovs_vswitchd``: > >> - dpif_recv:recv_upcall > >> - main:poll_block > >> - main:run_start > >> > >> +- revalidate:flow_result > >> > >> dpif_netlink_operate\_\_:op_flow_del > >> > >> @@ -294,6 +295,7 @@ DPIF_OP_FLOW_PUT operation as part of the dpif > >> ``operate()`` callback.>> > >> **Script references**: > >> +- ``utilities/usdt-scripts/revalidate_monitor.py`` > >> > >> - ``utilities/usdt-scripts/upcall_cost.py`` > >> > >> @@ -358,6 +360,27 @@ See also the ``main:run_start`` probe above. > >> > >> - ``utilities/usdt-scripts/bridge_loop.bt`` > >> > >> +probe revalidate:flow_result > >> > >> +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ > >> + > >> +**Description**: > >> +This probe is triggered when the revalidator decides whether or not to > >> +revalidate a flow. ``reason`` is an enum that denotes that either the > >> flow > >> +is being kept, or the reason why the flow is being deleted. The > >> +``revalidate_monitor.py`` script uses this probe to notify users when > >> flows +matching user-provided criteria are deleted. > >> + > >> +**Arguments**: > >> + > >> +- *arg0*: ``(enum flow_del_reason) reason`` > >> +- *arg1*: ``(struct udpif *) udpif`` > >> +- *arg2*: ``(struct udpif_key *) ukey`` > >> + > >> +**Script references**: > >> + > >> +- ``utilities/usdt-scripts/revalidate_monitor.py`` > >> + > >> + > >> > >> Adding your own probes > >> ---------------------- > >> > >> diff --git a/ofproto/ofproto-dpif-upcall.c > >> b/ofproto/ofproto-dpif-upcall.c > >> > >> index 57f94df54..a038799c1 100644 > >> --- a/ofproto/ofproto-dpif-upcall.c > >> +++ b/ofproto/ofproto-dpif-upcall.c > >> @@ -31,6 +31,7 @@ > >> > >> #include "openvswitch/list.h" > >> #include "netlink.h" > >> #include "openvswitch/ofpbuf.h" > >> > >> +#include "openvswitch/usdt-probes.h" > >> > >> #include "ofproto-dpif-ipfix.h" > >> #include "ofproto-dpif-sflow.h" > >> #include "ofproto-dpif-xlate.h" > >> > >> @@ -260,6 +261,17 @@ enum ukey_state { > >> > >> }; > >> #define N_UKEY_STATES (UKEY_DELETED + 1) > >> > >> +enum flow_del_reason { > >> > >> + FDR_FLOW_LIVE = 0, /* The flow was revalidated. */ > >> + FDR_FLOW_TIME_OUT, /* The flow went unused and was deleted. */ > >> + FDR_TOO_EXPENSIVE, /* The flow was too expensive to revalidate. > >> */ + FDR_FLOW_WILDCARDED, /* The flow needed a narrower wildcard > >> mask. */ + FDR_BAD_ODP_FIT, /* The flow had a bad ODP flow > >> fit. */ + FDR_ASSOCIATED_OFPROTO, /* The flow had an associated > >> ofproto. */> > > s/had an associated ofproto/did't have an associated ofproto/ > > > >> + FDR_XLATION_ERROR, /* There was an error translating the flow. > >> */ > >> + FDR_AVOID_CACHING, /* Flow deleted to avoid caching. */ > >> +}; > >> + > >> > >> /* 'udpif_key's are responsible for tracking the little bit of state > >> udpif > >> > >> * needs to do flow expiration which can't be pulled directly from the > >> * datapath. They may be created by any handler or revalidator thread > >> at any>> > >> @@ -2202,7 +2214,8 @@ populate_xcache(struct udpif *udpif, struct > >> udpif_key *ukey,>> > >> static enum reval_result > >> revalidate_ukey__(struct udpif *udpif, const struct udpif_key *ukey, > >> > >> uint16_t tcp_flags, struct ofpbuf *odp_actions, > >> > >> - struct recirc_refs *recircs, struct xlate_cache > >> *xcache) > >> + struct recirc_refs *recircs, struct xlate_cache > >> *xcache, > >> + enum flow_del_reason *reason) > >> > >> { > >> > >> struct xlate_out *xoutp; > >> struct netflow *netflow; > >> > >> @@ -2220,11 +2233,13 @@ revalidate_ukey__(struct udpif *udpif, const > >> struct udpif_key *ukey,>> > >> netflow = NULL; > >> > >> if (xlate_ukey(udpif, ukey, tcp_flags, &ctx)) { > >> > >> + *reason = FDR_XLATION_ERROR; > >> > >> goto exit; > >> > >> } > >> xoutp = &ctx.xout; > >> > >> if (xoutp->avoid_caching) { > >> > >> + *reason = FDR_AVOID_CACHING; > >> > >> goto exit; > >> > >> } > >> > >> @@ -2238,6 +2253,7 @@ revalidate_ukey__(struct udpif *udpif, const > >> struct udpif_key *ukey,>> > >> ofpbuf_clear(odp_actions); > >> > >> if (!ofproto) { > >> > >> + *reason = FDR_ASSOCIATED_OFPROTO; > >> > >> goto exit; > >> > >> } > >> > >> @@ -2249,6 +2265,7 @@ revalidate_ukey__(struct udpif *udpif, const > >> struct udpif_key *ukey,>> > >> if (odp_flow_key_to_mask(ukey->mask, ukey->mask_len, &dp_mask, > >> &ctx.flow, > >> > >> NULL) > >> > >> == ODP_FIT_ERROR) { > >> > >> + *reason = FDR_BAD_ODP_FIT; > >> > >> goto exit; > >> > >> } > >> > >> @@ -2258,6 +2275,7 @@ revalidate_ukey__(struct udpif *udpif, const > >> struct udpif_key *ukey,>> > >> * down. Note that we do not know if the datapath has ignored any > >> of the > >> * wildcarded bits, so we may be overly conservative here. */ > >> > >> if (flow_wildcards_has_extra(&dp_mask, ctx.wc)) { > >> > >> + *reason = FDR_FLOW_WILDCARDED; > >> > >> goto exit; > >> > >> } > >> > >> @@ -2303,7 +2321,8 @@ static enum reval_result > >> > >> revalidate_ukey(struct udpif *udpif, struct udpif_key *ukey, > >> > >> const struct dpif_flow_stats *stats, > >> struct ofpbuf *odp_actions, uint64_t reval_seq, > >> > >> - struct recirc_refs *recircs, bool offloaded) > >> + struct recirc_refs *recircs, bool offloaded, > >> + enum flow_del_reason *reason) > >> > >> OVS_REQUIRES(ukey->mutex) > >> > >> { > >> > >> bool need_revalidate = ukey->reval_seq != reval_seq; > >> > >> @@ -2329,8 +2348,12 @@ revalidate_ukey(struct udpif *udpif, struct > >> udpif_key *ukey,>> > >> xlate_cache_clear(ukey->xcache); > >> > >> } > >> result = revalidate_ukey__(udpif, ukey, push.tcp_flags, > >> > >> - odp_actions, recircs, > >> ukey->xcache); - } /* else delete; too expensive to revalidate */ > >> + odp_actions, recircs, > >> ukey->xcache, > >> + reason); > >> + } else { > >> + /* Delete the flow as it's too expensive to revalidate. */ > >> + *reason = FDR_TOO_EXPENSIVE; > >> + } > >> > >> } else if (!push.n_packets || ukey->xcache > >> > >> || !populate_xcache(udpif, ukey, push.tcp_flags)) { > >> > >> result = UKEY_KEEP; > >> > >> @@ -2720,6 +2743,7 @@ revalidate(struct revalidator *revalidator) > >> > >> struct recirc_refs recircs = RECIRC_REFS_EMPTY_INITIALIZER; > >> struct dpif_flow_stats stats = f->stats; > >> enum reval_result result; > >> > >> + enum flow_del_reason reason = FDR_FLOW_LIVE; > >> > >> struct udpif_key *ukey; > >> bool already_dumped; > >> int error; > >> > >> @@ -2767,10 +2791,11 @@ revalidate(struct revalidator *revalidator) > >> > >> } > >> if (kill_them_all || (used && used < now - max_idle)) { > >> > >> result = UKEY_DELETE; > >> > >> + reason = FDR_FLOW_TIME_OUT; > > > > Maybe we should distinguish between kill_them_all and actual timeout? > > Yes, this might be a good reason to add. > > >> } else { > >> > >> result = revalidate_ukey(udpif, ukey, &stats, > >> &odp_actions, > >> > >> reval_seq, &recircs, > >> > >> - f->attrs.offloaded); > >> + f->attrs.offloaded, &reason); > >> > >> } > >> ukey->dump_seq = dump_seq; > >> > >> @@ -2779,6 +2804,7 @@ revalidate(struct revalidator *revalidator) > >> > >> udpif_update_flow_pps(udpif, ukey, f); > >> > >> } > >> > >> + OVS_USDT_PROBE(revalidate, flow_result, reason, udpif, > >> ukey); > > > > Many production builds won't have USDT enabled. For those cases, it may be > > worth adding a coverage counter like what Han proposed [1]. WDYT? > > > > > > [1] > > https://patchwork.ozlabs.org/project/openvswitch/patch/20220809014100.371 > > [email protected]/ > I agree we should get these two patches integrated, or at least use the same > *flow_del_reason* names for the coverage counters. > >> if (result != UKEY_KEEP) { > >> > >> /* Takes ownership of 'recircs'. */ > >> reval_op_init(&ops[n_ops++], result, udpif, ukey, > >> &recircs, > >> > >> @@ -2829,6 +2855,7 @@ revalidator_sweep__(struct revalidator > >> *revalidator, bool purge)>> > >> struct udpif_key *ukey; > >> struct umap *umap = &udpif->ukeys[i]; > >> size_t n_ops = 0; > >> > >> + enum flow_del_reason reason = FDR_FLOW_LIVE; > >> > >> CMAP_FOR_EACH(ukey, cmap_node, &umap->cmap) { > >> > >> enum ukey_state ukey_state; > >> > >> @@ -2855,7 +2882,8 @@ revalidator_sweep__(struct revalidator > >> *revalidator, bool purge)>> > >> COVERAGE_INC(revalidate_missed_dp_flow); > >> memset(&stats, 0, sizeof stats); > >> result = revalidate_ukey(udpif, ukey, &stats, > >> &odp_actions, > >> > >> - reval_seq, &recircs, > >> false); > >> + reval_seq, &recircs, false, > >> + &reason); > >> > >> } > >> if (result != UKEY_KEEP) { > >> > >> /* Clears 'recircs' if filled by revalidate_ukey(). > >> */ > >> > >> diff --git a/utilities/automake.mk b/utilities/automake.mk > >> index eb57653a1..bb268f343 100644 > >> --- a/utilities/automake.mk > >> +++ b/utilities/automake.mk > >> @@ -63,6 +63,7 @@ EXTRA_DIST += \ > >> > >> utilities/docker/debian/Dockerfile \ > >> utilities/docker/debian/build-kernel-modules.sh \ > >> utilities/usdt-scripts/bridge_loop.bt \ > >> > >> + utilities/usdt-scripts/revalidate_monitor.py \ > >> > >> utilities/usdt-scripts/upcall_cost.py \ > >> utilities/usdt-scripts/upcall_monitor.py > >> > >> MAN_ROOTS += \ > >> > >> @@ -133,6 +134,7 @@ FLAKE8_PYFILES += utilities/ovs-pcap.in \ > >> > >> utilities/ovs-check-dead-ifs.in \ > >> utilities/ovs-tcpdump.in \ > >> utilities/ovs-pipegen.py \ > >> > >> + utilities/usdt-scripts/revalidate_monitor.py \ > >> > >> utilities/usdt-scripts/upcall_monitor.py \ > >> utilities/usdt-scripts/upcall_cost.py > >> > >> diff --git a/utilities/usdt-scripts/revalidate_monitor.py > >> b/utilities/usdt-scripts/revalidate_monitor.py>> > >> new file mode 100755 > >> index 000000000..1b3880af9 > >> --- /dev/null > >> +++ b/utilities/usdt-scripts/revalidate_monitor.py > > We might have a name collision with my patch set. I called my script > reval_monitor.py, as I thought revalidate_monitor was too long ;) > > Maybe we can keep mine as reval_monitor.py as it monitors the revalidate > process, and we can call your’s flow_reval_monitor.py as it monitors flow > revalidation (I know it’s a long name :). What do you think?
flow_reval_monitor.py is the same number of characters as revalidate_monitor.py I'm fine with that name length, but if we wanted to shorten it, flow_monitor.py is one option, although that doesn't feel quite right, since it's not monitoring the flows except when they're upcalled and revalidated. > >> @@ -0,0 +1,609 @@ > >> +#!/usr/bin/env python3 > >> +# > >> +# Copyright (c) 2022 Redhat, Inc. > >> +# > >> +# Licensed under the Apache License, Version 2.0 (the "License"); > >> +# you may not use this file except in compliance with the License. > >> +# You may obtain a copy of the License at: > >> +# > >> +# http://www.apache.org/licenses/LICENSE-2.0 > >> +# > >> +# Unless required by applicable law or agreed to in writing, software > >> +# distributed under the License is distributed on an "AS IS" BASIS, > >> +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or > >> implied. > >> +# See the License for the specific language governing permissions and > >> +# limitations under the License. > >> +# > >> +# Script information: > >> +# ------------------- > >> +# revalidate_monitor.py uses the dpif_netlink_operate:flow_put and > >> +# revalidator:flow_result USDT probes to monitor flow lifetimes and > >> +# expiration events. By default, this will show all flow_put and flow > >> +# expiration events, along with their reasons. This will look like so: > >> +# > >> +# TIME UFID > >> EVENT/REASON +# 101536.226986736 > >> ufid:f76fc899-376d-466b-bc74-0000b933eb97 flow_put +# 101536.227196214 > >> ufid:d08472b6-110e-46cb-a9e4-00008f46838e flow_put +# > >> 101541.516610178 ufid:fc5cc4a2-39e7-4a2d-bbce-000019665b32 flow_put > >> +# 101541.516967303 ufid:fddd6510-26dc-4c87-8f7a-0000fc0c2c3a > >> flow_put +# 101551.688050747 ufid:fddd6510-26dc-4c87-8f7a-0000fc0c2c3a > >> flow timed out +# 101551.688077175 > >> ufid:fc5cc4a2-39e7-4a2d-bbce-000019665b32 flow timed out +# > >> 101557.695391371 ufid:f76fc899-376d-466b-bc74-0000b933eb97 flow > >> timed out +# 101557.695408909 > >> ufid:d08472b6-110e-46cb-a9e4-00008f46838e flow timed out +# > >> +# flow key data can be printed using the --flow-keys option. This will > >> +# print the equivalent datapath flow string. > >> +# > >> +# When filtering flows, the syntax is the same as used by > >> +# `ovs-appctl dpctl/add-flow`. > >> +# > >> +# The following options are available: > >> +# > >> +# usage: revalidate_monitor.py [-h] [--buffer-page-count NUMBER] > >> +# [-k [FLOW_KEYS]] [--pid VSWITCHD_PID] > >> +# [-D [DEBUG]] [-f [FLOW STRING ...]] > >> +# > >> +# optional arguments: > >> +# > >> +# options: > >> +# -h, --help show this help message and exit > >> +# --buffer-page-count NUMBER > >> +# Number of BPF ring buffer pages, default 1024 > >> +# -k [FLOW_KEYS], --flow-keys [FLOW_KEYS] > >> +# Print flow keys as flow strings > > > > I'm not 100% sure about putting the usage in the comment. It will most > > certantly go out of sync with the _actual_ usage. For instance right now > > this line is printed differently: > > > > ./revalidate_monitor.py -h | grep flow-keys > > > > -k, --flow-keys Print flow keys as flow strings > > I’m ok with including this here, but we should make sure it gets synced. > > >> +# --pid VSWITCHD_PID, -p VSWITCHD_PID > >> +# ovs-vswitchd's PID > >> +# -D [DEBUG], --debug [DEBUG] > >> +# Enable eBPF debugging > >> +# -f [FLOW STRING ...], --filter-flows [FLOW STRING ...] > >> +# Filter flows that match the specified flow > >> +# > >> +# Examples: > >> +# > >> +# To use the script on a running ovs-vswitchd to see flow keys and > >> expiration +# events for flows with an ipv4 source of 192.168.10.10: > >> +# $ ./revalidate_monitor.py --flow-keys --filter-flows \ > >> +# "ipv4(src=192.168.10.10)" > >> +# TIME UFID > >> EVENT/REASON +# 105082.457322742 > >> ufid:f76fc899-376d-466b-bc74-0000b933eb97 flow_put +# > >> ufid:f76fc899-376d-466b-bc74-0000b933eb97 has the following flow > >> information: +# in_port(2), > >> +# eth(src=0e:04:47:fc:74:51, dst=da:dc:c5:69:05:d7), \ > >> +# eth_type(0x800), \ > >> +# ipv4(src=192.168.10.10, dst=192.168.10.30, proto=1, tos=0, > >> ttl=64,[...]), +# icmp(type=8, code=0) > >> +# 105092.635450202 ufid:f76fc899-376d-466b-bc74-0000b933eb97 Flow > >> timed out +# > >> +# Notes: > >> +# 1) No options are needed to attach to a runing instance of > >> ovs-vswitchd. > Running > > >> +# 2) If you're using the flow filtering option, it will only track > >> flows that +# have been upcalled since the script began running. > >> + > > I decided to run something simple as I did not have a OVS setup ready. > I did a make sanbox follow by > > - ovs-vsctl add-br br0 > - ovs-appctl dpctl/add-flow "in_port(0),eth_type(0x0800),ipv4()" 100; > ovs-appctl dpctl/dump-flows > > The revalidate will clean this up immediately, but it will result in an > error on the script: > > $ sudo ./revalidate_monitor.py > TIME UFID > EVENT/REASON Type: 'unsigned char [2048]' not recognized. Please define the > data with ctypes manually. Exception ignored on calling ctypes callback > function: <function RingBuf.open_ring_buffer.<locals>.ringbuf_cb_ at > 0x7fc5ae643250> Traceback (most recent call last): > File "/usr/lib/python3.10/site-packages/bcc/table.py", line 1260, in > ringbuf_cb_ ret = callback(ctx, data, size) > File > "/home/echaudron/Documents/review/ovs_ksprague_usdt/OVS_master_DPDK_v21.11/ > ovs_github/utilities/usdt-scripts/./revalidate_monitor.py", line 502, in > handle_event event = b["events"].event(data) > File "/usr/lib/python3.10/site-packages/bcc/table.py", line 1247, in event > self._event_class = _get_event_class(self) > File "/usr/lib/python3.10/site-packages/bcc/table.py", line 264, in > _get_event_class sys.exit(1) > SystemExit: 1 > > Well, looks like I only got this on my Fedora systems, so something must be > wrong with that specific setup. Ignore it for now, or maybe you have seen > this before? > I was not able to duplicate this on my system running Fedora 36. > > I did get another odd error: > > ./revalidate_monitor.py -k > TIME UFID > EVENT/REASON 2939774.392748140 ufid:e263e60b-9ec4-43fd-9b2c-0000aba523b5 > Flow timed out > b'\x08\x00\x14\x00\x00\x00\x00\x00\x08\x00\x13\x00\x00\x00\x00\x00\x08\x00\ > x02\x00\x07\x00\x00\x00\x08\x00\x03\x00\x01\x00\x00\x00\x08\x00\x0f\x00\xd4\ > x00\x00\x00\x08\x00\x16\x00\x00\x00\x00\x00\x06\x00\x17\x00\x00\x00\x00\x00\ > x08\x00\x18\x00\x00\x00\x00\x00\x14\x00\x19\x00\x00\x00\x00\x00\x00\x00\x00\ > x00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x04\x00<\xfd\xfe\x9e\x7fh33\x00\ > x00\x00\x16\x06\x00\x06\x00\x86\xdd\x00\x00,\x00\x08\x00\x00\x00\x00\x00\x00 > \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\x02\x00\x00\x00\x00\x00\x00 > \x00\x00\x00\x00\x00\x00\x00\x16\x00\x00\x00\x00:\x00\x01\x00\x06\x00\x0c\x0 > 0\x8f\x00\x00\x00' 2939808.171260535 > ufid:b20121d7-84a6-42c3-a726-0000d29d844b Insert (put) flow to kernel. > - It holds the following flow information: > skb_priority(7), > in_port(1), > skb_mark(212), > eth(src=3c:fd:fe:9e:7f:68, dst=33:33:00:00:00:16), > eth_type(0x86dd), > ipv6(src=b'\x00', dst=ff02::16, label=b'\x00\x00\x00\x00', proto=58, > tclass=0, hlimit=1, frag=0), icmpv6(type=143, code=0) > b'\x08\x00\x14\x00\x00\x00\x00\x00\x08\x00\x13\x00\x00\x00\x00\x00\x08\x00\x > 02\x00\x00\x00\x00\x00\x08\x00\x03\x00\x01\x00\x00\x00\x08\x00\x0f\x00\x00\x > 00\x00\x00\x08\x00\x16\x00\x00\x00\x00\x00\x06\x00\x17\x00\x00\x00\x00\x00\x > 08\x00\x18\x00\x00\x00\x00\x00\x14\x00\x19\x00\x00\x00\x00\x00\x00\x00\x00\x > 00\x00\x00\x00\x00\x00\x00\x00\x00\x10\x00\x04\x00<\xfd\xfe\x9e\x7fh33\xff\x > 9e\x7fh\x06\x00\x06\x00\x86\xdd\x00\x00,\x00\x08\x00\x00\x00\x00\x00\x00\x00 > \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xff\x02\x00\x00\x00\x00\x00\x00\x00 > \x00\x00\x01\xff\x9e\x7fh\x00\x00\x00\x00:\x00\xff\x00\x06\x00\x0c\x00\x87\x > 00\x00\x00 > \x00\x0e\x00\xfe\x80\x00\x00\x00\x00\x00\x00>\xfd\xfe\xff\xfe\x9e\x7fh\x00\ > x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' Traceback (most recent call > last): > File "_ctypes/callbacks.c", line 234, in 'calling callback function' > File "/usr/lib/python3.6/site-packages/bcc/table.py", line 1068, in > ringbuf_cb_ ret = callback(ctx, data, size) > File "./revalidate_monitor.py", line 504, in handle_event > handle_flow_put(event) > File "./revalidate_monitor.py", line 229, in handle_flow_put > flow_dict, flow_str = parse_flow_dict(key) > File "./revalidate_monitor.py", line 466, in parse_flow_dict > data[1] = str(ipaddress.IPv6Address(data[1])) > File "/usr/lib64/python3.6/ipaddress.py", line 1891, in __init__ > self._check_packed_address(address, 16) > File "/usr/lib64/python3.6/ipaddress.py", line 431, in > _check_packed_address expected_len, self._version)) > > I got this on RHEL8 when just creating a bridge (with some ports) and > bringing it up (it is causing some IPv6 auto assignments): > > ip link set dev br-int up Was able to duplicate this and fix it. > > > Doing stuff like this, “./revalidate_monitor.py -k -f ipv4” is also > crashing. Maybe some filter syntax checking would be nice. Maybe it’s just > a bug in general, as your example is also not working: > > [wsfd-netdev64:~/...ilities/usdt-scripts]$ ./revalidate_monitor.py -k -f > "ipv4(src=192.168.10.10)" Traceback (most recent call last): > File "./revalidate_monitor.py", line 609, in <module> > main() > File "./revalidate_monitor.py", line 580, in main > args.filter_flows = parse_flow_str(args.filter_flows[0]) > File "./revalidate_monitor.py", line 270, in parse_flow_str > f_list = [i.removeprefix(',').strip() for i in flow_str.split(')')] > File "./revalidate_monitor.py", line 270, in <listcomp> > f_list = [i.removeprefix(',').strip() for i in flow_str.split(')')] > AttributeError: 'str' object has no attribute 'removeprefix' > Forgot that str.removeprefix wasn't introduced until Python 3.9 > >> +try: > >> + from bcc import BPF > >> + from bcc import USDT > >> + from bcc import USDTException > >> +except ModuleNotFoundError: > >> + print("WARNING: Can't find the BPF Compiler Collection Tools.") > >> + > > > > Maybe the program should exit here if bcc is not installed with this nice > > error message rather than letting it crash later on. > Agree, guess this is a copy left over from code where you are allowed to > save and replay bpf events. > >> +import argparse > >> +import ipaddress > >> +import psutil > >> +import struct > >> +import sys > >> +import time > >> + > >> +# > >> +# eBPF source code > >> +# > >> + > > > > Nit: extra line > > > >> +bpf_src = """ > >> +#include <linux/types.h> > >> +#include <uapi/linux/ptrace.h> > >> + > >> +#define MAX_KEY 2048 > >> +#define FLOW_FILTER <FILTER_BOOL> > >> + > >> +enum probe { OP_FLOW_PUT, FLOW_RESULT }; > >> + > >> +typedef union ovs_u128 { > >> + u32 ufid32[4]; > >> + u64 ufid64[2]; > >> +} ovs_u128; > >> + > >> +struct dpif_flow_put { > >> + int flags; > >> + void *key_ptr; > >> + size_t key_len; > >> + void *mask_ptr; > >> + size_t mask_len; > >> + u64 action_ptr; > >> + size_t action_len; > >> + void *ufid_ptr; > >> +}; > >> + > >> +struct udpif_key { > >> + void *cmap_node; > >> + void *key_ptr; > >> + size_t key_len; > >> + void *mask_ptr; > >> + size_t mask_len; > >> + ovs_u128 ufid; > >> +}; > > > > Probably Eelco will comment more on this, but there are efforts going on > > to reduce the duplicated struct definitions using pahole. > Yes, this is an item we need to solve before we can merge both my patch, > https://patchwork.ozlabs.org/project/openvswitch/list/?series=319134, and > this one. > > Currently, with pahole I can get the structures we need from a binary that > has the debug symbols included. However, there are still some problems with > getting it from the .debug data. > > Adrian, do you know if any progress was made with this part? > > >> +struct event_t { > >> + u64 ts; > >> + u32 reason; > >> + u32 ufid[4]; /* Can't seem to make the ovs_u128 pass to python side. > >> */ + u64 key_size; > >> + unsigned char key[MAX_KEY]; > >> + enum probe probe; > >> +}; > >> + > +BPF_HASH(watchlist, ovs_u128); > >> +BPF_RINGBUF_OUTPUT(events, <BUFFER_PAGE_COUNT>); > >> + > >> +int usdt__flow_result(struct pt_regs *ctx) { > >> + u64 *ufid_present = NULL; > >> + struct udpif_key ukey; > >> + > >> + bpf_usdt_readarg_p(3, ctx, &ukey, sizeof ukey); > >> + ovs_u128 ufid = ukey.ufid; > >> + ufid_present = watchlist.lookup(&ufid); > >> + if(FLOW_FILTER && !ufid_present) { > >> + return 0; > >> + } > >> + > >> + struct event_t *event = events.ringbuf_reserve(sizeof(struct > >> event_t)); + if(!event) { > >> + /* If we can't reserve the space in the ring buffer, return 1. > >> */ > >> + return 1; > >> + } > >> + > >> + event->probe = FLOW_RESULT; > >> + event->ts = bpf_ktime_get_ns(); > >> + bpf_probe_read(&event->ufid, sizeof ufid, &ufid); > >> + bpf_usdt_readarg(1, ctx, &event->reason); > >> + events.ringbuf_submit(event, 0); > >> + > >> + return 0; > >> +}; > >> + > >> + > >> +int usdt__op_flow_put(struct pt_regs *ctx) { > >> + struct dpif_flow_put put; > >> + ovs_u128 ufid; > >> + > >> + struct event_t *event = events.ringbuf_reserve(sizeof(struct > >> event_t)); + if(!event) { > >> + /* If we can't reserve the space in the ring buffer, return 1. > >> */ > >> + return 1; > >> + } > >> + > >> + event->probe = OP_FLOW_PUT; > >> + event->ts = bpf_ktime_get_ns(); > >> + bpf_usdt_readarg_p(2, ctx, &put, sizeof put); > >> + bpf_probe_read(&event->ufid, sizeof event->ufid, put.ufid_ptr); > >> + bpf_probe_read(&ufid, sizeof ufid, &event->ufid); > >> + if (put.key_len > MAX_KEY) { > >> + put.key_len = MAX_KEY; > >> + } > >> + event->key_size = put.key_len; > >> + bpf_probe_read(&event->key, put.key_len, put.key_ptr); > >> + event->reason = 0; > >> + events.ringbuf_submit(event, 0); > >> + > >> + watchlist.increment(ufid); > >> + return 0; > >> +}; > >> +""" > >> + > >> + > >> +# > >> +# format_ufid() > >> +# > >> +def format_ufid(ufid): > >> + return "ufid:{:08x}-{:04x}-{:04x}-{:04x}-{:04x}{:08x}".format( > >> + ufid[0], ufid[1] >> 16, ufid[1] & 0xffff, > >> + ufid[2] >> 16, ufid[2] & 0, ufid[3]) > >> + > >> + > >> +# > >> +# find_and_delete_from_watchlist() > >> +# > >> +def find_and_delete_from_watchlist(event): > >> + for k, _ in b['watchlist'].items(): > >> + key_ufid = struct.unpack("=IIII", k) > >> + if key_ufid == tuple(event.ufid): > >> + key = (b['watchlist'].Key * 1)(k) > >> + b['watchlist'].items_delete_batch(key) > >> + break > >> + > >> + > >> +# > >> +# handle_flow_put() > >> +# > >> +def handle_flow_put(event): > >> + if args.flow_keys or args.filter_flows is not None: > >> + key = decode_key(bytes(event.key)[:event.key_size]) > >> + flow_dict, flow_str = parse_flow_dict(key) > >> + # For each attribute that we're watching. > >> + if args.filter_flows is not None: > >> + if not compare_dicts(args.filter_flows, flow_dict): > >> + find_and_delete_from_watchlist(event) > >> + return > >> + > >> + print("{:<18.9f} {:<45} {:<13}".format(event.ts / 1000000000, > >> + format_ufid(event.ufid), "Insert (put) flow to kernel.")) > >> + > >> + if args.flow_keys: > >> + if len(flow_str) > 80: > >> + flow_str = " " + "),\n ".join(flow_str.split("), ")) > >> + else: > >> + flow_str = " " + flow_str > >> + print(" - It holds the following flow information:") > >> + print(flow_str) > >> + > >> + > >> +# > >> +# compare_dicts() > >> +# > > >> +def compare_dicts(target, flow): > Can we have a more specific name for this function? > > >> + for key in target: > >> + if key not in flow: > >> + return False > >> + elif target[key] is True: > >> + continue > >> + elif target[key] == flow[key]: > >> + continue > >> + elif isinstance(target[key], dict) and isinstance(flow[key], > >> dict): + return compare_dicts(target[key], flow[key]) > >> + else: > >> + return False > >> + return True > >> + > >> + > >> +# > >> +# parse_flow_str() > >> +# > >> +def parse_flow_str(flow_str): > >> + f_list = [i.removeprefix(',').strip() for i in flow_str.split(')')] > >> + if f_list[-1] == '': > >> + f_list = f_list[:-1] > >> + flow_dict = {} > >> + for e in f_list: > >> + split_list = e.split('(') > >> + k = split_list[0] > >> + if len(split_list) == 1: > >> + flow_dict[k] = True > >> + elif split_list[1].count('=') == 0: > >> + flow_dict[k] = split_list[1] > >> + else: > >> + sub_dict = {} > >> + sublist = [i.strip() for i in split_list[1].split(',')] > >> + for subkey in sublist: > >> + brk = subkey.find('=') > >> + sub_dict[subkey[:brk]] = subkey[brk + 1:] > >> + flow_dict[k] = sub_dict > >> + return flow_dict > >> + > >> + > >> +# > >> +# print_expiration() > >> +# > >> +def print_expiration(event): > >> + reasons = ["Unknown flow expiration reason!", "Flow timed out", > >> + "Flow revalidation too expensive", > >> + "Flow needs narrower wildcard mask", > >> + "Bad ODP flow fit", "Flow with associated ofproto", > >> + "Flow translation error", "Flow cache avoidance"] > >> + > >> + ufid_str = format_ufid(event.ufid) > >> + reason = event.reason > >> + > >> + if reason not in range(0, len(reasons) - 1): > >> + reason = 0 > >> + print("{:<18.9f} {:<45} {:<17}". > >> + format(event.ts / 1000000000, ufid_str, reasons[reason])) > >> + > >> + > >> +# > >> +# decode_key() > >> +# > >> +def decode_key(msg): > >> + print(msg) > > > > This print adds very little value (unless you're very fluent reading > > hexadecimal netlink data :-D) and fills up the terminal when there are a > > lot of upcalls (since this is previous to filtering). I woulr remove it.> Whoops, missed this when I was cleaning up. > >> + bytes_left = len(msg) > >> + result = {} > >> + while bytes_left: > >> + if bytes_left < 4: > >> + break > >> + nla_len, nla_type = struct.unpack("=HH", msg[:4]) > >> + if nla_len < 4: > >> + break > >> + nla_data = msg[4:nla_len] > >> + trunc = False > >> + if nla_len > bytes_left: > >> + trunc = True > >> + nla_data = nla_data[:(bytes_left - 4)] > >> + else: > >> + result[get_ovs_key_attr_str(nla_type)] = nla_data > >> + if trunc: > >> + break > >> + next_offset = (nla_len + 3) & (~3) > >> + msg = msg[next_offset:] > >> + bytes_left -= next_offset > >> + return result > >> + > >> + > >> +# > >> +# get_ovs_key_attr_str() > >> +# > >> +def get_ovs_key_attr_str(attr): > >> + ovs_key_attr = ["OVS_KEY_ATTR_UNSPEC", > >> + "encap", > >> + "skb_priority", > >> + "in_port", > >> + "eth", > >> + "vlan", > >> + "eth_type", > >> + "ipv4", > >> + "ipv6", > >> + "tcp", > >> + "udp", > >> + "icmp", > >> + "icmpv6", > >> + "arp", > >> + "nd", > >> + "skb_mark", > >> + "tunnel", > >> + "sctp", > >> + "tcp_flags", > >> + "dp_hash", > >> + "recirc_id", > >> + "mpls", > >> + "ct_state", > >> + "ct_zone", > >> + "ct_mark", > >> + "ct_label", > >> + "ct_tuple4", > >> + "ct_tuple6", > >> + "nsh"] > >> + > >> + if attr < 0 or attr > len(ovs_key_attr): > >> + return "<UNKNOWN>: {}".format(attr) > >> + > >> + return ovs_key_attr[attr] > >> + > >> + > >> +# > >> +# is_nonzero() > >> +# > >> +def is_nonzero(val): > >> + if isinstance(val, int): > >> + return (val != 0) > >> + > >> + if isinstance(val, str): > >> + val = bytes(val, 'utf-8') > >> + > >> + # If it's not a string or an int, it's bytes. > >> + return (val.count(0) < len(val)) > >> + > >> + > >> +# > >> +# parse_flow_dict() > >> +# > +def parse_flow_dict(key_dict): > >> + ret_str = "" > >> + parseable = {} > >> + skip = ["nsh", "tunnel", "mpls", "vlan"] > >> + need_byte_swap = ["ct_label"] > >> + ipv4addrs = ["ct_tuple4", "tunnel", "ipv4", "arp"] > >> + ipv6addrs = ["ipv6", "nd", "ct_tuple6"] > >> + macs = {"eth": [0, 1], "arp": [3, 4], "nd": [1, 2]} > >> + fields = [("OVS_KEY_ATTR_UNSPEC"), > >> + ("encap", ), > >> + ("skb_priority", "<I"), > >> + ("in_port", "<I"), > >> + ("eth", "!6s6s", "src", "dst"), > >> + ("vlan", ), > >> + ("eth_type", "!H"), > >> + ("ipv4", "!4s4s4B", "src", "dst", "proto", "tos", "ttl", > >> "frag"), + ("ipv6", "!16s16s4s4B", "src", "dst", > >> + "label", "proto", "tclass", "hlimit", "frag"), > >> + ("tcp", "!2H", "src", "dst"), > >> + ("udp", "!2H", "src", "dst"), > >> + ("icmp", "!2B", "type", "code"), > >> + ("icmpv6", "!2B", "type", "code"), > >> + ("arp", "!4s4sH6s6s", "sip", "tip", "op", "sha", "tha"), > >> + ("nd", "!16s6s6s", "target", "sll", "tll"), > >> + ("skb_mark", "<I"), > >> + ("tunnel", ), > >> + ("sctp", "!2H", "src", "dst"), > >> + ("tcp_flags", "!H"), > >> + ("dp_hash", "<I"), > >> + ("recirc_id", "<I"), > >> + ("mpls", ), > >> + ("ct_state", "<I"), > >> + ("ct_zone", "<H"), > >> + ("ct_mark", "<I"), > >> + ("ct_label", "!16s"), > >> + ("ct_tuple4", > >> + "!4s4s2HB", "src", "dst", "tp_src", "tp_dst", "proto"), > >> + ("ct_tuple6", > >> + "!16s16sB2H", "src", "dst", "proto", "tp_src", "tp_dst"), > >> + ("nsh", )] > >> + for k, v in key_dict.items(): > >> + if k in skip: > >> + continue > >> + if int.from_bytes(v, "big") == 0: > >> + parseable[k] = '0' > >> + continue > >> + if k in need_byte_swap: > >> + v = int.from_bytes(v, 'little').to_bytes(len(v), 'big') > >> + attr = -1 > >> + for f in fields: > >> + if k == f[0]: > >> + attr = fields.index(f) > >> + break > >> + if len(fields[attr]) > 1: > >> + data = list(struct.unpack(fields[attr][1], > >> + v[:struct.calcsize(fields[attr][1])])) > >> + if k in ipv4addrs: > >> + if data[0].count(0) < 4: > >> + data[0] = str(ipaddress.IPv4Address(data[0])) > >> + else: > >> + data[0] = b"\x00" > >> + if data[1].count(0) < 4: > >> + data[1] = str(ipaddress.IPv4Address(data[1])) > >> + else: > >> + data[1] = b"\x00" > >> + if k in ipv6addrs: > >> + if data[0].count(0) < 16: > >> + data[0] = str(ipaddress.IPv6Address(data[0])) > >> + else: > >> + data[0] = b"\x00" > >> + if k != "nsh" and data[1].count(0) < 16: > >> + data[1] = str(ipaddress.IPv6Address(data[1])) > >> + elif k != "nsh": > >> + data[1] = b"\x00" > >> + if k in macs.keys(): > >> + for e in macs[k]: > >> + if data[e].count(0) == 6: > >> + mac_str = b"\x00" > >> + else: > >> + mac_str = ':'.join(["%02x" % i for i in > >> data[e]]) > >> + data[e] = mac_str > >> + if len(fields[attr]) > 2: > >> + field_dict = {field: d for field, d in zip(fields[attr][2:], > >> data)} + s = ", ".join(k + "=" + str(v) for k, v in > >> field_dict.items())> > >> + elif k != 'eth_type': > > Nit: There seems to be a bit of an unconsistent use of single vs double > > quotes on strings. I would stick with double quotes everywhere. > +1 > > >> + s = str(data[0]) > >> + field_dict = s > >> + else: > >> + s = hex(data[0]) > >> + field_dict = s > >> + ret_str += k + "(" + s + "), " > >> + parseable[k] = field_dict > >> + ret_str = ret_str[:-2] > >> + return (parseable, ret_str) > >> + > >> + > >> +# > >> +# handle_event() > >> +# > >> +def handle_event(ctx, data, size): > >> + # Once we grab the event, we have three cases. > >> + # 1. It's a revalidator probe and the reason is nonzero: A flow is > >> expiring + # 2. It's a revalidator probe and the reason is zero: flow > >> revalidated + # 3. It's a flow_put probe. > >> + # > >> + # We will ignore case 2, and report all others. > >> + # > >> + event = b["events"].event(data) > >> + if event.probe == 0: # OP_FLOW_PUT > >> + handle_flow_put(event) > >> + elif event.probe == 1 and event.reason > 0: # FLOW_RESULT > >> + print_expiration(event) > >> + > >> + > >> +def main(): > >> + # > >> + # Don't like these globals, but ctx passing does not work with the > >> existing + # open_ring_buffer() API :( > >> + # > >> + global b > >> + global args > >> + > >> + # > >> + # Argument parsing > >> + # > >> + parser = argparse.ArgumentParser() > >> + parser.add_argument("--buffer-page-count", > >> + help="Number of BPF ring buffer pages, default > >> 1024", + type=int, default=1024, > >> metavar="NUMBER") + parser.add_argument("-k", "--flow-keys", > >> + help="Print flow keys as flow strings", > >> + action="store_true") > >> + parser.add_argument("--pid", "-p", metavar="VSWITCHD_PID", > > Guess we should do the short name first, followed by the long name. > > >> + help="ovs-vswitchd's PID", type=int, > >> default=None) > >> + parser.add_argument("-D", "--debug", help="Enable eBPF debugging", > >> + type=int, const=0x3f, default=0, nargs="?") > >> + parser.add_argument("-f", "--filter-flows", metavar="FLOW_STRING", > >> + help="Filter flows that match the specified > >> flow", > >> + type=str, default=None, nargs="*") > >> + args = parser.parse_args() > >> + > >> + # > >> + # Find the PID of the ovs-vswitchd daemon if not specified. > >> + # > >> + if args.pid is None: > >> + for proc in psutil.process_iter(): > >> + if 'ovs-vswitchd' in proc.name(): > >> + if args.pid is not None: > >> + print("Error: Multiple ovs-vswitchd daemons running, > >> " > >> + "use the -p option!") > >> + sys.exit(-1) > >> + > >> + args.pid = proc.pid > >> + # > >> + # Error checking on input parameters > >> + # > >> + if args.pid is None: > >> + print("ERROR: Failed to find ovs-vswitchd's PID!") > >> + sys.exit(-1) > >> + > >> + # > >> + # Attach the USDT probes > >> + # > >> + u = USDT(pid=int(args.pid)) > >> + try: > >> + u.enable_probe(probe="op_flow_put", fn_name="usdt__op_flow_put") > >> + except USDTException as e: > >> + print("Error attaching the dpif_netlink_operate__:op_flow_put > >> probe.") + print(str(e)) > >> + sys.exit(-1) > >> + > >> + try: > >> + u.enable_probe(probe="flow_result", fn_name="usdt__flow_result") > >> + except USDTException as e: > >> + print("Error attaching the revalidate:flow_result probe.") > >> + print(str(e)) > >> + sys.exit(-1) > >> + > >> + # > >> + # Attach the probes to the running process > >> + # > >> + if args.filter_flows is None: > >> + filter_bool = 0 > >> + else: > >> + filter_bool = 1 > >> + args.filter_flows = parse_flow_str(args.filter_flows[0]) > >> + > >> + source = bpf_src.replace("<BUFFER_PAGE_COUNT>", > >> + str(args.buffer_page_count)) > >> + source = source.replace("<FILTER_BOOL>", str(filter_bool)) > >> + > >> + b = BPF(text=source, usdt_contexts=[u], debug=args.debug) > >> + > >> + # > >> + # Print header > >> + # > >> + print("{:<18} {:<45} {:<17}".format("TIME", "UFID", "EVENT/REASON")) > >> + > >> + # > >> + # Dump out all events. > >> + # > >> + b["events"].open_ring_buffer(handle_event) > >> + while 1: > >> + try: > >> + b.ring_buffer_poll() > >> + time.sleep(0.5) > >> + except KeyboardInterrupt: > >> + break > >> + > >> + > >> +# > >> +# Start main() as the default entry point > >> +# > >> +if __name__ == "__main__": > >> + main() > > > > _______________________________________________ > > dev mailing list > > [email protected] > > https://mail.openvswitch.org/mailman/listinfo/ovs-dev _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
