Eelco Chaudron <[email protected]> writes: > On 27 May 2021, at 21:15, Aaron Conole wrote: > > This makes openvswitch module use the event tracing framework > to log the upcall interface and action execution pipeline. When > using openvswitch as the packet forwarding engine, some types of > debugging are made possible simply by using the ovs-vswitchd's > ofproto/trace command. However, such a command has some > limitations: > > 1. When trying to trace packets that go through the CT action, > the state of the packet can't be determined, and probably > would be potentially wrong. > > 2. Deducing problem packets can sometimes be difficult as well > even if many of the flows are known > > 3. It's possible to use the openvswitch module even without > the ovs-vswitchd (although, not common use). > > Introduce the event tracing points here to make it possible for > working through these problems in kernel space. The style is > copied from the mac80211 driver-trace / trace code for > consistency. > > Thanks for doing this Aaron, it will definitely help when trying to debug > some customer issues.
Thanks for the review! > Just to be sure, I did some tests to make sure these changes do not impact > performance, and it looks fine! See some > small nits/comments below, but other than that I would say please re-send as > an official patch. > > Cheers, > > Eelco > > Signed-off-by: Aaron Conole <[email protected]> > --- > net/openvswitch/Makefile | 3 + > net/openvswitch/actions.c | 4 + > net/openvswitch/datapath.c | 7 ++ > net/openvswitch/openvswitch_trace.c | 10 ++ > net/openvswitch/openvswitch_trace.h | 152 ++++++++++++++++++++++++++++ > 5 files changed, 176 insertions(+) > create mode 100644 net/openvswitch/openvswitch_trace.c > create mode 100644 net/openvswitch/openvswitch_trace.h > > diff --git a/net/openvswitch/Makefile b/net/openvswitch/Makefile > index 41109c326f3a..28982630bef3 100644 > --- a/net/openvswitch/Makefile > +++ b/net/openvswitch/Makefile > @@ -13,6 +13,7 @@ openvswitch-y := \ > flow_netlink.o \ > flow_table.o \ > meter.o \ > + openvswitch_trace.o \ > vport.o \ > vport-internal_dev.o \ > vport-netdev.o > @@ -24,3 +25,5 @@ endif > obj-$(CONFIG_OPENVSWITCH_VXLAN)+= vport-vxlan.o > obj-$(CONFIG_OPENVSWITCH_GENEVE)+= vport-geneve.o > obj-$(CONFIG_OPENVSWITCH_GRE) += vport-gre.o > + > +CFLAGS_openvswitch_trace.o = -I$(src) > diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c > index d858ea580e43..62285453ca79 100644 > --- a/net/openvswitch/actions.c > +++ b/net/openvswitch/actions.c > @@ -30,6 +30,7 @@ > #include "conntrack.h" > #include "vport.h" > #include "flow_netlink.h" > +#include "openvswitch_trace.h" > > struct deferred_action { > struct sk_buff *skb; > @@ -1242,6 +1243,9 @@ static int do_execute_actions(struct datapath *dp, > struct sk_buff *skb, > a = nla_next(a, &rem)) { > int err = 0; > > + if (trace_openvswitch_probe_action_enabled()) > + trace_openvswitch_probe_action(dp, skb, key, a, rem); > + > switch (nla_type(a)) { > case OVS_ACTION_ATTR_OUTPUT: { > int port = nla_get_u32(a); > diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c > index 9d6ef6cb9b26..63f19a6ed472 100644 > --- a/net/openvswitch/datapath.c > +++ b/net/openvswitch/datapath.c > @@ -43,6 +43,7 @@ > #include "flow_table.h" > #include "flow_netlink.h" > #include "meter.h" > +#include "openvswitch_trace.h" > #include "vport-internal_dev.h" > #include "vport-netdev.h" > > @@ -275,6 +276,12 @@ int ovs_dp_upcall(struct datapath *dp, struct sk_buff > *skb, > struct dp_stats_percpu *stats; > int err; > > + if (trace_openvswitch_probe_userspace_enabled()) { > + struct sw_flow_key ukey = *key; > + > + trace_openvswitch_probe_userspace(dp, skb, &ukey, upcall_info); > + } > > Rather than work around the const here, can we not fix this in the trace > include? Done. > TP_PROTO(struct datapath *dp, struct sk_buff *skb, > > * struct sw_flow_key *key, > > * const struct sw_flow_key *key, > const struct dp_upcall_info *upcall_info), > > @@ -97,7 +97,7 @@ TRACE_EVENT(openvswitch_probe_userspace, > __field( u16, gso_type ) > __field( u32, ovs_flow_hash ) > __field( u32, recirc_id ) > > * __field( void *, keyaddr ) > > * __field( const void *, keyaddr ) > __field( u16, key_eth_type ) > > + > if (upcall_info->portid == 0) { > err = -ENOTCONN; > goto err; > diff --git a/net/openvswitch/openvswitch_trace.c > b/net/openvswitch/openvswitch_trace.c > new file mode 100644 > index 000000000000..62c5f7d6f023 > --- /dev/null > +++ b/net/openvswitch/openvswitch_trace.c > @@ -0,0 +1,10 @@ > +// SPDX-License-Identifier: GPL-2.0 > +/* bug in tracepoint.h, it should include this */ > +#include <linux/module.h> > + > +/* sparse isn't too happy with all macros... */ > +#ifndef __CHECKER__ > +#define CREATE_TRACE_POINTS > +#include "openvswitch_trace.h" > + > +#endif > diff --git a/net/openvswitch/openvswitch_trace.h > b/net/openvswitch/openvswitch_trace.h > new file mode 100644 > index 000000000000..1b350306f622 > --- /dev/null > +++ b/net/openvswitch/openvswitch_trace.h > @@ -0,0 +1,152 @@ > +/* SPDX-License-Identifier: GPL-2.0 */ > +#undef TRACE_SYSTEM > +#define TRACE_SYSTEM openvswitch > + > +#if !defined(_TRACE_OPENVSWITCH_H) || defined(TRACE_HEADER_MULTI_READ) > +#define _TRACE_OPENVSWITCH_H > + > +#include <linux/tracepoint.h> > + > +#include "datapath.h" > + > > I guess the naming of the events can be a long debate :) But I think just > adding the word “probe” doesn’t help much. > > Maybe for just for function entry/exit, we could do something like > <subsys>_<function_name>_entry/exit. I’ve seen > this used in other components. For example: > > openvswitch_probe_userspace -> openvswitch_ovs_dp_upcall_entry > > For none entry/exit trace macros, we could either use a specific name, as we > have: > > openvswitch_probe_action -> openvswitch_execute_action > > Or maybe even prepended by the function name to easily locate it. For example: > > openvswitch_probe_action -> openvswitch_do_execute_actions__execute_action > > I like the double underscore to differentiate, so it might even be nice for > the __entry, so > openvswitch_ovs_dp_upcall__entry. > > Maybe we should also change openvswitch to ovs for both system and events? Or > is this too short and might cause > confusion? I agree that we can make these more descriptive, and that 'probe' doesn't add much. OTOH, if I use "openvswitch_do_execute_actions__execute_action" or the corresponding upcall trace point name, the line lengths balloon. I don't see that it's as useful to say both 'do_execute_actions' and '__execute_action' - I don't think there will ever be a reason to add an additional tracepoint here since the bulk of processing happens during the action loop. But maybe something like: 'openvswitch_execute_action' and 'openvswitch_dp_upcall' ? Just want to get this part right. > +TRACE_EVENT(openvswitch_probe_action, > + > + TP_PROTO(struct datapath *dp, struct sk_buff *skb, > + struct sw_flow_key *key, const struct nlattr *a, int rem), > + > + TP_ARGS(dp, skb, key, a, rem), > + > + TP_STRUCT__entry( > + __field( void *, dpaddr ) > + __string( dp_name, ovs_dp_name(dp) ) > + __string( dev_name, skb->dev->name ) > + __field( void *, skbaddr ) > + __field( unsigned int, len ) > + __field( unsigned int, data_len ) > + __field( unsigned int, truesize ) > + __field( u8, nr_frags ) > + __field( u16, gso_size ) > + __field( u16, gso_type ) > + __field( u32, ovs_flow_hash ) > + __field( u32, recirc_id ) > + __field( void *, keyaddr ) > + __field( u16, key_eth_type ) > + __field( u8, key_ct_state ) > + __field( u8, key_ct_orig_proto ) > + __field( unsigned int, flow_key_valid ) > + __field( u8, action_type ) > + __field( unsigned int, action_len ) > + __field( void *, action_data ) > + __field( u8, is_last ) > + ), > + > + TP_fast_assign( > + __entry->dpaddr = dp; > + __assign_str(dp_name, ovs_dp_name(dp)); > + __assign_str(dev_name, skb->dev->name); > + __entry->skbaddr = skb; > + __entry->len = skb->len; > + __entry->data_len = skb->data_len; > + __entry->truesize = skb->truesize; > + __entry->nr_frags = skb_shinfo(skb)->nr_frags; > + __entry->gso_size = skb_shinfo(skb)->gso_size; > + __entry->gso_type = skb_shinfo(skb)->gso_type; > + __entry->ovs_flow_hash = key->ovs_flow_hash; > + __entry->recirc_id = key->recirc_id; > + __entry->keyaddr = key; > + __entry->key_eth_type = key->eth.type; > + __entry->key_ct_state = key->ct_state; > + __entry->key_ct_orig_proto = key->ct_orig_proto; > + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); > + __entry->action_type = nla_type(a); > + __entry->action_len = nla_len(a); > + __entry->action_data = nla_data(a); > + __entry->is_last = nla_is_last(a, rem); > + ), > + > + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u > truesize=%u nr_frags=%d > gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p > eth_type=0x%04x > ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d action_type=%u > action_len=%u action_data=%p > is_last=%d", > > I guess %p will be good enough here, as I see no other trace modules use %px. > But I guess, for some of the values > %*ph might be useful, as it will dump the bytes, i.e., action_data. The issue with using %*ph for action_data is in the cases where a netlink attribute doesn't have data. In that case, we don't have any bytes to print, and I think the behavior would be undefined (see ct_clear action, or pop_eth action, etc). > + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), > + __entry->skbaddr, __entry->len, __entry->data_len, > + __entry->truesize, __entry->nr_frags, __entry->gso_size, > + __entry->gso_type, __entry->ovs_flow_hash, > + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, > + __entry->key_ct_state, __entry->key_ct_orig_proto, > + __entry->flow_key_valid, > + __entry->action_type, __entry->action_len, > + __entry->action_data, __entry->is_last) > +); > + > +TRACE_EVENT(openvswitch_probe_userspace, > + > + TP_PROTO(struct datapath *dp, struct sk_buff *skb, > + struct sw_flow_key *key, > + const struct dp_upcall_info *upcall_info), > + > + TP_ARGS(dp, skb, key, upcall_info), > + > + TP_STRUCT__entry( > + __field( void *, dpaddr ) > + __string( dp_name, ovs_dp_name(dp) ) > + __string( dev_name, skb->dev->name ) > + __field( void *, skbaddr ) > + __field( unsigned int, len ) > + __field( unsigned int, data_len ) > + __field( unsigned int, truesize ) > + __field( u8, nr_frags ) > + __field( u16, gso_size ) > + __field( u16, gso_type ) > + __field( u32, ovs_flow_hash ) > + __field( u32, recirc_id ) > + __field( void *, keyaddr ) > + __field( u16, key_eth_type ) > + __field( u8, key_ct_state ) > + __field( u8, key_ct_orig_proto ) > + __field( unsigned int, flow_key_valid ) > + __field( u8, upcall_cmd ) > + __field( u32, upcall_port ) > + __field( u16, upcall_mru ) > + ), > + > + TP_fast_assign( > + __entry->dpaddr = dp; > + __assign_str(dp_name, ovs_dp_name(dp)); > + __assign_str(dev_name, skb->dev->name); > + __entry->skbaddr = skb; > + __entry->len = skb->len; > + __entry->data_len = skb->data_len; > + __entry->truesize = skb->truesize; > + __entry->nr_frags = skb_shinfo(skb)->nr_frags; > + __entry->gso_size = skb_shinfo(skb)->gso_size; > + __entry->gso_type = skb_shinfo(skb)->gso_type; > + __entry->ovs_flow_hash = key->ovs_flow_hash; > + __entry->recirc_id = key->recirc_id; > + __entry->keyaddr = key; > + __entry->key_eth_type = key->eth.type; > + __entry->key_ct_state = key->ct_state; > + __entry->key_ct_orig_proto = key->ct_orig_proto; > + __entry->flow_key_valid = !(key->mac_proto & SW_FLOW_KEY_INVALID); > + __entry->upcall_cmd = upcall_info->cmd; > + __entry->upcall_port = upcall_info->portid; > + __entry->upcall_mru = upcall_info->mru; > + ), > + > + TP_printk("dpaddr=%p dp_name=%s dev=%s skbaddr=%p len=%u data_len=%u > truesize=%u nr_frags=%d > gso_size=%d gso_type=%#x ovs_flow_hash=0x%08x recirc_id=0x%08x keyaddr=%p > eth_type=0x%04x > ct_state=%02x ct_orig_proto=%02x flow_key_valid=%d upcall_cmd=%u > upcall_port=%u upcall_mru=%u", > > See %p comment above. Same applies. > + __entry->dpaddr, __get_str(dp_name), __get_str(dev_name), > + __entry->skbaddr, __entry->len, __entry->data_len, > + __entry->truesize, __entry->nr_frags, __entry->gso_size, > + __entry->gso_type, __entry->ovs_flow_hash, > + __entry->recirc_id, __entry->keyaddr, __entry->key_eth_type, > + __entry->key_ct_state, __entry->key_ct_orig_proto, > + __entry->flow_key_valid, > + __entry->upcall_cmd, __entry->upcall_port, > + __entry->upcall_mru) > +); > + > +#endif /* _TRACE_OPENVSWITCH_H */ > + > +/* This part must be outside protection */ > +#undef TRACE_INCLUDE_PATH > +#define TRACE_INCLUDE_PATH . > +#undef TRACE_INCLUDE_FILE > +#define TRACE_INCLUDE_FILE openvswitch_trace > +#include <trace/define_trace.h> > -- > 2.31.1 _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
