On 20 Mar 2023, at 6:44, Chris Mi wrote:
> On 3/16/2023 5:09 PM, Eelco Chaudron wrote: >> On 1 Mar 2023, at 8:22, Chris Mi wrote: >> >>> When offloading sample action to TC, userspace creates a unique ID >>> to map sFlow action and tunnel info and passes this ID to kernel >>> instead of the sFlow info. Psample will send this ID and sampled >>> packet to userspace. Using the ID, userspace can recover the sFlow >>> info and send sampled packet to the right sFlow monitoring host. >> See some comments inline below. >> >> //Eelco >> >>> Signed-off-by: Chris Mi<c...@nvidia.com> >>> Reviewed-by: Roi Dayan<r...@nvidia.com> >>> --- >>> lib/netdev-offload-tc.c | 233 +++++++++++++++++++++++++++++++++++++++- >>> lib/tc.h | 1 + >>> 2 files changed, 232 insertions(+), 2 deletions(-) >>> >>> diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c >>> index 4fb9d9f21..4214337d8 100644 >>> --- a/lib/netdev-offload-tc.c >>> +++ b/lib/netdev-offload-tc.c >>> @@ -41,6 +41,7 @@ >>> #include "unaligned.h" >>> #include "util.h" >>> #include "dpif-provider.h" >>> +#include "cmap.h" >>> >>> VLOG_DEFINE_THIS_MODULE(netdev_offload_tc); >>> >>> @@ -103,6 +104,226 @@ static void parse_tc_flower_to_stats(struct tc_flower >>> *flower, >>> static int get_ufid_adjust_stats(const ovs_u128 *ufid, >>> struct dpif_flow_stats *stats); >>> >>> +/* When offloading sample action to TC, userspace creates a unique ID >>> + * to map sFlow action and tunnel info and passes this ID to kernel >>> + * instead of the sFlow info. Psample will send this ID and sampled >>> + * packet to userspace. Using the ID, userspace can recover the sFlow >>> + * info and send sampled packet to the right sFlow monitoring host. >>> + */ >>> +struct offload_sflow { >> Should we maybe already give this a more general name, like offload_sample? >> This as you need the same structure later when you add support of offload >> sent to a controller? > OK. >> We might need an offload_type field. > OK. >> >>> + struct nlattr *action; /* SFlow action. Used in flow_get. */ >>> + struct nlattr *userdata; /* Struct user_action_cookie. */ >>> + struct nlattr *actions; /* All actions to get output tunnel. */ >> You should not need actions at all, see comments later. These are the next >> actions, which are not of interest to sflow offload. > As the comment describes, we need it to get the output tunnel. Otherwise, > we'll lose the following info using sflowtool after offloading: > > extendedType out_VNI > out_VNI 4 > flowBlock_tag 0:1023 > flowSampleType tunnel_ipv4_out_IPV4 > tunnel_ipv4_out_sampledPacketSize 0 > tunnel_ipv4_out_IPSize 0 > tunnel_ipv4_out_srcIP 0.0.0.0 > tunnel_ipv4_out_dstIP 192.168.1.66 > tunnel_ipv4_out_IPProtocol 17 > tunnel_ipv4_out_IPTOS 0 > tunnel_ipv4_out_UDPSrcPort 0 > tunnel_ipv4_out_UDPDstPort 46354 > > I don't think we can get the output tunnel info without mapping 'actions'. I’m confused as from the code it looks like actions are the outer actions, and your actions should come from the inner actions, i.e., they should be part of action. Maybe I’m not understanding this correctly, and as this is a more complex case, we should have a test case fore it. If you can share the test case before the next revision of the patchset, I can revisit it to safe another version… >> >>> + struct flow_tnl *tunnel; /* Input tunnel. */ >>> + ovs_u128 ufid; /* Flow ufid. */ >>> +}; >>> + >>> +/* This maps a psample group ID to struct offload_sflow. */ >>> +struct sgid_node { >>> + struct cmap_node metadata_node; >>> + struct cmap_node id_node; >>> + struct ovs_refcount refcount; >>> + uint32_t hash; >>> + uint32_t id; >>> + struct offload_sflow sflow; >>> +}; >>> + >>> +static struct ovs_mutex sgid_lock = OVS_MUTEX_INITIALIZER; >>> +static struct cmap sgid_map = CMAP_INITIALIZER; >>> +static struct cmap sgid_metadata_map = CMAP_INITIALIZER; >>> +static struct id_pool *sample_group_ids OVS_GUARDED_BY(sgid_lock); >>> + >>> +static void >>> +sgid_node_free(struct sgid_node *node) >>> +{ >>> + if (node) { >>> + if (node->sflow.tunnel) { >> This additional null check (and below) is not needed as free() will do this >> internally. > OK. >> >>> + free(node->sflow.tunnel); >>> + } >>> + if (node->sflow.action) { >>> + free(node->sflow.action); >>> + } >>> + if (node->sflow.actions) { >>> + free(node->sflow.actions); >>> + } >>> + if (node->sflow.userdata) { >>> + free(node->sflow.userdata); >>> + } >>> + free(node); >>> + } >>> +} >>> + >>> +static struct sgid_node * >>> +sgid_find(uint32_t id) >>> +{ >>> + const struct cmap_node *node = cmap_find(&sgid_map, id); >>> + >>> + return node ? CONTAINER_OF(node, struct sgid_node, id_node) : NULL; >>> +} >>> + >>> +static uint32_t >>> +offload_sflow_hash(const struct offload_sflow *sflow) >>> +{ >>> + return hash_bytes(&sflow->ufid, sizeof sflow->ufid, 0); >>> +} >>> +static bool >>> +offload_sflow_equal(const struct offload_sflow *a, >>> + const struct offload_sflow *b) >>> +{ >>> + if (!ovs_u128_equals(a->ufid, b->ufid)) { >>> + return false; >>> + } >>> + /* Action can't be NULL. */ >>> + if (!a->action || !b->action) { >>> + return false; >>> + } >>> + if (a->action->nla_len != b->action->nla_len >>> + || memcmp(a->action, b->action, a->action->nla_len)) { >>> + return false; >>> + } >>> + /* Actions is used to get output tunnel. It can be NULL. */ >>> + if ((a->actions && !b->actions) || (!a->actions && b->actions)) { >>> + return false; >>> + } >>> + if ((a->actions && b->actions && >>> + a->actions->nla_len != b->actions->nla_len) >>> + || memcmp(a->actions, b->actions, a->actions->nla_len)) { >> In theory, a->actions and b->actions can be NULL in this case, if >> a->actions->nla_len != 0 for some odd reason, it will lead to a crash. > But it is checked already: > > + /* Actions is used to get output tunnel. It can be NULL. */ > + if ((a->actions && !b->actions) || (!a->actions && b->actions)) { > + return false; > + } > Guess I was a bit cryptic. I meant a->actions == NULL && b->actions == NULL. >> >>> + return false; >>> + } >>> + /* Tunnel is used to get input tunnel. It can be NULL. */ >>> + if (!a->tunnel && !b->tunnel) { >>> + return true; >>> + } >>> + if (!a->tunnel || !b->tunnel) { >>> + return false; >>> + } >>> + return !memcmp(a->tunnel, b->tunnel, sizeof *a->tunnel); >>> +} >>> + >>> +static struct sgid_node * >>> +sgid_find_equal(const struct offload_sflow *target, uint32_t hash) >>> +{ >>> + struct sgid_node *node; >>> + >>> + CMAP_FOR_EACH_WITH_HASH (node, metadata_node, hash, >>> &sgid_metadata_map) { >>> + if (offload_sflow_equal(&node->sflow, target)) { >>> + return node; >>> + } >>> + } >>> + return NULL; >>> +} >>> + >>> +static struct sgid_node * >>> +sgid_ref_equal(const struct offload_sflow *target, uint32_t hash) >>> +{ >>> + struct sgid_node *node; >>> + >>> + do { >>> + node = sgid_find_equal(target, hash); >>> + /* Try again if the node was released before we get the reference. >>> */ >>> + } while (node && !ovs_refcount_try_ref_rcu(&node->refcount)); >> It might be too late in the afternoon and need some more coffee, but if the >> node was released, i.e. refcount zero, would it not loop eternally as >> gid_find_equal is not checking the refcount? > Most of the code is copied from ofproto/ofproto-dpif-rid.c. This patch is > simplified greatly since the first review. The ofproto-dpif-rid.c exists in this code, you code does not. > Maybe it is a overkill to reference the rid. How about change this gid > management to use only mutex and id pool? If the sgid is only used for a single flow we do not need the refcount, and it would definitely simplify the code. >> >>> + >>> + return node; >>> +} >>> + >>> +static void >>> +offload_sflow_clone(struct offload_sflow *new, const struct offload_sflow >>> *old) >> See the earlier comment on the naming, if we re-use this for general psample >> (for sent to userspace), we might want to make the naming more general. > OK. >> >>> +{ >>> + new->action = xmemdup(old->action, old->action->nla_len); >>> + new->actions = old->actions >>> + ? xmemdup(old->actions, old->actions->nla_len) >>> + : NULL; >>> + new->userdata = xmemdup(old->userdata, old->userdata->nla_len); >>> + new->tunnel = old->tunnel >>> + ? xmemdup(old->tunnel, sizeof *old->tunnel) >>> + : NULL; >>> + new->ufid = old->ufid; >>> +} >>> + >>> +/* Allocate a unique group id for the given set of flow metadata. The id >>> + * space is 2^^32 - 1. 0 is reserved. >>> + */ >>> +static struct sgid_node * >>> +sgid_alloc(const struct offload_sflow *sflow, uint32_t hash) >>> +{ >>> + struct sgid_node *node = xzalloc(sizeof *node); >>> + bool ret; >>> + >> There is no protection for two threads trying to insert the same sflow >> structure? Once you determined the node either does not exists or has a ref >> count of 0 in sgid_alloc_ctx(). > Will add it. >> >>> + node->hash = hash; >>> + ovs_refcount_init(&node->refcount); >>> + offload_sflow_clone(&node->sflow, sflow); >>> + >>> + ovs_mutex_lock(&sgid_lock); >>> + ret = id_pool_alloc_id(sample_group_ids, &node->id); >>> + if (!ret) { >>> + VLOG_ERR("Can't find a free group ID"); >> I would make this “Can't find a free sample group ID" > OK. >> >>> + ovs_mutex_unlock(&sgid_lock); >>> + sgid_node_free(node); >>> + return NULL; >>> + } >>> + >>> + cmap_insert(&sgid_map, &node->id_node, node->id); >>> + cmap_insert(&sgid_metadata_map, &node->metadata_node, node->hash); >>> + ovs_mutex_unlock(&sgid_lock); >>> + return node; >>> +} >>> + >>> +/* Allocate a unique group id for the given set of flow metadata and >>> + * optional actions. >>> + */ >> So assuming the ufid is part of the offload_sflow data and we only support a >> single sample per flow we will probably not re-use the metadata. So I guess >> this only makes sense if we support multiple offloads per flow in the future >> am I right? > Actually, ufid was introduced just for easy hashing and comparing. I didn't > noticed that with this change, > metadata can't be reused. Without ufid, arp and ip metedata can be reused, > for example: > > recirc_id(0),in_port(enp8s0f0_1),eth(src=02:25:d0:65:01:02,dst=24:25:d0:e1:00:00),eth_type(0x0800),ipv4(tos=0,frag=no), > packets:16, bytes:2330, used:0.110s, > actions:userspace(pid=4294967295,sFlow(vid=0,pcp=0,output=2429),actions),set(tunnel(tun_id=0x4,dst=192.168.1.66,ttl=64,tp_dst=4789,flags(key))),vxlan_sys_4789 > recirc_id(0),in_port(enp8s0f0_1),eth(src=02:25:d0:65:01:02,dst=24:25:d0:e1:00:00),eth_type(0x0806), > packets:0, bytes:0, used:never, > actions:userspace(pid=4294967295,sFlow(vid=0,pcp=0,output=2429),actions),set(tunnel(tun_id=0x4,dst=192.168.1.66,ttl=64,tp_dst=4789,flags(key))),vxlan_sys_4789 > > Maybe we should remove ufid to re-use meterdata. Or maybe keep it. So there > is a 1:1 mapping for sample group id and ufid. > What do you think, Eelco? I’m ok with both designs if you use the 1:1 mapping you can remove the ref counting, if you remove it you need to fix the above refcount comment. >> >>> +static uint32_t >>> +sgid_alloc_ctx(const struct offload_sflow *sflow) >>> +{ >>> + uint32_t hash = offload_sflow_hash(sflow); >>> + struct sgid_node *node = sgid_ref_equal(sflow, hash); >>> + >>> + if (!node) { >>> + node = sgid_alloc(sflow, hash); >>> + } >>> + >>> + return node ? node->id : 0; >>> +} >>> + >>> +static void >>> +sgid_node_unref(struct sgid_node *node) >>> + OVS_EXCLUDED(sgid_lock) >>> +{ >>> + ovs_mutex_lock(&sgid_lock); >>> + if (node && ovs_refcount_unref(&node->refcount) == 1) { >>> + id_pool_free_id(sample_group_ids, node->id); >>> + cmap_remove(&sgid_metadata_map, &node->metadata_node, node->hash); >>> + cmap_remove(&sgid_map, &node->id_node, node->id); >>> + ovsrcu_postpone(sgid_node_free, node); >>> + } >>> + ovs_mutex_unlock(&sgid_lock); >>> +} >>> + >>> +static void >>> +sgid_free(uint32_t id) >>> +{ >>> + struct sgid_node *node; >>> + >>> + if (!id) { >>> + return; >>> + } >>> + >>> + node = sgid_find(id); >>> + if (node) { >>> + sgid_node_unref(node); >>> + } else { >>> + VLOG_ERR("Freeing nonexistent group ID: %"PRIu32, id); >> "Freeing nonexistent sample group ID:" > OK. >> >>> + } >>> +} >>> + >>> +static int >>> +tc_del_flower_filter_and_sgid(struct tcf_id *id) >>> +{ >> I do not think sample_group_id should be part of tcf_id, but it should be >> part of ufid_tc_data, and then it can be cleaned up in the >> del_ufid_tc_mapping_unlocked() function. If sample_groupd_id != 0. > OK. >> >>> + sgid_free(id->sample_group_id); >>> + id->sample_group_id = 0; >>> + return tc_del_flower_filter(id); >>> +} >>> + >>> static bool >>> is_internal_port(const char *type) >>> { >>> @@ -275,7 +496,7 @@ del_filter_and_ufid_mapping(struct tcf_id *id, const >>> ovs_u128 *ufid, >>> } >>> } >>> >>> - err = tc_del_flower_filter(id); >>> + err = tc_del_flower_filter_and_sgid(id); >>> if (!err) { >>> del_ufid_tc_mapping(ufid); >>> } >>> @@ -545,7 +766,7 @@ netdev_tc_flow_flush(struct netdev *netdev) >>> continue; >>> } >>> >>> - err = tc_del_flower_filter(&data->id); >>> + err = tc_del_flower_filter_and_sgid(&data->id); >>> if (!err) { >>> del_ufid_tc_mapping_unlocked(&data->ufid); >>> } >>> @@ -2118,6 +2339,11 @@ netdev_tc_parse_nl_actions(struct netdev *netdev, >>> struct tc_flower *flower, >>> if (err) { >>> return err; >>> } >>> + } else if (nl_attr_type(nla) == OVS_ACTION_ATTR_SAMPLE) { >>> + struct offload_sflow sflow; >>> + >>> + memset(&sflow, 0, sizeof sflow); >>> + sgid_alloc_ctx(&sflow); >>> } else { >>> VLOG_DBG_RL(&rl, "unsupported put action type: %d", >>> nl_attr_type(nla)); >>> @@ -2853,6 +3079,9 @@ netdev_tc_init_flow_api(struct netdev *netdev) >>> tc_cleanup_policer_actions(meter_police_ids, >>> METER_POLICE_IDS_BASE, >>> METER_POLICE_IDS_MAX); >>> ovs_mutex_unlock(&meter_police_ids_mutex); >>> + ovs_mutex_lock(&sgid_lock); >>> + sample_group_ids = id_pool_create(1, UINT32_MAX - 1); >>> + ovs_mutex_unlock(&sgid_lock); >>> >>> ovsthread_once_done(&once); >>> } >>> diff --git a/lib/tc.h b/lib/tc.h >>> index cdd3b4f60..0d9b1b8e7 100644 >>> --- a/lib/tc.h >>> +++ b/lib/tc.h >>> @@ -312,6 +312,7 @@ struct tcf_id { >>> uint32_t chain; >>> uint16_t prio; >>> uint32_t handle; >>> + uint32_t sample_group_id; >> See above, but I think this should be part of struct ufid_to_tc_data. >> >> In addition, I think we should make this an array of MAX_TC_SAMPLES_PER_FLOW >> = 2, so we are ready for supporting sflow + offload to the controller. > I don't think offload to the controller should be considered in this series. > Making this an array seems to be more reasonable in the series to support > offload to the controller. Maybe just add the array with a count of 1, so it’s clear how it should be fixed in the future? >> >>> }; >>> >>> static inline struct tcf_id >>> -- >>> 2.26.3 _______________________________________________ dev mailing list d...@openvswitch.org https://mail.openvswitch.org/mailman/listinfo/ovs-dev