On 4/12/2023 10:06 PM, Eelco Chaudron wrote:
On 29 Mar 2023, at 13:42, Chris Mi wrote:
Initialize psample socket. Add sample recv API to receive sampled
packets from psample socket. Add sample recv wait API to add psample
socket fd to poll list.
Thanks for the update, see comments inline.
//Eelco
Signed-off-by: Chris Mi <[email protected]>
Reviewed-by: Roi Dayan <[email protected]>
---
lib/dpif.h | 6 +-
lib/netdev-offload-provider.h | 17 +++
lib/netdev-offload-tc.c | 192 ++++++++++++++++++++++++++++++++++
lib/netdev-offload.c | 3 +-
4 files changed, 215 insertions(+), 3 deletions(-)
diff --git a/lib/dpif.h b/lib/dpif.h
index 129cbf6a1..5b094ce68 100644
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -834,8 +834,10 @@ struct dpif_upcall {
/* DPIF_UC_ACTION only. */
struct nlattr *userdata; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
- struct nlattr *out_tun_key; /* Output tunnel key. */
- struct nlattr *actions; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
+ struct nlattr *out_tun_key; /* Output tunnel key. */
+ struct nlattr *actions; /* Argument to OVS_ACTION_ATTR_USERSPACE. */
+ struct flow flow; /* Caller provide 'flow' if 'key' is not
+ available. */
"Caller provided 'flow' if the 'key' is not available."
Also, multiline comments should start with a *.
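For example, applied to the new member, that convention would look roughly
like this (illustrative only):

    struct flow flow;            /* Caller-provided 'flow' if 'key' is not
                                  * available. */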
But that is not true for most data struct member comments. For example,
struct dpif_dp_stats {
uint64_t n_hit; /* Number of flow table matches. */
uint64_t n_missed; /* Number of flow table misses. */
uint64_t n_lost; /* Number of misses not sent to
userspace. */
uint64_t n_flows; /* Number of flows present. */
uint64_t n_cache_hit; /* Number of mega flow mask cache hits for
flow table matches. */
uint64_t n_mask_hit; /* Number of mega flow masks visited for
flow table matches. */
uint32_t n_masks; /* Number of mega flow masks. */
};
Could you please confirm? Thanks.
};
/* A callback to notify higher layer of dpif about to be purged, so that
diff --git a/lib/netdev-offload-provider.h b/lib/netdev-offload-provider.h
index 9108856d1..b3bf433d0 100644
--- a/lib/netdev-offload-provider.h
+++ b/lib/netdev-offload-provider.h
@@ -28,6 +28,8 @@
extern "C" {
#endif
+struct dpif_upcall;
+
struct netdev_flow_api {
char *type;
/* Flush all offloaded flows from a netdev.
@@ -121,6 +123,21 @@ struct netdev_flow_api {
int (*meter_del)(ofproto_meter_id meter_id,
struct ofputil_meter_stats *stats);
+ /* Polls for sample offload packets for an upcall handler. If successful,
This should be a general upcall handling API, so I would change this to:
/* Polls for upcall offload packets...
Done.
+ * stores the upcall into '*upcall', using 'buf' for storage.
+ *
Also, this part suggested earlier is missing; it should also apply to the
code supplied here (does it?):
* The implementation should point 'upcall->key' and 'upcall->userdata'
* (if any) into data in the caller-provided 'buf'. The implementation may
* also use 'buf' for storing the data of 'upcall->packet'. If necessary
* to make room, the implementation may reallocate the data in 'buf'.
*
* The caller owns the data of 'upcall->packet' and may modify it. If
* packet's headroom is exhausted as it is manipulated, 'upcall->packet'
* will be reallocated. This requires the data of 'upcall->packet' to be
* released with ofpbuf_uninit() before 'upcall' is destroyed. However,
* when an error is returned, the 'upcall->packet' may be uninitialized
* and should not be released.
Done.
+ * Return 0 if successful, otherwise returns a positive errno value.
+ *
+ * This function must not block. If no upcall is pending when it is
+ * called, it should return EAGAIN without blocking. */
+ int (*recv)(struct dpif_upcall *upcall, struct ofpbuf *buf,
+ uint32_t handler_id);
+
+ /* Arranges for the poll loop for an upcall handler to wake up when
+ * sample socket has a message queued to be received with the recv
+ * member functions. */
+ void (*recv_wait)(uint32_t handler_id);
+
/* Initializies the netdev flow api.
* Return 0 if successful, otherwise returns a positive errno value. */
int (*init_flow_api)(struct netdev *);
diff --git a/lib/netdev-offload-tc.c b/lib/netdev-offload-tc.c
index c53ea5588..8c8f29af7 100644
--- a/lib/netdev-offload-tc.c
+++ b/lib/netdev-offload-tc.c
@@ -18,6 +18,8 @@
#include <errno.h>
#include <linux/if_ether.h>
+#include <linux/psample.h>
+#include <poll.h>
#include "dpif.h"
#include "hash.h"
@@ -124,6 +126,9 @@ struct sgid_node {
struct offload_sample sample;
};
+static struct nl_sock *psample_sock;
+static int psample_family;
+
static struct ovs_mutex sgid_lock = OVS_MUTEX_INITIALIZER;
static struct cmap sgid_map = CMAP_INITIALIZER;
static struct id_pool *sample_group_ids OVS_GUARDED_BY(sgid_lock);
@@ -148,6 +153,14 @@ sgid_find(uint32_t id)
return node ? CONTAINER_OF(node, struct sgid_node, id_node) : NULL;
}
+static struct offload_sample *
+sample_find(uint32_t id)
+{
+ struct sgid_node *node = sgid_find(id);
+
+ return node ? &node->sample: NULL;
+}
+
static void
offload_sample_clone(struct offload_sample *new,
const struct offload_sample *old)
@@ -2931,6 +2944,55 @@ tc_cleanup_policer_actions(struct id_pool *police_ids,
hmap_destroy(&map);
}
+static void
+psample_init(void)
+{
+ unsigned int psample_mcgroup;
+ int err;
+
+ if (!netdev_is_flow_api_enabled()) {
+ VLOG_DBG("Flow API is not enabled");
+ return;
+ }
+
+ if (psample_sock) {
+ VLOG_DBG("Psample socket is already initialized");
+ return;
+ }
+
+ err = nl_lookup_genl_family(PSAMPLE_GENL_NAME,
+ &psample_family);
+ if (err) {
+ VLOG_INFO("Generic Netlink family '%s' does not exist: %s\n"
+ "Please make sure the kernel module psample is loaded",
+ PSAMPLE_GENL_NAME, ovs_strerror(err));
+ return;
+ }
+
+ err = nl_lookup_genl_mcgroup(PSAMPLE_GENL_NAME,
+ PSAMPLE_NL_MCGRP_SAMPLE_NAME,
+ &psample_mcgroup);
+ if (err) {
+ VLOG_INFO("Failed to join Netlink multicast group '%s': %s",
+ PSAMPLE_NL_MCGRP_SAMPLE_NAME, ovs_strerror(err));
+ return;
+ }
+
+ err = nl_sock_create(NETLINK_GENERIC, &psample_sock);
+ if (err) {
+ VLOG_INFO("Failed to create psample socket: %s", ovs_strerror(err));
+ return;
+ }
+
+ err = nl_sock_join_mcgroup(psample_sock, psample_mcgroup);
+ if (err) {
+ VLOG_INFO("Failed to join psample mcgroup: %s", ovs_strerror(err));
+ nl_sock_destroy(psample_sock);
+ psample_sock = NULL;
+ return;
+ }
+}
+
static int
netdev_tc_init_flow_api(struct netdev *netdev)
{
@@ -2990,6 +3052,7 @@ netdev_tc_init_flow_api(struct netdev *netdev)
ovs_mutex_lock(&sgid_lock);
sample_group_ids = id_pool_create(1, UINT32_MAX - 1);
ovs_mutex_unlock(&sgid_lock);
+ psample_init();
ovsthread_once_done(&once);
}
@@ -3206,6 +3269,133 @@ meter_tc_del_policer(ofproto_meter_id meter_id,
return err;
}
+struct offload_psample {
+ struct nlattr *packet; /* Packet data. */
+ uint32_t group_id; /* Mapping id for sample offload. */
+ uint16_t iifindex; /* Input ifindex. */
+};
+
+static int
+nl_parse_psample(struct offload_psample *psample, struct ofpbuf *buf)
+{
+ static const struct nl_policy ovs_psample_policy[] = {
+ [PSAMPLE_ATTR_IIFINDEX] = { .type = NL_A_U16, .optional = true, },
+ [PSAMPLE_ATTR_SAMPLE_GROUP] = { .type = NL_A_U32 },
+ [PSAMPLE_ATTR_GROUP_SEQ] = { .type = NL_A_U32 },
+ [PSAMPLE_ATTR_DATA] = { .type = NL_A_UNSPEC },
+ };
+ struct nlattr *a[ARRAY_SIZE(ovs_psample_policy)];
+ struct genlmsghdr *genl;
+ struct nlmsghdr *nlmsg;
+ struct ofpbuf b;
+
+ b = ofpbuf_const_initializer(buf->data, buf->size);
+ nlmsg = ofpbuf_try_pull(&b, sizeof *nlmsg);
+ genl = ofpbuf_try_pull(&b, sizeof *genl);
+ if (!nlmsg || !genl || nlmsg->nlmsg_type != psample_family
+ || !nl_policy_parse(&b, 0, ovs_psample_policy, a,
+ ARRAY_SIZE(ovs_psample_policy))) {
+ return EINVAL;
+ }
+
+ if (a[PSAMPLE_ATTR_IIFINDEX]) {
+ psample->iifindex = nl_attr_get_u16(a[PSAMPLE_ATTR_IIFINDEX]);
+ }
Do } else { psample->iifindex = 0; } to save on the memset later.
I removed psample->iifindex. The new code will also get the input ifindex via
the group_id mapping instead of from TC. TC only provides the group id and the
sampled packets; we will get all other info from the group_id mapping,
e.g. the output tunnel and the input ifindex.
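Roughly, the idea would be something like this (just a sketch; the 'ifindex'
member on struct offload_sample is an assumption about the next version):

    /* Save the input ifindex in the group_id mapping at flow_put time,
     * then look it up when parsing the sampled packet instead of using
     * PSAMPLE_ATTR_IIFINDEX: */
    sample = sample_find(psample->group_id);
    if (sample) {
        flow->in_port.odp_port = netdev_ifindex_to_odp_port(sample->ifindex);
    }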
+ psample->group_id = nl_attr_get_u32(a[PSAMPLE_ATTR_SAMPLE_GROUP]);
+ psample->packet = a[PSAMPLE_ATTR_DATA];
+
+ return 0;
+}
+
+static int
+psample_parse_packet(struct offload_psample *psample,
+ struct dpif_upcall *upcall)
+{
+ struct flow *flow = &upcall->flow;
+ struct offload_sample *sample;
+
+ memset(upcall, 0, sizeof *upcall);
+ dp_packet_use_const(&upcall->packet,
+ nl_attr_get(psample->packet),
+ nl_attr_get_size(psample->packet));
+
+ sample = sample_find(psample->group_id);
+ if (!sample) {
+ VLOG_ERR_RL(&error_rl, "failed to get sample info via group id: %d",
Capital F for failed.
Done.
+ psample->group_id);
+ return ENOENT;
+ }
+
+ upcall->userdata = sample->userdata;
+ if (sample->tunnel) {
+ memcpy(&flow->tunnel, sample->tunnel, sizeof flow->tunnel);
Are we OK that memcpy will work here? We might need to initialize the
sample->tunnel info correctly in the upcoming patches; see flow_tnl_copy__().
From the testing results, it seems to be OK. Maybe flow_tnl_copy__() is
more accurate, but we would need to make a lot of functions non-static.
Not sure if we should do that. What do you think?
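For example, something along these lines, assuming flow_tnl_size() from
lib/flow.h can be used here (sketch only):

    /* Copy only the valid prefix of the tunnel metadata and zero the rest,
     * similar to what flow_tnl_copy__() does, instead of a full-struct
     * memcpy. */
    memset(&flow->tunnel, 0, sizeof flow->tunnel);
    memcpy(&flow->tunnel, sample->tunnel, flow_tnl_size(sample->tunnel));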
+ }
+ if (sample->userspace_actions) {
+ upcall->actions = sample->userspace_actions;
+ }
+ if (psample->iifindex) {
+ flow->in_port.odp_port = netdev_ifindex_to_odp_port(psample->iifindex);
+ }
If not provided, I think the port should be set to ODPP_NONE (the default
value); please confirm.
If this is not the case, the above netdev_ifindex_to_odp_port() should cause
this function to fail when ODPP_NONE is returned.
The ifindex is now obtained from netdev_get_ifindex(netdev) in
netdev_tc_flow_put(), so it should always be provided.
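If you prefer, we could still guard against a missing mapping, roughly like
this (sketch only; the errno value is just an example):

    odp_port_t odp_port = netdev_ifindex_to_odp_port(psample->iifindex);

    if (odp_port == ODPP_NONE) {
        return ENOENT;  /* No odp port is mapped to this ifindex. */
    }
    flow->in_port.odp_port = odp_port;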
+ upcall->type = DPIF_UC_ACTION;
+
+ return 0;
+}
+
+static int
+netdev_tc_recv(struct dpif_upcall *upcall, struct ofpbuf *buf,
+ uint32_t handler_id)
+{
+ int read_tries = 0;
+
+ if (handler_id) {
+ return EAGAIN;
+ }
+
+ if (!psample_sock) {
+ return EAGAIN;
+ }
+
Combine the two ifs into a single one: 'if (handler_id || !psample_sock)'.
Done.
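i.e., the combined check becomes roughly:

    if (handler_id || !psample_sock) {
        return EAGAIN;
    }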
+ for (;;) {
+ struct offload_psample psample;
+ int error;
+
+ if (++read_tries > 50) {
+ return EAGAIN;
+ }
+
+ error = nl_sock_recv(psample_sock, buf, NULL, false);
+ if (error == ENOBUFS) {
+ continue;
+ }
+
+ if (error) {
+ if (error == EAGAIN) {
+ break;
The outer loop returns EAGAIN, so why explicitly check/break? We can just
return the error, can't we?
Done.
+ }
+ return error;
+ }
+
+ memset(&psample, 0, sizeof psample);
Remove memset here, see comment in nl_parse_psample.
Done.
+ error = nl_parse_psample(&psample, buf);
+
+ return error ? error : psample_parse_packet(&psample, upcall);
+ }
+
+ return EAGAIN;
+}
+
+static void
+netdev_tc_recv_wait(uint32_t handler_id)
+{
+ if (handler_id) {
+ return;
+ }
+
+ if (psample_sock) {
+ nl_sock_wait(psample_sock, POLLIN);
+ }
+}
Add a comment noting that netlink reception will only be done in handler 0,
for simplicity. Maybe something like:
+static void
+netdev_tc_recv_wait(uint32_t handler_id)
+{
+    /* For simplicity, i.e., using a single NetLink socket, only the first
+     * handler thread will be used. */
+    if (!handler_id && psample_sock) {
+        nl_sock_wait(psample_sock, POLLIN);
+    }
+}
Done.
+
const struct netdev_flow_api netdev_offload_tc = {
.type = "linux_tc",
.flow_flush = netdev_tc_flow_flush,
@@ -3219,5 +3409,7 @@ const struct netdev_flow_api netdev_offload_tc = {
.meter_set = meter_tc_set_policer,
.meter_get = meter_tc_get_policer,
.meter_del = meter_tc_del_policer,
+ .recv = netdev_tc_recv,
+ .recv_wait = netdev_tc_recv_wait,
.init_flow_api = netdev_tc_init_flow_api,
};
diff --git a/lib/netdev-offload.c b/lib/netdev-offload.c
index 4592262bd..84503ec8b 100644
--- a/lib/netdev-offload.c
+++ b/lib/netdev-offload.c
@@ -38,6 +38,7 @@
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "odp-netlink.h"
+#include "odp-util.h"
#include "openflow/openflow.h"
#include "packets.h"
#include "openvswitch/ofp-print.h"
@@ -820,7 +821,7 @@ odp_port_t
netdev_ifindex_to_odp_port(int ifindex)
{
struct port_to_netdev_data *data;
- odp_port_t ret = 0;
+ odp_port_t ret = ODPP_NONE;
ovs_rwlock_rdlock(&netdev_hmap_rwlock);
HMAP_FOR_EACH_WITH_HASH (data, ifindex_node, ifindex, &ifindex_to_port) {
--
2.26.3