On Thu, Nov 27, 2025 at 11:24:03AM +0800, Longjun Tang wrote:
> From: Tang Longjun <[email protected]>
> 
> track skb and virtqueue through the kprobe start_xmit function
> 
> Signed-off-by: Tang Longjun <[email protected]>
> ---
>  tools/virtio/virtnet_mon/virtnet_mon.c | 793 ++++++++++++++++++++++++-
>  1 file changed, 772 insertions(+), 21 deletions(-)
> 
> diff --git a/tools/virtio/virtnet_mon/virtnet_mon.c b/tools/virtio/virtnet_mon/virtnet_mon.c
> index 696e621cf803..36b51d0a13d4 100644
> --- a/tools/virtio/virtnet_mon/virtnet_mon.c
> +++ b/tools/virtio/virtnet_mon/virtnet_mon.c
> @@ -6,15 +6,724 @@
>  #include <linux/uaccess.h>
>  #include <linux/miscdevice.h>
>  #include <linux/poll.h>
> +#include <linux/string.h>
> +#include <linux/if_ether.h>
> +
> +#include <linux/kprobes.h>
> +#include <linux/netdevice.h>
> +#include <linux/skbuff.h>
> +#include <linux/ip.h>
> +#include <linux/ipv6.h>
> +#include <linux/tcp.h>
> +#include <linux/udp.h>
> +#include <linux/icmp.h>
> +#include <linux/icmpv6.h>
> +#include <linux/version.h>
> +#include <linux/time.h>
> +#include <linux/smp.h>
> +#include <linux/virtio.h>
> +#include <linux/scatterlist.h>
> +#include <linux/bpf.h>
> +#include <linux/dim.h>
> +#include <linux/mutex.h>
> +#include <linux/workqueue.h>
> +#include <linux/spinlock.h>
> +
> +#include <linux/u64_stats_sync.h>
> +#include <linux/mm_types_task.h>
> +#include <linux/virtio_net.h>
> +#include <linux/virtio_ring.h>
> +#include <net/xdp.h>
> +
> 
>  #define DEVICE_NAME "virtnet_mon"
> -#define KFIFO_SIZE 1024 // ring buffer size
> +#define KFIFO_SIZE 65536 // ring buffer size
> +#define WRITE_SIZE 1024
> +#define READ_SIZE 16384
> +#define LINE_MAX_SIZE 1024
> +
> +#if defined(CONFIG_X86_64)
> +#define KP_GET_ARG(regs, idx) \
> +	((idx) == 0 ? (unsigned long)(regs)->di : \
> +	 (idx) == 1 ? (unsigned long)(regs)->si : 0UL)
> +#elif defined(CONFIG_ARM64)
> +#define KP_GET_ARG(regs, idx) \
> +	((idx) < 8 ? (unsigned long)(regs)->regs[(idx)] : 0UL)
> +#endif
> +
> +struct _virtnet_sq_stats {
> +	struct u64_stats_sync syncp;
> +	u64_stats_t packets;
> +	u64_stats_t bytes;
> +	u64_stats_t xdp_tx;
> +	u64_stats_t xdp_tx_drops;
> +	u64_stats_t kicks;
> +	u64_stats_t tx_timeouts;
> +	u64_stats_t stop;
> +	u64_stats_t wake;
> +};
> +
> +struct _virtnet_interrupt_coalesce {
> +	u32 max_packets;
> +	u32 max_usecs;
> +};
> +
> +struct _send_queue {
> +	/* Virtqueue associated with this send _queue */
> +	struct virtqueue *vq;
> +
> +	/* TX: fragments + linear part + virtio header */
> +	struct scatterlist sg[MAX_SKB_FRAGS + 2];
> +
> +	/* Name of the send queue: output.$index */
> +	char name[16];
> +
> +	struct _virtnet_sq_stats stats;
> +
> +	struct _virtnet_interrupt_coalesce intr_coal;
> +
> +	struct napi_struct napi;
> +
> +	/* Record whether sq is in reset state. */
> +	bool reset;
> +
> +	struct xsk_buff_pool *xsk_pool;
> +
> +	dma_addr_t xsk_hdr_dma_addr;
> +};
> +
> +struct _virtnet_rq_stats {
> +	struct u64_stats_sync syncp;
> +	u64_stats_t packets;
> +	u64_stats_t bytes;
> +	u64_stats_t drops;
> +	u64_stats_t xdp_packets;
> +	u64_stats_t xdp_tx;
> +	u64_stats_t xdp_redirects;
> +	u64_stats_t xdp_drops;
> +	u64_stats_t kicks;
> +};
> +
> +struct _ewma_pkt_len {
> +	unsigned long internal;
> +};
> +
> +struct _virtnet_rq_dma {
> +	dma_addr_t addr;
> +	u32 ref;
> +	u16 len;
> +	u16 need_sync;
> +};
> +
> +struct _receive_queue {
> +	/* Virtqueue associated with this receive_queue */
> +	struct virtqueue *vq;
> +
> +	struct napi_struct napi;
> +
> +	struct bpf_prog __rcu *xdp_prog;
> +
> +	struct _virtnet_rq_stats stats;
> +
> +	/* The number of rx notifications */
> +	u16 calls;
> +
> +	/* Is dynamic interrupt moderation enabled? */
> +	bool dim_enabled;
> +
> +	/* Used to protect dim_enabled and inter_coal */
> +	struct mutex dim_lock;
> +
> +	/* Dynamic Interrupt Moderation */
> +	struct dim dim;
> +
> +	u32 packets_in_napi;
> +
> +	struct _virtnet_interrupt_coalesce intr_coal;
> +
> +	/* Chain pages by the private ptr. */
> +	struct page *pages;
> +
> +	/* Average packet length for mergeable receive buffers. */
> +	struct _ewma_pkt_len mrg_avg_pkt_len;
> +
> +	/* Page frag for packet buffer allocation. */
> +	struct page_frag alloc_frag;
> +
> +	/* RX: fragments + linear part + virtio header */
> +	struct scatterlist sg[MAX_SKB_FRAGS + 2];
> +
> +	/* Min single buffer size for mergeable buffers case. */
> +	unsigned int min_buf_len;
> +
> +	/* Name of this receive queue: input.$index */
> +	char name[16];
> +
> +	struct xdp_rxq_info xdp_rxq;
> +
> +	/* Record the last dma info to free after new pages is allocated. */
> +	struct _virtnet_rq_dma *last_dma;
> +
> +	struct xsk_buff_pool *xsk_pool;
> +
> +	/* xdp rxq used by xsk */
> +	struct xdp_rxq_info xsk_rxq_info;
> +
> +	struct xdp_buff **xsk_buffs;
> +};
> +
> +#define VIRTIO_NET_RSS_MAX_KEY_SIZE 40
> +
> +struct _control_buf {
> +	struct virtio_net_ctrl_hdr hdr;
> +	virtio_net_ctrl_ack status;
> +};
> +
> +struct _virtnet_info {
> +	struct virtio_device *vdev;
> +	struct virtqueue *cvq;
> +	struct net_device *dev;
> +	struct _send_queue *sq;
> +	struct _receive_queue *rq;
> +	unsigned int status;
> +
> +	/* Max # of queue pairs supported by the device */
> +	u16 max_queue_pairs;
> +
> +	/* # of queue pairs currently used by the driver */
> +	u16 curr_queue_pairs;
> +
> +	/* # of XDP queue pairs currently used by the driver */
> +	u16 xdp_queue_pairs;
> +
> +	/* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */
> +	bool xdp_enabled;
> +
> +	/* I like... big packets and I cannot lie! */
> +	bool big_packets;
> +
> +	/* number of sg entries allocated for big packets */
> +	unsigned int big_packets_num_skbfrags;
> +
> +	/* Host will merge rx buffers for big packets (shake it! shake it!) */
> +	bool mergeable_rx_bufs;
> +
> +	/* Host supports rss and/or hash report */
> +	bool has_rss;
> +	bool has_rss_hash_report;
> +	u8 rss_key_size;
> +	u16 rss_indir_table_size;
> +	u32 rss_hash_types_supported;
> +	u32 rss_hash_types_saved;
> +	struct virtio_net_rss_config_hdr *rss_hdr;
> +	struct virtio_net_rss_config_trailer rss_trailer;
> +	u8 rss_hash_key_data[VIRTIO_NET_RSS_MAX_KEY_SIZE];
> +
> +	/* Has control virtqueue */
> +	bool has_cvq;
> +
> +	/* Lock to protect the control VQ */
> +	struct mutex cvq_lock;
> +
> +	/* Host can handle any s/g split between our header and packet data */
> +	bool any_header_sg;
> +
> +	/* Packet virtio header size */
> +	u8 hdr_len;
> +
> +	/* Work struct for delayed refilling if we run low on memory. */
> +	struct delayed_work refill;
> +
> +	/* UDP tunnel support */
> +	bool tx_tnl;
> +
> +	bool rx_tnl;
> +
> +	bool rx_tnl_csum;
> +
> +	/* Is delayed refill enabled? */
> +	bool refill_enabled;
> +
> +	/* The lock to synchronize the access to refill_enabled */
> +	spinlock_t refill_lock;
> +
> +	/* Work struct for config space updates */
> +	struct work_struct config_work;
> +
> +	/* Work struct for setting rx mode */
> +	struct work_struct rx_mode_work;
> +
> +	/* OK to queue work setting RX mode? */
> +	bool rx_mode_work_enabled;
> +
> +	/* Does the affinity hint is set for virtqueues? */
> +
> +	bool affinity_hint_set;
> +
> +	/* CPU hotplug instances for online & dead */
> +
> +	struct hlist_node node;
> +
> +	struct hlist_node node_dead;
> +
> +	struct _control_buf *ctrl;
> +
> +	/* Ethtool settings */
> +	u8 duplex;
> +	u32 speed;
> +
> +	/* Is rx dynamic interrupt moderation enabled? */
> +	bool rx_dim_enabled;
> +
> +	/* Interrupt coalescing settings */
> +	struct _virtnet_interrupt_coalesce intr_coal_tx;
> +	struct _virtnet_interrupt_coalesce intr_coal_rx;
> +
> +	unsigned long guest_offloads;
> +	unsigned long guest_offloads_capable;
> +
> +	/* failover when STANDBY feature enabled */
> +	struct failover *failover;
> +
> +	u64 device_stats_cap;
> +};
> +
> +
> +struct _vring_desc_state_split {
> +	void *data;			/* Data for callback. */
> +	struct vring_desc *indir_desc;	/* Indirect descriptor, if any. */
> +};
> +
> +struct _vring_desc_extra {
> +	dma_addr_t addr;		/* Descriptor DMA addr. */
> +	u32 len;			/* Descriptor length. */
> +	u16 flags;			/* Descriptor flags. */
> +	u16 next;			/* The next desc state in a list. */
> +};
> +
> +struct _vring_virtqueue_split {
> +	/* Actual memory layout for this queue. */
> +	struct vring vring;
> +
> +	/* Last written value to avail->flags */
> +	u16 avail_flags_shadow;
> +
> +	/*
> +	 * Last written value to avail->idx in
> +	 * guest byte order.
> +	 */
> +	u16 avail_idx_shadow;
> +
> +	/* Per-descriptor state. */
> +	struct _vring_desc_state_split *desc_state;
> +	struct _vring_desc_extra *desc_extra;
> +
> +	/* DMA address and size information */
> +	dma_addr_t queue_dma_addr;
> +	size_t queue_size_in_bytes;
> +
> +	/*
> +	 * The parameters for creating vrings are reserved for creating new
> +	 * vring.
> +	 */
> +	u32 vring_align;
> +	bool may_reduce_num;
> +};
> +
> +struct _vring_desc_state_packed {
> +	void *data;			/* Data for callback. */
> +	struct vring_packed_desc *indir_desc; /* Indirect descriptor, if any. */
> +	u16 num;			/* Descriptor list length. */
> +	u16 last;			/* The last desc state in a list. */
> +};
> +
> +struct _vring_virtqueue_packed {
> +	/* Actual memory layout for this queue. */
> +	struct {
> +		unsigned int num;
> +		struct vring_packed_desc *desc;
> +		struct vring_packed_desc_event *driver;
> +		struct vring_packed_desc_event *device;
> +	} vring;
> +
> +	/* Driver ring wrap counter. */
> +	bool avail_wrap_counter;
> +
> +	/* Avail used flags. */
> +	u16 avail_used_flags;
> +
> +	/* Index of the next avail descriptor. */
> +	u16 next_avail_idx;
> +
> +	/*
> +	 * Last written value to driver->flags in
> +	 * guest byte order.
> +	 */
> +	u16 event_flags_shadow;
> +
> +	/* Per-descriptor state. */
> +	struct _vring_desc_state_packed *desc_state;
> +	struct _vring_desc_extra *desc_extra;
> +
> +	/* DMA address and size information */
> +	dma_addr_t ring_dma_addr;
> +	dma_addr_t driver_event_dma_addr;
> +	dma_addr_t device_event_dma_addr;
> +	size_t ring_size_in_bytes;
> +	size_t event_size_in_bytes;
> +};
> +
> +struct _vring_virtqueue {
> +	struct virtqueue vq;
> +
> +	/* Is this a packed ring? */
> +	bool packed_ring;
> +
> +	/* Is DMA API used? */
> +	bool use_dma_api;
> +
> +	/* Can we use weak barriers? */
> +	bool weak_barriers;
> +
> +	/* Other side has made a mess, don't try any more. */
> +	bool broken;
> +
> +	/* Host supports indirect buffers */
> +	bool indirect;
> +
> +	/* Host publishes avail event idx */
> +	bool event;
> +
> +	/* Head of free buffer list. */
> +	unsigned int free_head;
> +	/* Number we've added since last sync. */
> +	unsigned int num_added;
> +
> +	/* Last used index we've seen.
> +	 * for split ring, it just contains last used index
> +	 * for packed ring:
> +	 * bits up to VRING_PACKED_EVENT_F_WRAP_CTR include the last used index.
> +	 * bits from VRING_PACKED_EVENT_F_WRAP_CTR include the used wrap counter.
> +	 */
> +	u16 last_used_idx;
> 
> -static DEFINE_KFIFO(virtnet_mon_kfifo, char, KFIFO_SIZE);
> +	/* Hint for event idx: already triggered no need to disable. */
> +	bool event_triggered;
> +
> +	union {
> +		/* Available for split ring */
> +		struct _vring_virtqueue_split split;
> +
> +		/* Available for packed ring */
> +		struct _vring_virtqueue_packed packed;
> +	};
> +
> +	/* How to notify other side. FIXME: commonalize hcalls! */
> +	bool (*notify)(struct virtqueue *vq);
> +
> +	/* DMA, allocation, and size information */
> +	bool we_own_ring;
> +
> +	union virtio_map map;
> +};
> +
> +/* RX or TX */
> +enum pkt_dir {
> +	PKT_DIR_UN = 0,	/* Unknown */
> +	PKT_DIR_RX = 1,	/* RX */
> +	PKT_DIR_TX = 2,	/* TX */
> +	PKT_DIR_MAX
> +};
> +
> +enum event_type {
> +	START_XMIT_PRE_EVENT = 1,
> +	START_XMIT_POST_EVENT = 2,
> +};
> +
> +struct iph_info {
> +	struct sk_buff *skb;	/* SKB */
> +	u8 iph_proto;		/* iph protocol type */
> +	u32 seq;		/* absolute sequence number */
> +};
> +
> +struct queue_info {
> +	struct virtqueue *vq;
> +	char name[16];
> +	unsigned int num_free;
> +	unsigned int num;
> +	__virtio16 avail_flags;
> +	__virtio16 avail_idx;
> +	u16 avail_flags_shadow;
> +	u16 avail_idx_shadow;
> +	__virtio16 used_flags;
> +	__virtio16 used_idx;
> +	u16 last_used_idx;
> +	bool broken;
> +};
Not at all excited about all the code duplication going on here.
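
FWIW, a CO-RE-based probe would sidestep most of it: field offsets are
resolved against the running kernel's BTF at load time, so none of these
driver-private layouts would need to be mirrored into the tool, and they
could not silently go stale when virtio_net or virtio_ring changes. A
minimal sketch of the idea, assuming libbpf and a bpftool-generated
vmlinux.h (the program and map-free printk output here are illustrative,
not taken from this patch):

/* Hypothetical CO-RE kprobe on start_xmit(): reads skb/dev fields via
 * BTF relocations instead of locally duplicated struct definitions.
 * start_xmit() is static, so this relies on the symbol being visible
 * in kallsyms -- the same assumption this patch's kprobe makes.
 */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_core_read.h>
#include <bpf/bpf_tracing.h>

SEC("kprobe/start_xmit")
int BPF_KPROBE(trace_start_xmit, struct sk_buff *skb, struct net_device *dev)
{
	char ifname[16];
	unsigned int len = BPF_CORE_READ(skb, len);

	/* Offsets of skb->len and dev->name come from kernel BTF. */
	BPF_CORE_READ_STR_INTO(&ifname, dev, name);
	bpf_printk("start_xmit: dev=%s len=%u", ifname, len);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

Even virtio_net's private structs are present in the kernel's BTF, so a
tool that genuinely needs the queue state can take the definitions from
vmlinux.h rather than copying them here.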
