On 2/22/2016 10:35 PM, Yuanhan Liu wrote: > Broadcast RARP packet by injecting it to receiving mbuf array at > rte_vhost_dequeue_burst(). > > Commit 33226236a35e ("vhost: handle request to send RARP") iterates > all host interfaces and then broadcast it by all of them. It did > notify the switches about the new location of the migrated VM, however, > the mac learning table in the target host is wrong (at least in my > test with OVS): > > $ ovs-appctl fdb/show ovsbr0 > port VLAN MAC Age > 1 0 b6:3c:72:71:cd:4d 10 > LOCAL 0 b6:3c:72:71:cd:4e 10 > LOCAL 0 52:54:00:12:34:68 9 > 1 0 56:f6:64:2c:bc:c0 1 > > Where 52:54:00:12:34:68 is the mac of the VM. As you can see from the > above, the port learned is "LOCAL", which is the "ovsbr0" port. That > is reasonable, since we indeed send the pkt by the "ovsbr0" interface. > > The wrong mac table leads all the packets to the VM to go to the "ovsbr0" > in the end, which ends up with all packets being lost, until the guest > sends an ARP request (or reply) to refresh the mac learning table. > > Jianfeng then came up with a solution I have thought of firstly but NAKed
Is it suitable to mention someone in the commit log? Thanks, Michael > by myself, concerning it has potential issues [0]. The solution is as title > stated: broadcast the RARP packet by injecting it to the receiving mbuf > arrays at rte_vhost_dequeue_burst(). The re-bring of that idea made me > think it twice; it looked like a false concern to me then. And I had done > a rough verification: it worked as expected. > > [0]: http://dpdk.org/ml/archives/dev/2016-February/033527.html > > Another note is that while preparing this version, I found that DPDK has > some ARP related structures and macros defined. So, use them instead of > the one from standard header files here. > > Cc: Thibaut Collet <thibaut.collet at 6wind.com> > Suggested-by: Jianfeng Tan <jianfeng.tan at intel.com> > Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com> > --- > lib/librte_vhost/rte_virtio_net.h | 5 +- > lib/librte_vhost/vhost_rxtx.c | 80 +++++++++++++++- > lib/librte_vhost/vhost_user/vhost-net-user.c | 2 +- > lib/librte_vhost/vhost_user/virtio-net-user.c | 128 > ++++---------------------- > lib/librte_vhost/vhost_user/virtio-net-user.h | 2 +- > 5 files changed, 104 insertions(+), 113 deletions(-) > > diff --git a/lib/librte_vhost/rte_virtio_net.h > b/lib/librte_vhost/rte_virtio_net.h > index 4a2303a..7d1fde2 100644 > --- a/lib/librte_vhost/rte_virtio_net.h > +++ b/lib/librte_vhost/rte_virtio_net.h > @@ -49,6 +49,7 @@ > > #include <rte_memory.h> > #include <rte_mempool.h> > +#include <rte_ether.h> > > struct rte_mbuf; > > @@ -133,7 +134,9 @@ struct virtio_net { > void *priv; /**< private context */ > uint64_t log_size; /**< Size of log area */ > uint64_t log_base; /**< Where dirty pages are > logged */ > - uint64_t reserved[62]; /**< Reserve some spaces for > future extension. 
*/ > + struct ether_addr mac; /**< MAC address */ > + rte_atomic16_t broadcast_rarp; /**< A flag to tell if we need > broadcast rarp packet */ > + uint64_t reserved[61]; /**< Reserve some spaces for > future extension. */ > struct vhost_virtqueue *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2]; /**< > Contains all virtqueue information. */ > } __rte_cache_aligned; > > diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c > index 12ce0cc..9d23eb1 100644 > --- a/lib/librte_vhost/vhost_rxtx.c > +++ b/lib/librte_vhost/vhost_rxtx.c > @@ -43,6 +43,7 @@ > #include <rte_tcp.h> > #include <rte_udp.h> > #include <rte_sctp.h> > +#include <rte_arp.h> > > #include "vhost-net.h" > > @@ -761,11 +762,50 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, > struct rte_mbuf *m) > } > } > > +#define RARP_PKT_SIZE 64 > + > +static int > +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac) > +{ > + struct ether_hdr *eth_hdr; > + struct arp_hdr *rarp; > + > + if (rarp_mbuf->buf_len < 64) { > + RTE_LOG(WARNING, VHOST_DATA, > + "failed to make RARP; mbuf size too small %u (< %d)\n", > + rarp_mbuf->buf_len, RARP_PKT_SIZE); > + return -1; > + } > + > + /* Ethernet header. */ > + eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0); > + memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN); > + ether_addr_copy(mac, &eth_hdr->s_addr); > + eth_hdr->ether_type = htons(ETHER_TYPE_RARP); > + > + /* RARP header. 
*/ > + rarp = (struct arp_hdr *)(eth_hdr + 1); > + rarp->arp_hrd = htons(ARP_HRD_ETHER); > + rarp->arp_pro = htons(ETHER_TYPE_IPv4); > + rarp->arp_hln = ETHER_ADDR_LEN; > + rarp->arp_pln = 4; > + rarp->arp_op = htons(ARP_OP_REVREQUEST); > + > + ether_addr_copy(mac, &rarp->arp_data.arp_sha); > + ether_addr_copy(mac, &rarp->arp_data.arp_tha); > + memset(&rarp->arp_data.arp_sip, 0x00, 4); > + memset(&rarp->arp_data.arp_tip, 0x00, 4); > + > + rarp_mbuf->pkt_len = rarp_mbuf->data_len = RARP_PKT_SIZE; > + > + return 0; > +} > + > uint16_t > rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id, > struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count) > { > - struct rte_mbuf *m, *prev; > + struct rte_mbuf *m, *prev, *rarp_mbuf = NULL; > struct vhost_virtqueue *vq; > struct vring_desc *desc; > uint64_t vb_addr = 0; > @@ -788,11 +828,34 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, > uint16_t queue_id, > if (unlikely(vq->enabled == 0)) > return 0; > > + /* > + * Construct a RARP broadcast packet, and inject it to the "pkts" > + * array, to looks like that guest actually send such packet. > + * > + * Check user_send_rarp() for more information. > + */ > + if (unlikely(rte_atomic16_cmpset((volatile uint16_t *) > + &dev->broadcast_rarp.cnt, 1, 0))) { > + rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool); > + if (rarp_mbuf == NULL) { > + RTE_LOG(ERR, VHOST_DATA, > + "Failed to allocate memory for mbuf.\n"); > + return 0; > + } > + > + if (make_rarp_packet(rarp_mbuf, &dev->mac)) { > + rte_pktmbuf_free(rarp_mbuf); > + rarp_mbuf = NULL; > + } else { > + count -= 1; > + } > + } > + > avail_idx = *((volatile uint16_t *)&vq->avail->idx); > > /* If there are no available buffers then return. 
*/ > if (vq->last_used_idx == avail_idx) > - return 0; > + goto out; > > LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__, > dev->device_fh); > @@ -983,8 +1046,21 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, > uint16_t queue_id, > vq->used->idx += entry_success; > vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx), > sizeof(vq->used->idx)); > + > /* Kick guest if required. */ > if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) > eventfd_write(vq->callfd, (eventfd_t)1); > + > +out: > + if (unlikely(rarp_mbuf != NULL)) { > + /* > + * Inject it to the head of "pkts" array, so that switch's mac > + * learning table will get updated first. > + */ > + memmove(&pkts[1], pkts, entry_success * sizeof(m)); > + pkts[0] = rarp_mbuf; > + entry_success += 1; > + } > + > return entry_success; > } > diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c > b/lib/librte_vhost/vhost_user/vhost-net-user.c > index de7eecb..df2bd64 100644 > --- a/lib/librte_vhost/vhost_user/vhost-net-user.c > +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c > @@ -437,7 +437,7 @@ vserver_message_handler(int connfd, void *dat, int > *remove) > user_set_vring_enable(ctx, &msg.payload.state); > break; > case VHOST_USER_SEND_RARP: > - user_send_rarp(&msg); > + user_send_rarp(ctx, &msg); > break; > > default: > diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c > b/lib/librte_vhost/vhost_user/virtio-net-user.c > index 68b24f4..65b5652 100644 > --- a/lib/librte_vhost/vhost_user/virtio-net-user.c > +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c > @@ -39,12 +39,6 @@ > #include <sys/mman.h> > #include <sys/types.h> > #include <sys/stat.h> > -#include <sys/ioctl.h> > -#include <sys/socket.h> > -#include <net/ethernet.h> > -#include <netinet/in.h> > -#include <netinet/if_ether.h> > -#include <linux/if_packet.h> > > #include <rte_common.h> > #include <rte_log.h> > @@ -415,120 +409,38 @@ user_set_log_base(struct vhost_device_ctx ctx, > return 0; > } > > -#define RARP_BUF_SIZE 
64 > - > -static void > -make_rarp_packet(uint8_t *buf, uint8_t *mac) > -{ > - struct ether_header *eth_hdr; > - struct ether_arp *rarp; > - > - /* Ethernet header. */ > - eth_hdr = (struct ether_header *)buf; > - memset(&eth_hdr->ether_dhost, 0xff, ETH_ALEN); > - memcpy(&eth_hdr->ether_shost, mac, ETH_ALEN); > - eth_hdr->ether_type = htons(ETH_P_RARP); > - > - /* RARP header. */ > - rarp = (struct ether_arp *)(eth_hdr + 1); > - rarp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER); > - rarp->ea_hdr.ar_pro = htons(ETHERTYPE_IP); > - rarp->ea_hdr.ar_hln = ETH_ALEN; > - rarp->ea_hdr.ar_pln = 4; > - rarp->ea_hdr.ar_op = htons(ARPOP_RREQUEST); > - > - memcpy(&rarp->arp_sha, mac, ETH_ALEN); > - memset(&rarp->arp_spa, 0x00, 4); > - memcpy(&rarp->arp_tha, mac, 6); > - memset(&rarp->arp_tpa, 0x00, 4); > -} > - > - > -static void > -send_rarp(const char *ifname, uint8_t *rarp) > -{ > - int fd; > - struct ifreq ifr; > - struct sockaddr_ll addr; > - > - fd = socket(AF_PACKET, SOCK_RAW, 0); > - if (fd < 0) { > - perror("socket failed"); > - return; > - } > - > - memset(&ifr, 0, sizeof(struct ifreq)); > - strncpy(ifr.ifr_name, ifname, IFNAMSIZ); > - if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) { > - perror("failed to get interface index"); > - close(fd); > - return; > - } > - > - addr.sll_ifindex = ifr.ifr_ifindex; > - addr.sll_halen = ETH_ALEN; > - > - if (sendto(fd, rarp, RARP_BUF_SIZE, 0, > - (const struct sockaddr*)&addr, sizeof(addr)) < 0) { > - perror("send rarp packet failed"); > - } > -} > - > - > /* > - * Broadcast a RARP message to all interfaces, to update > - * switch's mac table > + * An rarp packet is constructed and broadcasted to notify switches about > + * the new location of the migrated VM, so that packets from outside will > + * not be lost after migration. > + * > + * However, we don't actually "send" a rarp packet here, instead, we set > + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it. 
> */ > int > -user_send_rarp(struct VhostUserMsg *msg) > +user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *msg) > { > + struct virtio_net *dev; > uint8_t *mac = (uint8_t *)&msg->payload.u64; > - uint8_t rarp[RARP_BUF_SIZE]; > - struct ifconf ifc = {0, }; > - struct ifreq *ifr; > - int nr = 16; > - int fd; > - uint32_t i; > + > + dev = get_device(ctx); > + if (!dev) > + return -1; > > RTE_LOG(DEBUG, VHOST_CONFIG, > ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n", > mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); > - > - make_rarp_packet(rarp, mac); > + memcpy(dev->mac.addr_bytes, mac, 6); > > /* > - * Get all interfaces > + * Set the flag to inject a RARP broadcast packet at > + * rte_vhost_dequeue_burst(). > + * > + * rte_smp_wmb() is for making sure the mac is copied > + * before the flag is set. > */ > - fd = socket(AF_INET, SOCK_DGRAM, 0); > - if (fd < 0) { > - perror("failed to create AF_INET socket"); > - return -1; > - } > - > -again: > - ifc.ifc_len = sizeof(*ifr) * nr; > - ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len); > - > - if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) { > - perror("failed at SIOCGIFCONF"); > - close(fd); > - return -1; > - } > - > - if (ifc.ifc_len == (int)sizeof(struct ifreq) * nr) { > - /* > - * current ifc_buf is not big enough to hold > - * all interfaces; double it and try again. 
> - */ > - nr *= 2; > - goto again; > - } > - > - ifr = (struct ifreq *)ifc.ifc_buf; > - for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++) > - send_rarp(ifr[i].ifr_name, rarp); > - > - close(fd); > + rte_smp_wmb(); > + rte_atomic16_set(&dev->broadcast_rarp, 1); > > return 0; > } > diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h > b/lib/librte_vhost/vhost_user/virtio-net-user.h > index 559bb46..cefec16 100644 > --- a/lib/librte_vhost/vhost_user/virtio-net-user.h > +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h > @@ -54,7 +54,7 @@ void user_set_vring_kick(struct vhost_device_ctx, struct > VhostUserMsg *); > void user_set_protocol_features(struct vhost_device_ctx ctx, > uint64_t protocol_features); > int user_set_log_base(struct vhost_device_ctx ctx, struct VhostUserMsg *); > -int user_send_rarp(struct VhostUserMsg *); > +int user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *); > > int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *); >