On 2/22/2016 10:35 PM, Yuanhan Liu wrote:
> Broadcast RARP packet by injecting it to receiving mbuf array at
> rte_vhost_dequeue_burst().
>
> Commit 33226236a35e ("vhost: handle request to send RARP") iterates
> all host interfaces and then broadcast it by all of them.  It did
> notify the switches about the new location of the migrated VM, however,
> the mac learning table in the target host is wrong (at least in my
> test with OVS):
>
>     $ ovs-appctl fdb/show ovsbr0
>      port  VLAN  MAC                Age
>         1     0  b6:3c:72:71:cd:4d   10
>     LOCAL     0  b6:3c:72:71:cd:4e   10
>     LOCAL     0  52:54:00:12:34:68    9
>         1     0  56:f6:64:2c:bc:c0    1
>
> Where 52:54:00:12:34:68 is the mac of the VM. As you can see from the
> above, the port learned is "LOCAL", which is the "ovsbr0" port. That
> is reasonable, since we indeed send the pkt by the "ovsbr0" interface.
>
> The wrong mac table leads all the packets destined for the VM to go to the
> "ovsbr0" in the end, which ends up with all packets being lost, until the
> guest sends an ARP request (or reply) to refresh the mac learning table.
>
> Jianfeng then came up with a solution I have thought of firstly but NAKed

Is it suitable to mention someone in the commit log?

Thanks,
Michael
> by myself, out of concern that it had potential issues [0]. The solution is
> as the title states: broadcast the RARP packet by injecting it to the
> receiving mbuf arrays at rte_vhost_dequeue_burst(). The revival of that idea
> made me think twice; it looked like a false concern to me then. And I had
> done a rough verification: it worked as expected.
>
> [0]: http://dpdk.org/ml/archives/dev/2016-February/033527.html
>
> Another note is that while preparing this version, I found that DPDK has
> some ARP related structures and macros defined. So, use them instead of
> the one from standard header files here.
>
> Cc: Thibaut Collet <thibaut.collet at 6wind.com>
> Suggested-by: Jianfeng Tan <jianfeng.tan at intel.com>
> Signed-off-by: Yuanhan Liu <yuanhan.liu at linux.intel.com>
> ---
>  lib/librte_vhost/rte_virtio_net.h             |   5 +-
>  lib/librte_vhost/vhost_rxtx.c                 |  80 +++++++++++++++-
>  lib/librte_vhost/vhost_user/vhost-net-user.c  |   2 +-
>  lib/librte_vhost/vhost_user/virtio-net-user.c | 128 
> ++++----------------------
>  lib/librte_vhost/vhost_user/virtio-net-user.h |   2 +-
>  5 files changed, 104 insertions(+), 113 deletions(-)
>
> diff --git a/lib/librte_vhost/rte_virtio_net.h 
> b/lib/librte_vhost/rte_virtio_net.h
> index 4a2303a..7d1fde2 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -49,6 +49,7 @@
>  
>  #include <rte_memory.h>
>  #include <rte_mempool.h>
> +#include <rte_ether.h>
>  
>  struct rte_mbuf;
>  
> @@ -133,7 +134,9 @@ struct virtio_net {
>       void                    *priv;          /**< private context */
>       uint64_t                log_size;       /**< Size of log area */
>       uint64_t                log_base;       /**< Where dirty pages are 
> logged */
> -     uint64_t                reserved[62];   /**< Reserve some spaces for 
> future extension. */
> +     struct ether_addr       mac;            /**< MAC address */
> +     rte_atomic16_t          broadcast_rarp; /**< A flag to tell if we need 
> broadcast rarp packet */
> +     uint64_t                reserved[61];   /**< Reserve some spaces for 
> future extension. */
>       struct vhost_virtqueue  *virtqueue[VHOST_MAX_QUEUE_PAIRS * 2];  /**< 
> Contains all virtqueue information. */
>  } __rte_cache_aligned;
>  
> diff --git a/lib/librte_vhost/vhost_rxtx.c b/lib/librte_vhost/vhost_rxtx.c
> index 12ce0cc..9d23eb1 100644
> --- a/lib/librte_vhost/vhost_rxtx.c
> +++ b/lib/librte_vhost/vhost_rxtx.c
> @@ -43,6 +43,7 @@
>  #include <rte_tcp.h>
>  #include <rte_udp.h>
>  #include <rte_sctp.h>
> +#include <rte_arp.h>
>  
>  #include "vhost-net.h"
>  
> @@ -761,11 +762,50 @@ vhost_dequeue_offload(struct virtio_net_hdr *hdr, 
> struct rte_mbuf *m)
>       }
>  }
>  
> +#define RARP_PKT_SIZE        64
> +
> +static int
> +make_rarp_packet(struct rte_mbuf *rarp_mbuf, const struct ether_addr *mac)
> +{
> +     struct ether_hdr *eth_hdr;
> +     struct arp_hdr  *rarp;
> +
> +     if (rarp_mbuf->buf_len < 64) {
> +             RTE_LOG(WARNING, VHOST_DATA,
> +                     "failed to make RARP; mbuf size too small %u (< %d)\n",
> +                     rarp_mbuf->buf_len, RARP_PKT_SIZE);
> +             return -1;
> +     }
> +
> +     /* Ethernet header. */
> +     eth_hdr = rte_pktmbuf_mtod_offset(rarp_mbuf, struct ether_hdr *, 0);
> +     memset(eth_hdr->d_addr.addr_bytes, 0xff, ETHER_ADDR_LEN);
> +     ether_addr_copy(mac, &eth_hdr->s_addr);
> +     eth_hdr->ether_type = htons(ETHER_TYPE_RARP);
> +
> +     /* RARP header. */
> +     rarp = (struct arp_hdr *)(eth_hdr + 1);
> +     rarp->arp_hrd = htons(ARP_HRD_ETHER);
> +     rarp->arp_pro = htons(ETHER_TYPE_IPv4);
> +     rarp->arp_hln = ETHER_ADDR_LEN;
> +     rarp->arp_pln = 4;
> +     rarp->arp_op  = htons(ARP_OP_REVREQUEST);
> +
> +     ether_addr_copy(mac, &rarp->arp_data.arp_sha);
> +     ether_addr_copy(mac, &rarp->arp_data.arp_tha);
> +     memset(&rarp->arp_data.arp_sip, 0x00, 4);
> +     memset(&rarp->arp_data.arp_tip, 0x00, 4);
> +
> +     rarp_mbuf->pkt_len  = rarp_mbuf->data_len = RARP_PKT_SIZE;
> +
> +     return 0;
> +}
> +
>  uint16_t
>  rte_vhost_dequeue_burst(struct virtio_net *dev, uint16_t queue_id,
>       struct rte_mempool *mbuf_pool, struct rte_mbuf **pkts, uint16_t count)
>  {
> -     struct rte_mbuf *m, *prev;
> +     struct rte_mbuf *m, *prev, *rarp_mbuf = NULL;
>       struct vhost_virtqueue *vq;
>       struct vring_desc *desc;
>       uint64_t vb_addr = 0;
> @@ -788,11 +828,34 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, 
> uint16_t queue_id,
>       if (unlikely(vq->enabled == 0))
>               return 0;
>  
> +     /*
> +      * Construct a RARP broadcast packet, and inject it to the "pkts"
> +      * array, to make it look like the guest actually sent such a packet.
> +      *
> +      * Check user_send_rarp() for more information.
> +      */
> +     if (unlikely(rte_atomic16_cmpset((volatile uint16_t *)
> +                                      &dev->broadcast_rarp.cnt, 1, 0))) {
> +             rarp_mbuf = rte_pktmbuf_alloc(mbuf_pool);
> +             if (rarp_mbuf == NULL) {
> +                     RTE_LOG(ERR, VHOST_DATA,
> +                             "Failed to allocate memory for mbuf.\n");
> +                     return 0;
> +             }
> +
> +             if (make_rarp_packet(rarp_mbuf, &dev->mac)) {
> +                     rte_pktmbuf_free(rarp_mbuf);
> +                     rarp_mbuf = NULL;
> +             } else {
> +                     count -= 1;
> +             }
> +     }
> +
>       avail_idx =  *((volatile uint16_t *)&vq->avail->idx);
>  
>       /* If there are no available buffers then return. */
>       if (vq->last_used_idx == avail_idx)
> -             return 0;
> +             goto out;
>  
>       LOG_DEBUG(VHOST_DATA, "%s (%"PRIu64")\n", __func__,
>               dev->device_fh);
> @@ -983,8 +1046,21 @@ rte_vhost_dequeue_burst(struct virtio_net *dev, 
> uint16_t queue_id,
>       vq->used->idx += entry_success;
>       vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
>                       sizeof(vq->used->idx));
> +
>       /* Kick guest if required. */
>       if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
>               eventfd_write(vq->callfd, (eventfd_t)1);
> +
> +out:
> +     if (unlikely(rarp_mbuf != NULL)) {
> +             /*
> +              * Inject it to the head of "pkts" array, so that switch's mac
> +              * learning table will get updated first.
> +              */
> +             memmove(&pkts[1], pkts, entry_success * sizeof(m));
> +             pkts[0] = rarp_mbuf;
> +             entry_success += 1;
> +     }
> +
>       return entry_success;
>  }
> diff --git a/lib/librte_vhost/vhost_user/vhost-net-user.c 
> b/lib/librte_vhost/vhost_user/vhost-net-user.c
> index de7eecb..df2bd64 100644
> --- a/lib/librte_vhost/vhost_user/vhost-net-user.c
> +++ b/lib/librte_vhost/vhost_user/vhost-net-user.c
> @@ -437,7 +437,7 @@ vserver_message_handler(int connfd, void *dat, int 
> *remove)
>               user_set_vring_enable(ctx, &msg.payload.state);
>               break;
>       case VHOST_USER_SEND_RARP:
> -             user_send_rarp(&msg);
> +             user_send_rarp(ctx, &msg);
>               break;
>  
>       default:
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.c 
> b/lib/librte_vhost/vhost_user/virtio-net-user.c
> index 68b24f4..65b5652 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.c
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.c
> @@ -39,12 +39,6 @@
>  #include <sys/mman.h>
>  #include <sys/types.h>
>  #include <sys/stat.h>
> -#include <sys/ioctl.h>
> -#include <sys/socket.h>
> -#include <net/ethernet.h>
> -#include <netinet/in.h>
> -#include <netinet/if_ether.h>
> -#include <linux/if_packet.h>
>  
>  #include <rte_common.h>
>  #include <rte_log.h>
> @@ -415,120 +409,38 @@ user_set_log_base(struct vhost_device_ctx ctx,
>       return 0;
>  }
>  
> -#define RARP_BUF_SIZE        64
> -
> -static void
> -make_rarp_packet(uint8_t *buf, uint8_t *mac)
> -{
> -     struct ether_header *eth_hdr;
> -     struct ether_arp *rarp;
> -
> -     /* Ethernet header. */
> -     eth_hdr = (struct ether_header *)buf;
> -     memset(&eth_hdr->ether_dhost, 0xff, ETH_ALEN);
> -     memcpy(&eth_hdr->ether_shost, mac,  ETH_ALEN);
> -     eth_hdr->ether_type = htons(ETH_P_RARP);
> -
> -     /* RARP header. */
> -     rarp = (struct ether_arp *)(eth_hdr + 1);
> -     rarp->ea_hdr.ar_hrd = htons(ARPHRD_ETHER);
> -     rarp->ea_hdr.ar_pro = htons(ETHERTYPE_IP);
> -     rarp->ea_hdr.ar_hln = ETH_ALEN;
> -     rarp->ea_hdr.ar_pln = 4;
> -     rarp->ea_hdr.ar_op  = htons(ARPOP_RREQUEST);
> -
> -     memcpy(&rarp->arp_sha, mac, ETH_ALEN);
> -     memset(&rarp->arp_spa, 0x00, 4);
> -     memcpy(&rarp->arp_tha, mac, 6);
> -     memset(&rarp->arp_tpa, 0x00, 4);
> -}
> -
> -
> -static void
> -send_rarp(const char *ifname, uint8_t *rarp)
> -{
> -     int fd;
> -     struct ifreq ifr;
> -     struct sockaddr_ll addr;
> -
> -     fd = socket(AF_PACKET, SOCK_RAW, 0);
> -     if (fd < 0) {
> -             perror("socket failed");
> -             return;
> -     }
> -
> -     memset(&ifr, 0, sizeof(struct ifreq));
> -     strncpy(ifr.ifr_name, ifname, IFNAMSIZ);
> -     if (ioctl(fd, SIOCGIFINDEX, &ifr) < 0) {
> -             perror("failed to get interface index");
> -             close(fd);
> -             return;
> -     }
> -
> -     addr.sll_ifindex = ifr.ifr_ifindex;
> -     addr.sll_halen   = ETH_ALEN;
> -
> -     if (sendto(fd, rarp, RARP_BUF_SIZE, 0,
> -                (const struct sockaddr*)&addr, sizeof(addr)) < 0) {
> -             perror("send rarp packet failed");
> -     }
> -}
> -
> -
>  /*
> - * Broadcast a RARP message to all interfaces, to update
> - * switch's mac table
> + * A RARP packet is constructed and broadcast to notify switches about
> + * the new location of the migrated VM, so that packets from outside will
> + * not be lost after migration.
> + *
> + * However, we don't actually "send" a RARP packet here; instead, we set
> + * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
>   */
>  int
> -user_send_rarp(struct VhostUserMsg *msg)
> +user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *msg)
>  {
> +     struct virtio_net *dev;
>       uint8_t *mac = (uint8_t *)&msg->payload.u64;
> -     uint8_t rarp[RARP_BUF_SIZE];
> -     struct ifconf ifc = {0, };
> -     struct ifreq *ifr;
> -     int nr = 16;
> -     int fd;
> -     uint32_t i;
> +
> +     dev = get_device(ctx);
> +     if (!dev)
> +             return -1;
>  
>       RTE_LOG(DEBUG, VHOST_CONFIG,
>               ":: mac: %02x:%02x:%02x:%02x:%02x:%02x\n",
>               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
> -
> -     make_rarp_packet(rarp, mac);
> +     memcpy(dev->mac.addr_bytes, mac, 6);
>  
>       /*
> -      * Get all interfaces
> +      * Set the flag to inject a RARP broadcast packet at
> +      * rte_vhost_dequeue_burst().
> +      *
> +      * rte_smp_wmb() is for making sure the mac is copied
> +      * before the flag is set.
>        */
> -     fd = socket(AF_INET, SOCK_DGRAM, 0);
> -     if (fd < 0) {
> -             perror("failed to create AF_INET socket");
> -             return -1;
> -     }
> -
> -again:
> -     ifc.ifc_len = sizeof(*ifr) * nr;
> -     ifc.ifc_buf = realloc(ifc.ifc_buf, ifc.ifc_len);
> -
> -     if (ioctl(fd, SIOCGIFCONF, &ifc) < 0) {
> -             perror("failed at SIOCGIFCONF");
> -             close(fd);
> -             return -1;
> -     }
> -
> -     if (ifc.ifc_len == (int)sizeof(struct ifreq) * nr) {
> -             /*
> -              * current ifc_buf is not big enough to hold
> -              * all interfaces; double it and try again.
> -              */
> -             nr *= 2;
> -             goto again;
> -     }
> -
> -     ifr = (struct ifreq *)ifc.ifc_buf;
> -     for (i = 0; i < ifc.ifc_len / sizeof(struct ifreq); i++)
> -             send_rarp(ifr[i].ifr_name, rarp);
> -
> -     close(fd);
> +     rte_smp_wmb();
> +     rte_atomic16_set(&dev->broadcast_rarp, 1);
>  
>       return 0;
>  }
> diff --git a/lib/librte_vhost/vhost_user/virtio-net-user.h 
> b/lib/librte_vhost/vhost_user/virtio-net-user.h
> index 559bb46..cefec16 100644
> --- a/lib/librte_vhost/vhost_user/virtio-net-user.h
> +++ b/lib/librte_vhost/vhost_user/virtio-net-user.h
> @@ -54,7 +54,7 @@ void user_set_vring_kick(struct vhost_device_ctx, struct 
> VhostUserMsg *);
>  void user_set_protocol_features(struct vhost_device_ctx ctx,
>                               uint64_t protocol_features);
>  int user_set_log_base(struct vhost_device_ctx ctx, struct VhostUserMsg *);
> -int user_send_rarp(struct VhostUserMsg *);
> +int user_send_rarp(struct vhost_device_ctx ctx, struct VhostUserMsg *);
>  
>  int user_get_vring_base(struct vhost_device_ctx, struct vhost_vring_state *);
>  

Reply via email to