Thanks for the review.  I removed inline from
netdev_linux_sock_batch_send() and applied these patches to master.

On Thu, Aug 03, 2017 at 09:40:36AM +0800, Gao Zhenyu wrote:
> Thanks for working on it. LGTM.
> 
> Could you please add inline for function netdev_linux_tap_batch_send?  That
> makes it looks like function netdev_linux_sock_batch_send.
> Or eliminate inline for function netdev_linux_sock_batch_send.
> 
> Thanks
> Zhenyu Gao
> 
> 2017-08-03 5:58 GMT+08:00 Ben Pfaff <[email protected]>:
> 
> > From: Zhenyu Gao <[email protected]>
> >
> > Sendmmsg can reduce cpu cycles in sending packets to kernel.
> > Replace sendmsg with sendmmsg in function netdev_linux_send to send
> > batch packets if sendmmsg is available.
> >
> > If kernel side doesn't support sendmmsg, will fallback to sendmsg.
> >
> >     netserver
> > |------------|
> > |            |
> > |  container |
> > |----veth----|
> >           |
> >           |        |------------|
> >           |---veth-|   dpdk-ovs |      netperf
> >                    |            |  |--------------|
> >                    |----dpdk----|  | bare-metal   |
> >                          |         |--------------|
> >                          |              |
> >                          |              |
> >                         pnic-----------pnic
> >
> > Netperf was consumed to test the performance:
> >
> > 1)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 1400
> > result: netserver received 2383.21Mb(sendmsg)/2551.64Mb(sendmmsg)
> >
> > 2)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 60
> > result: netserver received 109.72Mb(sendmsg)/115.18Mb(sendmmsg)
> >
> > Sendmmsg show about 6% improvement in netperf UDP testing.
> >
> > Signed-off-by: Zhenyu Gao <[email protected]>
> > Signed-off-by: Ben Pfaff <[email protected]>
> > ---
> >  lib/netdev-linux.c | 163 +++++++++++++++++++++++++++++-
> > -----------------------
> >  1 file changed, 89 insertions(+), 74 deletions(-)
> >
> > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
> > index 5b4c643e4eec..19572a5bc949 100644
> > --- a/lib/netdev-linux.c
> > +++ b/lib/netdev-linux.c
> > @@ -1182,13 +1182,89 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_)
> >      }
> >  }
> >
> > -/* Sends 'buffer' on 'netdev'.  Returns 0 if successful, otherwise a
> > positive
> > +static inline int
> > +netdev_linux_sock_batch_send(int sock, int ifindex,
> > +                             struct dp_packet_batch *batch)
> > +{
> > +    /* We don't bother setting most fields in sockaddr_ll because the
> > +     * kernel ignores them for SOCK_RAW. */
> > +    struct sockaddr_ll sll = { .sll_family = AF_PACKET,
> > +                               .sll_ifindex = ifindex };
> > +
> > +    struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * batch->count);
> > +    struct iovec *iov = xmalloc(sizeof(*iov) * batch->count);
> > +
> > +    for (int i = 0; i < batch->count; i++) {
> > +        struct dp_packet *packet = batch->packets[i];
> > +        iov[i].iov_base = dp_packet_data(packet);
> > +        iov[i].iov_len = dp_packet_get_send_len(packet);
> > +        mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll,
> > +                                            .msg_namelen = sizeof sll,
> > +                                            .msg_iov = &iov[i],
> > +                                            .msg_iovlen = 1 };
> > +    }
> > +
> > +    int error = 0;
> > +    for (uint32_t ofs = 0; ofs < batch->count; ) {
> > +        ssize_t retval;
> > +        do {
> > +            retval = sendmmsg(sock, mmsg + ofs, batch->count - ofs, 0);
> > +            error = retval < 0 ? errno : 0;
> > +        } while (error == EINTR);
> > +        if (error) {
> > +            break;
> > +        }
> > +        ofs += retval;
> > +    }
> > +
> > +    free(mmsg);
> > +    free(iov);
> > +    return error;
> > +}
> > +
> > +/* Use the tap fd to send 'batch' to tap device 'netdev'.  Using the tap
> > fd is
> > + * essential, because packets sent to a tap device with an AF_PACKET
> > socket
> > + * will loop back to be *received* again on the tap device.  This doesn't
> > occur
> > + * on other interface types because we attach a socket filter to the rx
> > + * socket. */
> > +static int
> > +netdev_linux_tap_batch_send(struct netdev *netdev_,
> > +                            struct dp_packet_batch *batch)
> > +{
> > +    struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> > +    for (int i = 0; i < batch->count; i++) {
> > +        struct dp_packet *packet = batch->packets[i];
> > +        size_t size = dp_packet_get_send_len(packet);
> > +        ssize_t retval;
> > +        int error;
> > +
> > +        do {
> > +            retval = write(netdev->tap_fd, dp_packet_data(packet), size);
> > +            error = retval < 0 ? errno : 0;
> > +        } while (error == EINTR);
> > +
> > +        if (error) {
> > +            /* The Linux tap driver returns EIO if the device is not up.
> > From
> > +             * the OVS side this is not an error, so we ignore it;
> > otherwise,
> > +             * return the erro. */
> > +            if (error != EIO) {
> > +                return error;
> > +            }
> > +        } else if (retval != size) {
> > +            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" "
> > +                         "bytes of %"PRIuSIZE") on %s",
> > +                         retval, size, netdev_get_name(netdev_));
> > +            return EMSGSIZE;
> > +        }
> > +    }
> > +    return 0;
> > +}
> > +
> > +/* Sends 'batch' on 'netdev'.  Returns 0 if successful, otherwise a
> > positive
> >   * errno value.  Returns EAGAIN without blocking if the packet cannot be
> > queued
> >   * immediately.  Returns EMSGSIZE if a partial packet was transmitted or
> > if
> >   * the packet is too big or too small to transmit on the device.
> >   *
> > - * The caller retains ownership of 'buffer' in all cases.
> > - *
> >   * The kernel maintains a packet transmission queue, so the caller is not
> >   * expected to do additional queuing of packets. */
> >  static int
> > @@ -1199,8 +1275,6 @@ netdev_linux_send(struct netdev *netdev_, int qid
> > OVS_UNUSED,
> >      int error = 0;
> >      int sock = 0;
> >
> > -    struct sockaddr_ll sll;
> > -    struct msghdr msg;
> >      if (!is_tap_netdev(netdev_)) {
> >          sock = af_packet_sock();
> >          if (sock < 0) {
> > @@ -1214,84 +1288,25 @@ netdev_linux_send(struct netdev *netdev_, int qid
> > OVS_UNUSED,
> >              goto free_batch;
> >          }
> >
> > -        /* We don't bother setting most fields in sockaddr_ll because the
> > -         * kernel ignores them for SOCK_RAW. */
> > -        memset(&sll, 0, sizeof sll);
> > -        sll.sll_family = AF_PACKET;
> > -        sll.sll_ifindex = ifindex;
> > -
> > -        msg.msg_name = &sll;
> > -        msg.msg_namelen = sizeof sll;
> > -        msg.msg_iovlen = 1;
> > -        msg.msg_control = NULL;
> > -        msg.msg_controllen = 0;
> > -        msg.msg_flags = 0;
> > +        error = netdev_linux_sock_batch_send(sock, ifindex, batch);
> > +    } else {
> > +        error = netdev_linux_tap_batch_send(netdev_, batch);
> >      }
> > -
> > -    /* 'i' is incremented only if there's no error */
> > -    for (int i = 0; i < batch->count; ) {
> > -        const void *data = dp_packet_data(batch->packets[i]);
> > -        size_t size = dp_packet_get_send_len(batch->packets[i]);
> > -        ssize_t retval;
> > -
> > -        if (!is_tap_netdev(netdev_)) {
> > -            /* Use our AF_PACKET socket to send to this device. */
> > -            struct iovec iov;
> > -
> > -            iov.iov_base = CONST_CAST(void *, data);
> > -            iov.iov_len = size;
> > -
> > -            msg.msg_iov = &iov;
> > -
> > -            retval = sendmsg(sock, &msg, 0);
> > +    if (error) {
> > +        if (error == ENOBUFS) {
> > +            /* The Linux AF_PACKET implementation never blocks waiting
> > +             * for room for packets, instead returning ENOBUFS.
> > +             * Translate this into EAGAIN for the caller. */
> > +            error = EAGAIN;
> >          } else {
> > -            /* Use the tap fd to send to this device.  This is essential
> > for
> > -             * tap devices, because packets sent to a tap device with an
> > -             * AF_PACKET socket will loop back to be *received* again on
> > the
> > -             * tap device.  This doesn't occur on other interface types
> > -             * because we attach a socket filter to the rx socket. */
> > -            struct netdev_linux *netdev = netdev_linux_cast(netdev_);
> > -
> > -            retval = write(netdev->tap_fd, data, size);
> > -        }
> > -
> > -        if (retval < 0) {
> > -            if (errno == EINTR) {
> > -                /* The send was interrupted by a signal.  Retry the
> > packet by
> > -                 * continuing without incrementing 'i'.*/
> > -                continue;
> > -            } else if (errno == EIO && is_tap_netdev(netdev_)) {
> > -                /* The Linux tap driver returns EIO if the device is not
> > up.
> > -                 * From the OVS side this is not an error, so ignore it.
> > */
> > -            } else {
> > -                /* The Linux AF_PACKET implementation never blocks
> > waiting for
> > -                 * room for packets, instead returning ENOBUFS.
> > Translate this
> > -                 * into EAGAIN for the caller. */
> > -                error = errno == ENOBUFS ? EAGAIN : errno;
> > -                break;
> > -            }
> > -        } else if (retval != size) {
> > -            VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE"
> > bytes"
> > -                              " of %"PRIuSIZE") on %s", retval, size,
> > -                         netdev_get_name(netdev_));
> > -            error = EMSGSIZE;
> > -            break;
> > -        }
> > -
> > -        /* Process the next packet in the batch */
> > -        i++;
> > -    }
> > -
> > -    if (error && error != EAGAIN) {
> >              VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s",
> >                           netdev_get_name(netdev_), ovs_strerror(error));
> > +        }
> >      }
> >
> >  free_batch:
> >      dp_packet_delete_batch(batch, may_steal);
> > -
> >      return error;
> > -
> >  }
> >
> >  /* Registers with the poll loop to wake up from the next call to
> > poll_block()
> > --
> > 2.10.2
> >
> >
_______________________________________________
dev mailing list
[email protected]
https://mail.openvswitch.org/mailman/listinfo/ovs-dev

Reply via email to