Thanks for the review. I removed inline from netdev_linux_sock_batch_send() and applied these patches to master.
On Thu, Aug 03, 2017 at 09:40:36AM +0800, Gao Zhenyu wrote: > Thanks for working on it. LGTM. > > Could you please add inline for function netdev_linux_tap_batch_send? That > makes it looks like function netdev_linux_sock_batch_send. > Or eliminate inline for function netdev_linux_sock_batch_send. > > Thanks > Zhenyu Gao > > 2017-08-03 5:58 GMT+08:00 Ben Pfaff <[email protected]>: > > > From: Zhenyu Gao <[email protected]> > > > > Sendmmsg can reduce cpu cycles in sending packets to kernel. > > Replace sendmsg with sendmmsg in function netdev_linux_send to send > > batch packets if sendmmsg is available. > > > > If kernel side doesn't support sendmmsg, will fallback to sendmsg. > > > > netserver > > |------------| > > | | > > | container | > > |----veth----| > > | > > | |------------| > > |---veth-| dpdk-ovs | netperf > > | | |--------------| > > |----dpdk----| | bare-metal | > > | |--------------| > > | | > > | | > > pnic-----------pnic > > > > Netperf was consumed to test the performance: > > > > 1)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 1400 > > result: netserver received 2383.21Mb(sendmsg)/2551.64Mb(sendmmsg) > > > > 2)cmd:netperf -H remote-container -t UDP_STREAM -l 60 -- -m 60 > > result: netserver received 109.72Mb(sendmsg)/115.18Mb(sendmmsg) > > > > Sendmmsg show about 6% improvement in netperf UDP testing. > > > > Signed-off-by: Zhenyu Gao <[email protected]> > > Signed-off-by: Ben Pfaff <[email protected]> > > --- > > lib/netdev-linux.c | 163 +++++++++++++++++++++++++++++- > > ----------------------- > > 1 file changed, 89 insertions(+), 74 deletions(-) > > > > diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c > > index 5b4c643e4eec..19572a5bc949 100644 > > --- a/lib/netdev-linux.c > > +++ b/lib/netdev-linux.c > > @@ -1182,13 +1182,89 @@ netdev_linux_rxq_drain(struct netdev_rxq *rxq_) > > } > > } > > > > -/* Sends 'buffer' on 'netdev'. Returns 0 if successful, otherwise a > > positive > > +static inline int > > +netdev_linux_sock_batch_send(int sock, int ifindex, > > + struct dp_packet_batch *batch) > > +{ > > + /* We don't bother setting most fields in sockaddr_ll because the > > + * kernel ignores them for SOCK_RAW. */ > > + struct sockaddr_ll sll = { .sll_family = AF_PACKET, > > + .sll_ifindex = ifindex }; > > + > > + struct mmsghdr *mmsg = xmalloc(sizeof(*mmsg) * batch->count); > > + struct iovec *iov = xmalloc(sizeof(*iov) * batch->count); > > + > > + for (int i = 0; i < batch->count; i++) { > > + struct dp_packet *packet = batch->packets[i]; > > + iov[i].iov_base = dp_packet_data(packet); > > + iov[i].iov_len = dp_packet_get_send_len(packet); > > + mmsg[i].msg_hdr = (struct msghdr) { .msg_name = &sll, > > + .msg_namelen = sizeof sll, > > + .msg_iov = &iov[i], > > + .msg_iovlen = 1 }; > > + } > > + > > + int error = 0; > > + for (uint32_t ofs = 0; ofs < batch->count; ) { > > + ssize_t retval; > > + do { > > + retval = sendmmsg(sock, mmsg + ofs, batch->count - ofs, 0); > > + error = retval < 0 ? errno : 0; > > + } while (error == EINTR); > > + if (error) { > > + break; > > + } > > + ofs += retval; > > + } > > + > > + free(mmsg); > > + free(iov); > > + return error; > > +} > > + > > +/* Use the tap fd to send 'batch' to tap device 'netdev'. Using the tap > > fd is > > + * essential, because packets sent to a tap device with an AF_PACKET > > socket > > + * will loop back to be *received* again on the tap device. This doesn't > > occur > > + * on other interface types because we attach a socket filter to the rx > > + * socket. */ > > +static int > > +netdev_linux_tap_batch_send(struct netdev *netdev_, > > + struct dp_packet_batch *batch) > > +{ > > + struct netdev_linux *netdev = netdev_linux_cast(netdev_); > > + for (int i = 0; i < batch->count; i++) { > > + struct dp_packet *packet = batch->packets[i]; > > + size_t size = dp_packet_get_send_len(packet); > > + ssize_t retval; > > + int error; > > + > > + do { > > + retval = write(netdev->tap_fd, dp_packet_data(packet), size); > > + error = retval < 0 ? errno : 0; > > + } while (error == EINTR); > > + > > + if (error) { > > + /* The Linux tap driver returns EIO if the device is not up. > > From > > + * the OVS side this is not an error, so we ignore it; > > otherwise, > > + * return the erro. */ > > + if (error != EIO) { > > + return error; > > + } > > + } else if (retval != size) { > > + VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" " > > + "bytes of %"PRIuSIZE") on %s", > > + retval, size, netdev_get_name(netdev_)); > > + return EMSGSIZE; > > + } > > + } > > + return 0; > > +} > > + > > +/* Sends 'batch' on 'netdev'. Returns 0 if successful, otherwise a > > positive > > * errno value. Returns EAGAIN without blocking if the packet cannot be > > queued > > * immediately. Returns EMSGSIZE if a partial packet was transmitted or > > if > > * the packet is too big or too small to transmit on the device. > > * > > - * The caller retains ownership of 'buffer' in all cases. > > - * > > * The kernel maintains a packet transmission queue, so the caller is not > > * expected to do additional queuing of packets. */ > > static int > > @@ -1199,8 +1275,6 @@ netdev_linux_send(struct netdev *netdev_, int qid > > OVS_UNUSED, > > int error = 0; > > int sock = 0; > > > > - struct sockaddr_ll sll; > > - struct msghdr msg; > > if (!is_tap_netdev(netdev_)) { > > sock = af_packet_sock(); > > if (sock < 0) { > > @@ -1214,84 +1288,25 @@ netdev_linux_send(struct netdev *netdev_, int qid > > OVS_UNUSED, > > goto free_batch; > > } > > > > - /* We don't bother setting most fields in sockaddr_ll because the > > - * kernel ignores them for SOCK_RAW. */ > > - memset(&sll, 0, sizeof sll); > > - sll.sll_family = AF_PACKET; > > - sll.sll_ifindex = ifindex; > > - > > - msg.msg_name = &sll; > > - msg.msg_namelen = sizeof sll; > > - msg.msg_iovlen = 1; > > - msg.msg_control = NULL; > > - msg.msg_controllen = 0; > > - msg.msg_flags = 0; > > + error = netdev_linux_sock_batch_send(sock, ifindex, batch); > > + } else { > > + error = netdev_linux_tap_batch_send(netdev_, batch); > > } > > - > > - /* 'i' is incremented only if there's no error */ > > - for (int i = 0; i < batch->count; ) { > > - const void *data = dp_packet_data(batch->packets[i]); > > - size_t size = dp_packet_get_send_len(batch->packets[i]); > > - ssize_t retval; > > - > > - if (!is_tap_netdev(netdev_)) { > > - /* Use our AF_PACKET socket to send to this device. */ > > - struct iovec iov; > > - > > - iov.iov_base = CONST_CAST(void *, data); > > - iov.iov_len = size; > > - > > - msg.msg_iov = &iov; > > - > > - retval = sendmsg(sock, &msg, 0); > > + if (error) { > > + if (error == ENOBUFS) { > > + /* The Linux AF_PACKET implementation never blocks waiting > > + * for room for packets, instead returning ENOBUFS. > > + * Translate this into EAGAIN for the caller. */ > > + error = EAGAIN; > > } else { > > - /* Use the tap fd to send to this device. This is essential > > for > > - * tap devices, because packets sent to a tap device with an > > - * AF_PACKET socket will loop back to be *received* again on > > the > > - * tap device. This doesn't occur on other interface types > > - * because we attach a socket filter to the rx socket. */ > > - struct netdev_linux *netdev = netdev_linux_cast(netdev_); > > - > > - retval = write(netdev->tap_fd, data, size); > > - } > > - > > - if (retval < 0) { > > - if (errno == EINTR) { > > - /* The send was interrupted by a signal. Retry the > > packet by > > - * continuing without incrementing 'i'.*/ > > - continue; > > - } else if (errno == EIO && is_tap_netdev(netdev_)) { > > - /* The Linux tap driver returns EIO if the device is not > > up. > > - * From the OVS side this is not an error, so ignore it. > > */ > > - } else { > > - /* The Linux AF_PACKET implementation never blocks > > waiting for > > - * room for packets, instead returning ENOBUFS. > > Translate this > > - * into EAGAIN for the caller. */ > > - error = errno == ENOBUFS ? EAGAIN : errno; > > - break; > > - } > > - } else if (retval != size) { > > - VLOG_WARN_RL(&rl, "sent partial Ethernet packet (%"PRIuSIZE" > > bytes" > > - " of %"PRIuSIZE") on %s", retval, size, > > - netdev_get_name(netdev_)); > > - error = EMSGSIZE; > > - break; > > - } > > - > > - /* Process the next packet in the batch */ > > - i++; > > - } > > - > > - if (error && error != EAGAIN) { > > VLOG_WARN_RL(&rl, "error sending Ethernet packet on %s: %s", > > netdev_get_name(netdev_), ovs_strerror(error)); > > + } > > } > > > > free_batch: > > dp_packet_delete_batch(batch, may_steal); > > - > > return error; > > - > > } > > > > /* Registers with the poll loop to wake up from the next call to > > poll_block() > > -- > > 2.10.2 > > > > _______________________________________________ dev mailing list [email protected] https://mail.openvswitch.org/mailman/listinfo/ovs-dev
