On Tue, Mar 18, 2014 at 3:56 PM, Jarno Rajahalme <[email protected]> wrote:
>
> On Mar 18, 2014, at 1:53 PM, Pravin <[email protected]> wrote:
>
>> DPDK can receive multiple packets but current netdev API does
>> not allow that. Following patch allows dpif-netdev receive batch
>> of packet in a rx_recv() call for any netdev port. This will be
>> used by dpdk-netdev.
>>
>> Signed-off-by: Pravin B Shelar <[email protected]>
>> ---
>> lib/automake.mk | 1 +
>> lib/dpif-netdev.c | 51
>> ++++++++++++++++++-------------------------------
>> lib/dpif-netdev.h | 40 ++++++++++++++++++++++++++++++++++++++
>> lib/netdev-dummy.c | 26 +++++++++----------------
>> lib/netdev-linux.c | 20 ++++++++++++++++---
>> lib/netdev-provider.h | 24 +++++------------------
>> lib/netdev.c | 24 +++++------------------
>> lib/netdev.h | 2 +-
>> lib/packets.c | 9 +++++++++
>> lib/packets.h | 1 +
>> 10 files changed, 107 insertions(+), 91 deletions(-)
>> create mode 100644 lib/dpif-netdev.h
>>
>> diff --git a/lib/automake.mk b/lib/automake.mk
>> index e22165c..832b7f9 100644
>> --- a/lib/automake.mk
>> +++ b/lib/automake.mk
>> @@ -52,6 +52,7 @@ lib_libopenvswitch_la_SOURCES = \
>> lib/dhparams.h \
>> lib/dirs.h \
>> lib/dpif-netdev.c \
>> + lib/dpif-netdev.h \
>> lib/dpif-provider.h \
>> lib/dpif.c \
>> lib/dpif.h \
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 3bbfd2a..d78fb25 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -34,6 +34,7 @@
>> #include "classifier.h"
>> #include "csum.h"
>> #include "dpif.h"
>> +#include "dpif-netdev.h"
>> #include "dpif-provider.h"
>> #include "dummy.h"
>> #include "dynamic-string.h"
>> @@ -68,10 +69,6 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>> /* Configuration parameters. */
>> enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
>>
>> -/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
>> - * headers to be aligned on a 4-byte boundary. */
>> -enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
>> -
>> /* Queues. */
>> enum { N_QUEUES = 2 }; /* Number of queues for dpif_recv(). */
>> enum { MAX_QUEUE_LEN = 128 }; /* Maximum number of packets per queue. */
>> @@ -1443,6 +1440,7 @@ dpif_netdev_execute(struct dpif *dpif, struct
>> dpif_execute *execute)
>> {
>> struct dp_netdev *dp = get_dp_netdev(dpif);
>> struct pkt_metadata *md = &execute->md;
>> + struct ofpbuf *packet;
>> struct flow key;
>>
>> if (execute->packet->size < ETH_HEADER_LEN ||
>> @@ -1453,8 +1451,9 @@
(struct dpif *dpif, struct dpif_execute *execute)
>> /* Extract flow key. */
>> flow_extract(execute->packet, md, &key);
>>
>> + packet = ofpbuf_clone(execute->packet);
>> ovs_rwlock_rdlock(&dp->port_rwlock);
>> - dp_netdev_execute_actions(dp, &key, execute->packet, md,
>> execute->actions,
>> + dp_netdev_execute_actions(dp, &key, packet, md, execute->actions,
>> execute->actions_len);
>> ovs_rwlock_unlock(&dp->port_rwlock);
>>
>
> It is not clear to me why the packet should be cloned here, especially since
> you are already copying the data in the userspace action. Also, it seems this
> will leak memory in cases where there is no OUTPUT or USERSPACE action, as
> the dp_execute_action() will not be called in those cases (e.g., a drop flow).
>
>
In Model I am following DKDP dpdk-recv generate ofpbuf and send consumes it.
That means dpdk-send calls ofpbuf_delete() on every buffer dpif-netdev
sends to it.
Here in dp_execute_action() I need to clone ofpbuf so that dpdk-send
can release it. execute->packet passed can not freed by
ofpbuf_delete().
I missed drop case, I will fix the code.
>> @@ -1586,12 +1585,10 @@ dp_forwarder_main(void *f_)
>> {
>> struct dp_forwarder *f = f_;
>> struct dp_netdev *dp = f->dp;
>> - struct ofpbuf packet;
>>
>> f->name = xasprintf("forwarder_%u", ovsthread_id_self());
>> set_subprogram_name("%s", f->name);
>>
>> - ofpbuf_init(&packet, 0);
>> while (!latch_is_set(&dp->exit_latch)) {
>> bool received_anything;
>> int i;
>> @@ -1605,25 +1602,19 @@ dp_forwarder_main(void *f_)
>> if (port->rx
>> && port->node.hash >= f->min_hash
>> && port->node.hash <= f->max_hash) {
>> - int buf_size;
>> + struct ofpbuf *packets[MAX_RX_BATCH];
>> + int count;
>> int error;
>> - int mtu;
>> -
>> - if (netdev_get_mtu(port->netdev, &mtu)) {
>> - mtu = ETH_PAYLOAD_MAX;
>> - }
>> - buf_size = DP_NETDEV_HEADROOM + VLAN_ETH_HEADER_LEN +
>> mtu;
>> -
>> - ofpbuf_clear(&packet);
>> - ofpbuf_reserve_with_tailroom(&packet,
>> DP_NETDEV_HEADROOM,
>> - buf_size);
>>
>> - error = netdev_rx_recv(port->rx, &packet);
>> + error = netdev_rx_recv(port->rx, packets, &count);
>> if (!error) {
>> + int i;
>> struct pkt_metadata md
>> = PKT_METADATA_INITIALIZER(port->port_no);
>>
>> - dp_netdev_port_input(dp, &packet, &md);
>> + for (i = 0; i < count; i++) {
>> + dp_netdev_port_input(dp, packets[i], &md);
>> + }
>> received_anything = true;
>> } else if (error != EAGAIN && error != EOPNOTSUPP) {
>> static struct vlog_rate_limit rl
>> @@ -1659,7 +1650,6 @@ dp_forwarder_main(void *f_)
>>
>> poll_block();
>> }
>> - ofpbuf_uninit(&packet);
>>
>> free(f->name);
>>
>> @@ -1741,6 +1731,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct
>> ofpbuf *packet,
>> } else {
>> ovsthread_counter_inc(dp->n_missed, 1);
>> dp_netdev_output_userspace(dp, packet, DPIF_UC_MISS, &key, NULL);
>> + ofpbuf_delete(packet);
>> }
>
> Maybe this should be done in both cases (see below).
>
I want dpdk-send release buffer, therefore I will fix drop case to fix
the issue.
>> }
>>
>> @@ -1767,6 +1758,7 @@ dp_netdev_output_userspace(struct dp_netdev *dp,
>> struct ofpbuf *packet,
>> if (userdata) {
>> buf_size += NLA_ALIGN(userdata->nla_len);
>> }
>> + buf_size += packet->size;
>> ofpbuf_init(buf, buf_size);
>>
>> /* Put ODP flow. */
>> @@ -1780,10 +1772,8 @@ dp_netdev_output_userspace(struct dp_netdev *dp,
>> struct ofpbuf *packet,
>> NLA_ALIGN(userdata->nla_len));
>> }
>>
>> - /* Steal packet data. */
>> - ovs_assert(packet->source == OFPBUF_MALLOC);
>> - upcall->packet = *packet;
>> - ofpbuf_use(packet, NULL, 0);
>> + upcall->packet.data = ofpbuf_put(buf, packet->data, packet->size);
>> + upcall->packet.size = packet->size;
>>
>
> I do not see a reason in this patch why the packet data should be copied
> here. However, I can guess that the reason for it comes apparent with the
> later patches?
>
I would like to directly pass packet pointer, but that requires lot of
changes due to struct dpif_upcall change. since this is slow path I am
not worried about this case at this point.
>> seq_change(dp->queue_seq);
>>
>> @@ -1825,15 +1815,8 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
>>
>> userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
>>
>> - /* Make a copy if we are not allowed to steal the packet's data. */
>> - if (!may_steal) {
>> - packet = ofpbuf_clone_with_headroom(packet, DP_NETDEV_HEADROOM);
>> - }
>> dp_netdev_output_userspace(aux->dp, packet, DPIF_UC_ACTION, aux->key,
>> userdata);
>> - if (!may_steal) {
>> - ofpbuf_uninit(packet);
>> - }
>> break;
>> }
>> case OVS_ACTION_ATTR_PUSH_VLAN:
>> @@ -1846,6 +1829,10 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
>> case __OVS_ACTION_ATTR_MAX:
>> OVS_NOT_REACHED();
>> }
>> +
>> + if (may_steal) {
>> + ofpbuf_delete(packet);
>> + }
>
> You can not rely on this to happen for all the packets. It seems you could
> delete this and the cloning in dpif_netdev_execute(), and have
> dp_port_input() do the delete also after execution?
>
right, I need to fix cases like drop.
As mentioned earlier I want dpdk-send delete the packet. But I see
this has made code bit complicated I will see if it can be simplified.
>> }
>>
>> static void
>> diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
>> new file mode 100644
>> index 0000000..64c93ee
>> --- /dev/null
>> +++ b/lib/dpif-netdev.h
>> @@ -0,0 +1,40 @@
>> +/*
>> + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + * http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef DPIF_NETDEV_H
>> +#define DPIF_NETDEV_H 1
>> +
>> +#include <stdbool.h>
>> +#include <stddef.h>
>> +#include <stdint.h>
>> +#include "openvswitch/types.h"
>> +#include "packets.h"
>> +
>> +#ifdef __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
>> + * headers to be aligned on a 4-byte boundary. */
>> +enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
>> +
>> +enum { MAX_RX_BATCH = 256 }; /* Maximum number of flows in flow table.
>> */
>> +
>> +#ifdef __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* netdev.h */
>> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
>> index f23fc9f..27487f9 100644
>> --- a/lib/netdev-dummy.c
>> +++ b/lib/netdev-dummy.c
>> @@ -755,12 +755,11 @@ netdev_dummy_rx_dealloc(struct netdev_rx *rx_)
>> }
>>
>> static int
>> -netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
>> +netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf **arr, int *c)
>> {
>> struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
>> struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
>> struct ofpbuf *packet;
>> - int retval;
>>
>> ovs_mutex_lock(&netdev->mutex);
>> if (!list_is_empty(&rx->recv_queue)) {
>> @@ -774,22 +773,15 @@ netdev_dummy_rx_recv(struct netdev_rx *rx_, struct
>> ofpbuf *buffer)
>> if (!packet) {
>> return EAGAIN;
>> }
>> + ovs_mutex_lock(&netdev->mutex);
>> + netdev->stats.rx_packets++;
>> + netdev->stats.rx_bytes += packet->size;
>> + ovs_mutex_unlock(&netdev->mutex);
>>
>> - if (packet->size <= ofpbuf_tailroom(buffer)) {
>> - memcpy(buffer->data, packet->data, packet->size);
>> - buffer->size += packet->size;
>> - retval = 0;
>> -
>> - ovs_mutex_lock(&netdev->mutex);
>> - netdev->stats.rx_packets++;
>> - netdev->stats.rx_bytes += packet->size;
>> - ovs_mutex_unlock(&netdev->mutex);
>> - } else {
>> - retval = EMSGSIZE;
>> - }
>> - ofpbuf_delete(packet);
>> -
>> - return retval;
>> + packet_set_size(packet, packet->size);
>
> The name of this function is a bit misleading, especially since all the users
> already have the correct size...
>
ok. I will change it.
>> + arr[0] = packet;
>> + *c = 1;
>> + return 0;
>> }
>>
>> static void
>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
>> index 75ce7c6..574a572 100644
>> --- a/lib/netdev-linux.c
>> +++ b/lib/netdev-linux.c
>> @@ -50,6 +50,7 @@
>> #include "connectivity.h"
>> #include "coverage.h"
>> #include "dpif-linux.h"
>> +#include "dpif-netdev.h"
>> #include "dynamic-string.h"
>> #include "fatal-signal.h"
>> #include "hash.h"
>> @@ -461,6 +462,7 @@ static int af_packet_sock(void);
>> static bool netdev_linux_miimon_enabled(void);
>> static void netdev_linux_miimon_run(void);
>> static void netdev_linux_miimon_wait(void);
>> +static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
>>
>> static bool
>> is_netdev_linux_class(const struct netdev_class *netdev_class)
>> @@ -984,10 +986,19 @@ netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
>> }
>>
>> static int
>> -netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
>> +netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf **packet, int *c)
>> {
>> struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
>> - int retval;
>> + struct netdev *netdev = rx->up.netdev;
>> + struct ofpbuf *buffer;
>> + ssize_t retval;
>> + int mtu;
>> +
>> + if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
>> + mtu = ETH_PAYLOAD_MAX;
>> + }
>> +
>> + buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu,
>> DP_NETDEV_HEADROOM);
>>
>> retval = (rx->is_tap
>> ? netdev_linux_rx_recv_tap(rx->fd, buffer)
>> @@ -995,8 +1006,11 @@ netdev_linux_rx_recv(struct netdev_rx *rx_, struct
>> ofpbuf *buffer)
>> if (retval && retval != EAGAIN && retval != EMSGSIZE) {
>> VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
>> ovs_strerror(errno), netdev_rx_get_name(rx_));
>> + } else {
>> + packet_set_size(buffer, buffer->size);
>> + packet[0] = buffer;
>> + *c = 1;
>
> I don't see the buffer being released in all the other cases (when it's
> ownership does not transfer to the caller).
>
You mean other than drop packet case?
>> }
>> -
>> return retval;
>> }
>>
>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
>> index 673d3ab..9bacaa0 100644
>> --- a/lib/netdev-provider.h
>> +++ b/lib/netdev-provider.h
>> @@ -634,28 +634,14 @@ struct netdev_class {
>> void (*rx_destruct)(struct netdev_rx *);
>> void (*rx_dealloc)(struct netdev_rx *);
>>
>> - /* Attempts to receive a packet from 'rx' into the tailroom of 'buffer',
>> - * which should initially be empty. If successful, returns 0 and
>> - * increments 'buffer->size' by the number of bytes in the received
>> packet,
>> - * otherwise a positive errno value. Returns EAGAIN immediately if no
>> - * packet is ready to be received.
>> - *
>> - * Must return EMSGSIZE, and discard the packet, if the received packet
>> - * is longer than 'ofpbuf_tailroom(buffer)'.
>> - *
>> - * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to
>> - * add a VLAN header which is obtained out-of-band to the packet. If
>> - * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be
>> - * available for the packet, otherwise it may be used for the packet
>> - * itself.
>> - *
>> - * It is advised that the tailroom of 'buffer' should be
>> - * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an
>> - * out-of-band VLAN header to be added to the packet.
>> + /* Attempts to receive batch of packets from 'rx' and place array of
>> pointers
>> + * into '*pkt'. netdev is responsible for allocating buffers.
>> + * '*cnt' points to packet count for given batch. Once packets are
>> returned
>> + * to caller, netdev should give up ownership of ofbpuf data.
>> *
>> * This function may be set to null if it would always return EOPNOTSUPP
>> * anyhow. */
>> - int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf *buffer);
>> + int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf **pkt, int *cnt);
>>
>> /* Registers with the poll loop to wake up from the next call to
>> * poll_block() when a packet is ready to be received with
>> netdev_rx_recv()
>> diff --git a/lib/netdev.c b/lib/netdev.c
>> index e9c8d8f..a058742 100644
>> --- a/lib/netdev.c
>> +++ b/lib/netdev.c
>> @@ -551,10 +551,7 @@ netdev_rx_close(struct netdev_rx *rx)
>> }
>> }
>>
>> -/* Attempts to receive a packet from 'rx' into the tailroom of 'buffer',
>> which
>> - * must initially be empty. If successful, returns 0 and increments
>> - * 'buffer->size' by the number of bytes in the received packet, otherwise a
>> - * positive errno value.
>> +/* Attempts to receive batch packet from 'rx'.
>> *
>> * Returns EAGAIN immediately if no packet is ready to be received.
>> *
>> @@ -562,10 +559,7 @@ netdev_rx_close(struct netdev_rx *rx)
>> * than 'ofpbuf_tailroom(buffer)'.
>> *
>> * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to
>> - * add a VLAN header which is obtained out-of-band to the packet. If
>> - * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be
>> - * available for the packet, otherwise it may be used for the packet
>> - * itself.
>> + * add a VLAN header which is obtained out-of-band to the packet.
>> *
>> * It is advised that the tailroom of 'buffer' should be
>> * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an
>> @@ -575,23 +569,15 @@ netdev_rx_close(struct netdev_rx *rx)
>> * This function may be set to null if it would always return EOPNOTSUPP
>> * anyhow. */
>> int
>> -netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf *buffer)
>> +netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf **buffers, int *cnt)
>> {
>> int retval;
>>
>> - ovs_assert(buffer->size == 0);
>> - ovs_assert(ofpbuf_tailroom(buffer) >= ETH_TOTAL_MIN);
>> -
>> - retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
>> + retval = rx->netdev->netdev_class->rx_recv(rx, buffers, cnt);
>> if (!retval) {
>> COVERAGE_INC(netdev_received);
>> - if (buffer->size < ETH_TOTAL_MIN) {
>> - ofpbuf_put_zeros(buffer, ETH_TOTAL_MIN - buffer->size);
>> - }
>> - return 0;
>> - } else {
>> - return retval;
>> }
>> + return retval;
>> }
>>
>> /* Arranges for poll_block() to wake up when a packet is ready to be received
>> diff --git a/lib/netdev.h b/lib/netdev.h
>> index 410c35b..bfd8f91 100644
>> --- a/lib/netdev.h
>> +++ b/lib/netdev.h
>> @@ -161,7 +161,7 @@ void netdev_rx_close(struct netdev_rx *);
>>
>> const char *netdev_rx_get_name(const struct netdev_rx *);
>>
>> -int netdev_rx_recv(struct netdev_rx *, struct ofpbuf *);
>> +int netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf **buffers, int *cnt);
>> void netdev_rx_wait(struct netdev_rx *);
>> int netdev_rx_drain(struct netdev_rx *);
>>
>> diff --git a/lib/packets.c b/lib/packets.c
>> index 3f7d6eb..6f2fca6 100644
>> --- a/lib/packets.c
>> +++ b/lib/packets.c
>> @@ -1012,3 +1012,12 @@ void pkt_metadata_from_flow(struct pkt_metadata *md,
>> const struct flow *flow)
>> pkt_metadata_init(md, &flow->tunnel, flow->skb_priority,
>> flow->pkt_mark, &flow->in_port);
>> }
>> +
>> +void
>> +packet_set_size(struct ofpbuf *b, int size)
>> +{
>> + b->size = size;
>> + if (b->size < ETH_TOTAL_MIN) {
>> + ofpbuf_put_zeros(b, ETH_TOTAL_MIN - b->size);
>> + }
>> +}
>> diff --git a/lib/packets.h b/lib/packets.h
>> index e6b3303..f41a934 100644
>> --- a/lib/packets.h
>> +++ b/lib/packets.h
>> @@ -671,5 +671,6 @@ void packet_set_sctp_port(struct ofpbuf *, ovs_be16 src,
>> ovs_be16 dst);
>> uint16_t packet_get_tcp_flags(const struct ofpbuf *, const struct flow *);
>> void packet_format_tcp_flags(struct ds *, uint16_t);
>> const char *packet_tcp_flag_to_string(uint32_t flag);
>> +void packet_set_size(struct ofpbuf *b, int size);
>>
>> #endif /* packets.h */
>> --
>> 1.7.9.5
>>
>> _______________________________________________
>> dev mailing list
>> [email protected]
>> http://openvswitch.org/mailman/listinfo/dev
>
_______________________________________________
dev mailing list
[email protected]
http://openvswitch.org/mailman/listinfo/dev