On Tue, Mar 18, 2014 at 3:56 PM, Jarno Rajahalme <[email protected]> wrote:
>
> On Mar 18, 2014, at 1:53 PM, Pravin <[email protected]> wrote:
>
>> DPDK can receive multiple packets but current netdev API does
>> not allow that.  Following patch allows dpif-netdev receive batch
>> of packet in a rx_recv() call for any netdev port.  This will be
>> used by dpdk-netdev.
>>
>> Signed-off-by: Pravin B Shelar <[email protected]>
>> ---
>> lib/automake.mk       |    1 +
>> lib/dpif-netdev.c     |   51 
>> ++++++++++++++++++-------------------------------
>> lib/dpif-netdev.h     |   40 ++++++++++++++++++++++++++++++++++++++
>> lib/netdev-dummy.c    |   26 +++++++++----------------
>> lib/netdev-linux.c    |   20 ++++++++++++++++---
>> lib/netdev-provider.h |   24 +++++------------------
>> lib/netdev.c          |   24 +++++------------------
>> lib/netdev.h          |    2 +-
>> lib/packets.c         |    9 +++++++++
>> lib/packets.h         |    1 +
>> 10 files changed, 107 insertions(+), 91 deletions(-)
>> create mode 100644 lib/dpif-netdev.h
>>
>> diff --git a/lib/automake.mk b/lib/automake.mk
>> index e22165c..832b7f9 100644
>> --- a/lib/automake.mk
>> +++ b/lib/automake.mk
>> @@ -52,6 +52,7 @@ lib_libopenvswitch_la_SOURCES = \
>>       lib/dhparams.h \
>>       lib/dirs.h \
>>       lib/dpif-netdev.c \
>> +     lib/dpif-netdev.h \
>>       lib/dpif-provider.h \
>>       lib/dpif.c \
>>       lib/dpif.h \
>> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
>> index 3bbfd2a..d78fb25 100644
>> --- a/lib/dpif-netdev.c
>> +++ b/lib/dpif-netdev.c
>> @@ -34,6 +34,7 @@
>> #include "classifier.h"
>> #include "csum.h"
>> #include "dpif.h"
>> +#include "dpif-netdev.h"
>> #include "dpif-provider.h"
>> #include "dummy.h"
>> #include "dynamic-string.h"
>> @@ -68,10 +69,6 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
>> /* Configuration parameters. */
>> enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
>>
>> -/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
>> - * headers to be aligned on a 4-byte boundary.  */
>> -enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
>> -
>> /* Queues. */
>> enum { N_QUEUES = 2 };          /* Number of queues for dpif_recv(). */
>> enum { MAX_QUEUE_LEN = 128 };   /* Maximum number of packets per queue. */
>> @@ -1443,6 +1440,7 @@ dpif_netdev_execute(struct dpif *dpif, struct 
>> dpif_execute *execute)
>> {
>>     struct dp_netdev *dp = get_dp_netdev(dpif);
>>     struct pkt_metadata *md = &execute->md;
>> +    struct ofpbuf *packet;
>>     struct flow key;
>>
>>     if (execute->packet->size < ETH_HEADER_LEN ||
>> @@ -1453,8 +1451,9 @@
(struct dpif *dpif, struct dpif_execute *execute)
>>     /* Extract flow key. */
>>     flow_extract(execute->packet, md, &key);
>>
>> +    packet = ofpbuf_clone(execute->packet);
>>     ovs_rwlock_rdlock(&dp->port_rwlock);
>> -    dp_netdev_execute_actions(dp, &key, execute->packet, md, 
>> execute->actions,
>> +    dp_netdev_execute_actions(dp, &key, packet, md, execute->actions,
>>                               execute->actions_len);
>>     ovs_rwlock_unlock(&dp->port_rwlock);
>>
>
> It is not clear to me why the packet should be cloned here, especially since 
> you are already copying the data in the userspace action. Also, it seems this 
> will leak memory in cases where there is no OUTPUT or USERSPACE action, as 
> the dp_execute_action() will not be called in those cases (e.g., a drop flow).
>
>
In Model I am following DKDP dpdk-recv generate ofpbuf and send consumes it.
That means dpdk-send calls ofpbuf_delete() on every buffer dpif-netdev
sends to it.
Here in dp_execute_action()  I need to clone ofpbuf so that dpdk-send
can release it. execute->packet passed can not freed by
ofpbuf_delete().
I missed drop case, I will fix the code.


>> @@ -1586,12 +1585,10 @@ dp_forwarder_main(void *f_)
>> {
>>     struct dp_forwarder *f = f_;
>>     struct dp_netdev *dp = f->dp;
>> -    struct ofpbuf packet;
>>
>>     f->name = xasprintf("forwarder_%u", ovsthread_id_self());
>>     set_subprogram_name("%s", f->name);
>>
>> -    ofpbuf_init(&packet, 0);
>>     while (!latch_is_set(&dp->exit_latch)) {
>>         bool received_anything;
>>         int i;
>> @@ -1605,25 +1602,19 @@ dp_forwarder_main(void *f_)
>>                 if (port->rx
>>                     && port->node.hash >= f->min_hash
>>                     && port->node.hash <= f->max_hash) {
>> -                    int buf_size;
>> +                    struct ofpbuf *packets[MAX_RX_BATCH];
>> +                    int count;
>>                     int error;
>> -                    int mtu;
>> -
>> -                    if (netdev_get_mtu(port->netdev, &mtu)) {
>> -                        mtu = ETH_PAYLOAD_MAX;
>> -                    }
>> -                    buf_size = DP_NETDEV_HEADROOM + VLAN_ETH_HEADER_LEN + 
>> mtu;
>> -
>> -                    ofpbuf_clear(&packet);
>> -                    ofpbuf_reserve_with_tailroom(&packet, 
>> DP_NETDEV_HEADROOM,
>> -                                                 buf_size);
>>
>> -                    error = netdev_rx_recv(port->rx, &packet);
>> +                    error = netdev_rx_recv(port->rx, packets, &count);
>>                     if (!error) {
>> +                        int i;
>>                         struct pkt_metadata md
>>                             = PKT_METADATA_INITIALIZER(port->port_no);
>>
>> -                        dp_netdev_port_input(dp, &packet, &md);
>> +                        for (i = 0; i < count; i++) {
>> +                            dp_netdev_port_input(dp, packets[i], &md);
>> +                        }
>>                         received_anything = true;
>>                     } else if (error != EAGAIN && error != EOPNOTSUPP) {
>>                         static struct vlog_rate_limit rl
>> @@ -1659,7 +1650,6 @@ dp_forwarder_main(void *f_)
>>
>>         poll_block();
>>     }
>> -    ofpbuf_uninit(&packet);
>>
>>     free(f->name);
>>
>> @@ -1741,6 +1731,7 @@ dp_netdev_port_input(struct dp_netdev *dp, struct 
>> ofpbuf *packet,
>>     } else {
>>         ovsthread_counter_inc(dp->n_missed, 1);
>>         dp_netdev_output_userspace(dp, packet, DPIF_UC_MISS, &key, NULL);
>> +        ofpbuf_delete(packet);
>>     }
>
> Maybe this should be done in both cases (see below).
>
I want dpdk-send release buffer, therefore I will fix drop case to fix
the issue.

>> }
>>
>> @@ -1767,6 +1758,7 @@ dp_netdev_output_userspace(struct dp_netdev *dp, 
>> struct ofpbuf *packet,
>>         if (userdata) {
>>             buf_size += NLA_ALIGN(userdata->nla_len);
>>         }
>> +        buf_size += packet->size;
>>         ofpbuf_init(buf, buf_size);
>>
>>         /* Put ODP flow. */
>> @@ -1780,10 +1772,8 @@ dp_netdev_output_userspace(struct dp_netdev *dp, 
>> struct ofpbuf *packet,
>>                                           NLA_ALIGN(userdata->nla_len));
>>         }
>>
>> -        /* Steal packet data. */
>> -        ovs_assert(packet->source == OFPBUF_MALLOC);
>> -        upcall->packet = *packet;
>> -        ofpbuf_use(packet, NULL, 0);
>> +        upcall->packet.data = ofpbuf_put(buf, packet->data, packet->size);
>> +        upcall->packet.size = packet->size;
>>
>
> I do not see a reason in this patch why the packet data should be copied 
> here. However, I can guess that the reason for it comes apparent with the 
> later patches?
>

I would like to directly pass packet pointer, but that requires lot of
changes due to struct dpif_upcall change. since this is slow path I am
not worried about this case at this point.

>>         seq_change(dp->queue_seq);
>>
>> @@ -1825,15 +1815,8 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
>>
>>         userdata = nl_attr_find_nested(a, OVS_USERSPACE_ATTR_USERDATA);
>>
>> -        /* Make a copy if we are not allowed to steal the packet's data. */
>> -        if (!may_steal) {
>> -            packet = ofpbuf_clone_with_headroom(packet, DP_NETDEV_HEADROOM);
>> -        }
>>         dp_netdev_output_userspace(aux->dp, packet, DPIF_UC_ACTION, aux->key,
>>                                    userdata);
>> -        if (!may_steal) {
>> -            ofpbuf_uninit(packet);
>> -        }
>>         break;
>>     }
>>     case OVS_ACTION_ATTR_PUSH_VLAN:
>> @@ -1846,6 +1829,10 @@ dp_execute_cb(void *aux_, struct ofpbuf *packet,
>>     case __OVS_ACTION_ATTR_MAX:
>>         OVS_NOT_REACHED();
>>     }
>> +
>> +    if (may_steal) {
>> +        ofpbuf_delete(packet);
>> +    }
>
> You can not rely on this to happen for all the packets. It seems you could 
> delete this and the cloning in dpif_netdev_execute(), and have 
> dp_port_input() do the delete also after execution?
>
right, I need to fix cases like drop.
As mentioned earlier I want dpdk-send delete the packet. But I see
this has made code bit complicated I will see if it can be simplified.

>> }
>>
>> static void
>> diff --git a/lib/dpif-netdev.h b/lib/dpif-netdev.h
>> new file mode 100644
>> index 0000000..64c93ee
>> --- /dev/null
>> +++ b/lib/dpif-netdev.h
>> @@ -0,0 +1,40 @@
>> +/*
>> + * Copyright (c) 2008, 2009, 2010, 2011, 2012, 2013 Nicira, Inc.
>> + *
>> + * Licensed under the Apache License, Version 2.0 (the "License");
>> + * you may not use this file except in compliance with the License.
>> + * You may obtain a copy of the License at:
>> + *
>> + *     http://www.apache.org/licenses/LICENSE-2.0
>> + *
>> + * Unless required by applicable law or agreed to in writing, software
>> + * distributed under the License is distributed on an "AS IS" BASIS,
>> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
>> + * See the License for the specific language governing permissions and
>> + * limitations under the License.
>> + */
>> +
>> +#ifndef DPIF_NETDEV_H
>> +#define DPIF_NETDEV_H 1
>> +
>> +#include <stdbool.h>
>> +#include <stddef.h>
>> +#include <stdint.h>
>> +#include "openvswitch/types.h"
>> +#include "packets.h"
>> +
>> +#ifdef  __cplusplus
>> +extern "C" {
>> +#endif
>> +
>> +/* Enough headroom to add a vlan tag, plus an extra 2 bytes to allow IP
>> + * headers to be aligned on a 4-byte boundary.  */
>> +enum { DP_NETDEV_HEADROOM = 2 + VLAN_HEADER_LEN };
>> +
>> +enum { MAX_RX_BATCH = 256 };     /* Maximum number of flows in flow table. 
>> */
>> +
>> +#ifdef  __cplusplus
>> +}
>> +#endif
>> +
>> +#endif /* netdev.h */
>> diff --git a/lib/netdev-dummy.c b/lib/netdev-dummy.c
>> index f23fc9f..27487f9 100644
>> --- a/lib/netdev-dummy.c
>> +++ b/lib/netdev-dummy.c
>> @@ -755,12 +755,11 @@ netdev_dummy_rx_dealloc(struct netdev_rx *rx_)
>> }
>>
>> static int
>> -netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
>> +netdev_dummy_rx_recv(struct netdev_rx *rx_, struct ofpbuf **arr, int *c)
>> {
>>     struct netdev_rx_dummy *rx = netdev_rx_dummy_cast(rx_);
>>     struct netdev_dummy *netdev = netdev_dummy_cast(rx->up.netdev);
>>     struct ofpbuf *packet;
>> -    int retval;
>>
>>     ovs_mutex_lock(&netdev->mutex);
>>     if (!list_is_empty(&rx->recv_queue)) {
>> @@ -774,22 +773,15 @@ netdev_dummy_rx_recv(struct netdev_rx *rx_, struct 
>> ofpbuf *buffer)
>>     if (!packet) {
>>         return EAGAIN;
>>     }
>> +    ovs_mutex_lock(&netdev->mutex);
>> +    netdev->stats.rx_packets++;
>> +    netdev->stats.rx_bytes += packet->size;
>> +    ovs_mutex_unlock(&netdev->mutex);
>>
>> -    if (packet->size <= ofpbuf_tailroom(buffer)) {
>> -        memcpy(buffer->data, packet->data, packet->size);
>> -        buffer->size += packet->size;
>> -        retval = 0;
>> -
>> -        ovs_mutex_lock(&netdev->mutex);
>> -        netdev->stats.rx_packets++;
>> -        netdev->stats.rx_bytes += packet->size;
>> -        ovs_mutex_unlock(&netdev->mutex);
>> -    } else {
>> -        retval = EMSGSIZE;
>> -    }
>> -    ofpbuf_delete(packet);
>> -
>> -    return retval;
>> +    packet_set_size(packet, packet->size);
>
> The name of this function is a bit misleading, especially since all the users 
> already have the correct size...
>
ok. I will change it.

>> +    arr[0] = packet;
>> +    *c = 1;
>> +    return 0;
>> }
>>
>> static void
>> diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
>> index 75ce7c6..574a572 100644
>> --- a/lib/netdev-linux.c
>> +++ b/lib/netdev-linux.c
>> @@ -50,6 +50,7 @@
>> #include "connectivity.h"
>> #include "coverage.h"
>> #include "dpif-linux.h"
>> +#include "dpif-netdev.h"
>> #include "dynamic-string.h"
>> #include "fatal-signal.h"
>> #include "hash.h"
>> @@ -461,6 +462,7 @@ static int af_packet_sock(void);
>> static bool netdev_linux_miimon_enabled(void);
>> static void netdev_linux_miimon_run(void);
>> static void netdev_linux_miimon_wait(void);
>> +static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
>>
>> static bool
>> is_netdev_linux_class(const struct netdev_class *netdev_class)
>> @@ -984,10 +986,19 @@ netdev_linux_rx_recv_tap(int fd, struct ofpbuf *buffer)
>> }
>>
>> static int
>> -netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf *buffer)
>> +netdev_linux_rx_recv(struct netdev_rx *rx_, struct ofpbuf **packet, int *c)
>> {
>>     struct netdev_rx_linux *rx = netdev_rx_linux_cast(rx_);
>> -    int retval;
>> +    struct netdev *netdev = rx->up.netdev;
>> +    struct ofpbuf *buffer;
>> +    ssize_t retval;
>> +    int mtu;
>> +
>> +    if (netdev_linux_get_mtu__(netdev_linux_cast(netdev), &mtu)) {
>> +        mtu = ETH_PAYLOAD_MAX;
>> +    }
>> +
>> +    buffer = ofpbuf_new_with_headroom(VLAN_ETH_HEADER_LEN + mtu, 
>> DP_NETDEV_HEADROOM);
>>
>>     retval = (rx->is_tap
>>               ? netdev_linux_rx_recv_tap(rx->fd, buffer)
>> @@ -995,8 +1006,11 @@ netdev_linux_rx_recv(struct netdev_rx *rx_, struct 
>> ofpbuf *buffer)
>>     if (retval && retval != EAGAIN && retval != EMSGSIZE) {
>>         VLOG_WARN_RL(&rl, "error receiving Ethernet packet on %s: %s",
>>                      ovs_strerror(errno), netdev_rx_get_name(rx_));
>> +    } else {
>> +        packet_set_size(buffer, buffer->size);
>> +        packet[0] = buffer;
>> +        *c = 1;
>
> I don't see the buffer being released in all the other cases (when it's 
> ownership does not transfer to the caller).
>
You mean other than drop packet case?


>>     }
>> -
>>     return retval;
>> }
>>
>> diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
>> index 673d3ab..9bacaa0 100644
>> --- a/lib/netdev-provider.h
>> +++ b/lib/netdev-provider.h
>> @@ -634,28 +634,14 @@ struct netdev_class {
>>     void (*rx_destruct)(struct netdev_rx *);
>>     void (*rx_dealloc)(struct netdev_rx *);
>>
>> -    /* Attempts to receive a packet from 'rx' into the tailroom of 'buffer',
>> -     * which should initially be empty.  If successful, returns 0 and
>> -     * increments 'buffer->size' by the number of bytes in the received 
>> packet,
>> -     * otherwise a positive errno value.  Returns EAGAIN immediately if no
>> -     * packet is ready to be received.
>> -     *
>> -     * Must return EMSGSIZE, and discard the packet, if the received packet
>> -     * is longer than 'ofpbuf_tailroom(buffer)'.
>> -     *
>> -     * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to
>> -     * add a VLAN header which is obtained out-of-band to the packet. If
>> -     * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be
>> -     * available for the packet, otherwise it may be used for the packet
>> -     * itself.
>> -     *
>> -     * It is advised that the tailroom of 'buffer' should be
>> -     * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an
>> -     * out-of-band VLAN header to be added to the packet.
>> +    /* Attempts to receive batch of packets from 'rx' and place array of 
>> pointers
>> +     * into '*pkt'. netdev is responsible for allocating buffers.
>> +     * '*cnt' points to packet count for given batch. Once packets are 
>> returned
>> +     * to caller, netdev should give up ownership of ofbpuf data.
>>      *
>>      * This function may be set to null if it would always return EOPNOTSUPP
>>      * anyhow. */
>> -    int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf *buffer);
>> +    int (*rx_recv)(struct netdev_rx *rx, struct ofpbuf **pkt, int *cnt);
>>
>>     /* Registers with the poll loop to wake up from the next call to
>>      * poll_block() when a packet is ready to be received with 
>> netdev_rx_recv()
>> diff --git a/lib/netdev.c b/lib/netdev.c
>> index e9c8d8f..a058742 100644
>> --- a/lib/netdev.c
>> +++ b/lib/netdev.c
>> @@ -551,10 +551,7 @@ netdev_rx_close(struct netdev_rx *rx)
>>     }
>> }
>>
>> -/* Attempts to receive a packet from 'rx' into the tailroom of 'buffer', 
>> which
>> - * must initially be empty.  If successful, returns 0 and increments
>> - * 'buffer->size' by the number of bytes in the received packet, otherwise a
>> - * positive errno value.
>> +/* Attempts to receive batch packet from 'rx'.
>>  *
>>  * Returns EAGAIN immediately if no packet is ready to be received.
>>  *
>> @@ -562,10 +559,7 @@ netdev_rx_close(struct netdev_rx *rx)
>>  * than 'ofpbuf_tailroom(buffer)'.
>>  *
>>  * Implementations may make use of VLAN_HEADER_LEN bytes of tailroom to
>> - * add a VLAN header which is obtained out-of-band to the packet. If
>> - * this occurs then VLAN_HEADER_LEN bytes of tailroom will no longer be
>> - * available for the packet, otherwise it may be used for the packet
>> - * itself.
>> + * add a VLAN header which is obtained out-of-band to the packet.
>>  *
>>  * It is advised that the tailroom of 'buffer' should be
>>  * VLAN_HEADER_LEN bytes longer than the MTU to allow space for an
>> @@ -575,23 +569,15 @@ netdev_rx_close(struct netdev_rx *rx)
>>  * This function may be set to null if it would always return EOPNOTSUPP
>>  * anyhow. */
>> int
>> -netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf *buffer)
>> +netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf **buffers, int *cnt)
>> {
>>     int retval;
>>
>> -    ovs_assert(buffer->size == 0);
>> -    ovs_assert(ofpbuf_tailroom(buffer) >= ETH_TOTAL_MIN);
>> -
>> -    retval = rx->netdev->netdev_class->rx_recv(rx, buffer);
>> +    retval = rx->netdev->netdev_class->rx_recv(rx, buffers, cnt);
>>     if (!retval) {
>>         COVERAGE_INC(netdev_received);
>> -        if (buffer->size < ETH_TOTAL_MIN) {
>> -            ofpbuf_put_zeros(buffer, ETH_TOTAL_MIN - buffer->size);
>> -        }
>> -        return 0;
>> -    } else {
>> -        return retval;
>>     }
>> +    return retval;
>> }
>>
>> /* Arranges for poll_block() to wake up when a packet is ready to be received
>> diff --git a/lib/netdev.h b/lib/netdev.h
>> index 410c35b..bfd8f91 100644
>> --- a/lib/netdev.h
>> +++ b/lib/netdev.h
>> @@ -161,7 +161,7 @@ void netdev_rx_close(struct netdev_rx *);
>>
>> const char *netdev_rx_get_name(const struct netdev_rx *);
>>
>> -int netdev_rx_recv(struct netdev_rx *, struct ofpbuf *);
>> +int netdev_rx_recv(struct netdev_rx *rx, struct ofpbuf **buffers, int *cnt);
>> void netdev_rx_wait(struct netdev_rx *);
>> int netdev_rx_drain(struct netdev_rx *);
>>
>> diff --git a/lib/packets.c b/lib/packets.c
>> index 3f7d6eb..6f2fca6 100644
>> --- a/lib/packets.c
>> +++ b/lib/packets.c
>> @@ -1012,3 +1012,12 @@ void pkt_metadata_from_flow(struct pkt_metadata *md, 
>> const struct flow *flow)
>>     pkt_metadata_init(md, &flow->tunnel, flow->skb_priority,
>>                            flow->pkt_mark, &flow->in_port);
>> }
>> +
>> +void
>> +packet_set_size(struct ofpbuf *b, int size)
>> +{
>> +    b->size = size;
>> +    if (b->size < ETH_TOTAL_MIN) {
>> +        ofpbuf_put_zeros(b, ETH_TOTAL_MIN - b->size);
>> +    }
>> +}
>> diff --git a/lib/packets.h b/lib/packets.h
>> index e6b3303..f41a934 100644
>> --- a/lib/packets.h
>> +++ b/lib/packets.h
>> @@ -671,5 +671,6 @@ void packet_set_sctp_port(struct ofpbuf *, ovs_be16 src, 
>> ovs_be16 dst);
>> uint16_t packet_get_tcp_flags(const struct ofpbuf *, const struct flow *);
>> void packet_format_tcp_flags(struct ds *, uint16_t);
>> const char *packet_tcp_flag_to_string(uint32_t flag);
>> +void packet_set_size(struct ofpbuf *b, int size);
>>
>> #endif /* packets.h */
>> --
>> 1.7.9.5
>>
>> _______________________________________________
>> dev mailing list
>> [email protected]
>> http://openvswitch.org/mailman/listinfo/dev
>
_______________________________________________
dev mailing list
[email protected]
http://openvswitch.org/mailman/listinfo/dev

Reply via email to