With comments below:

Acked-by: Jarno Rajahalme <jrajaha...@nicira.com>

On Oct 16, 2014, at 11:38 AM, Pravin B Shelar <pshe...@nicira.com> wrote:

> diff --git a/README-native-tunneling b/README-native-tunneling
> new file mode 100644
> index 0000000..5cc17ec
> --- /dev/null
> +++ b/README-native-tunneling
> @@ -0,0 +1,83 @@
> +
> +Open vSwitch support tunneling in userspace. Tunneling is implemented in

“supports"

> +platform independent way.
> +
> +Setup:
> +======
> +Setup physical bridges for all physical interfaces. create integration 
> bridge.

“Create"

> +Add vxlan port to int-bridge. Assign IP address to physical bridge where
> +vxlan traffic is expected.
> +

Should VXLAN be capitalized?

> +Example:
> +========
> +If you want to connect to vxlan tunnel endpoint  (logical ip: 192.168.1.2
> +and remote physical ip: 172.168.1.2) that you want to connect to.
> +

The last “that you want to connect to” is redundant.

> +In this case you need to configure OVS bridges as follows.
> +
> +1. Let assume 172.168.1.2/24 network is reachable via eth1 create physical 
> bridge br-eth1

“Lets”

Put “create physical bridge br-eth1” on a line of its own.

> +   assign ip address (172.168.1.1/24) to br-eth1, Add eth1 to br-eth1
> +2. Check ovs cached routes using appctl command
> +   ovs-appctl ovs/route/show
> +   Add tunnel route if not present in OVS route table.
> +   ovs-appctl ovs/route/add 172.168.1.1 255.255.255.0 br-eth1

“172.168.1.1/24” would be preferable format, instead of “172.168.1.1 
255.255.255.0”.

> +3. Add integration brdge int-br and add tunnel port using standard syntax.
> +   ovs-vsctl add-port int-br vxlan0 -- set interface vxlan0 type=vxlan  
> options:remote_ip=172.168.1.2
> +4. Assign IP address to int-br, So final topology looks like:
> +
> +
> +       192.168.1.1/24
> +       +--------------+
> +       |    int-br    |                                   192.168.1.2/24
> +       +--------------+                                  +--------------+
> +       |    vxlan0    |                                  |    vxlan0    |
> +       +--------------+                                  +--------------+
> +             |                                                 |
> +             |                                                 |
> +             |                                                 |
> +        172.168.1.1/24                                         |
> +       +--------------+                                        |
> +       |    br-eth1   |                                  172.168.1.2/24
> +       +--------------+                                  +---------------+
> +       |    eth1      |----------------------------------|    eth1       |
> +       +--------------+                                  +----------------
> +
> +       Host A with OVS.                                      Remote host.
> +
> +With this setup, ping to vxlan target device (192.168.1.2) should work
> +There are following commands that shows internal tables:
> +
> +Tunneling related commands:
> +===========================
> +Tunnel routing table:
> +    To Add route:
> +       ovs-appctl ovs/route/add <IP address>/<prefix length> 
> <output-bridge-name> <gw>
> +    To see all routes configured:
> +       ovs-appctl ovs/route/show
> +    To del route:
> +       ovs-appctl ovs/route/del <IP address>/<prefix length>
> +
> +ARP:
> +    To see arp cache content:
> +       ovs-appctl tnl/arp/show
> +    To flush arp cache:
> +       ovs-appctl tnl/arp/flush
> +
> +To check tunnel ports listening in vswitchd:
> +     ovs-appctl tnl/ports/show
> +
> +To range for VxLan udp source port:

“To set range"

> +     To set:
> +         ovs-appctl tnl/egress_port_range <num1> <num2>
> +     Shows Current range:
> +         ovs-appctl tnl/egress_port_range
> +
> +To check datapath ports:
> +     ovs-appctl dpif/show
> +
> +To check datapath flows:
> +     ovs-appctl dpif/dump-flows
> +
> +Contact
> +=======
> +b...@openvswitch.org
> diff --git a/datapath/linux/compat/include/linux/openvswitch.h 
> b/datapath/linux/compat/include/linux/openvswitch.h
> index b2257e6..ff7c9d9 100644
> --- a/datapath/linux/compat/include/linux/openvswitch.h
> +++ b/datapath/linux/compat/include/linux/openvswitch.h
> @@ -580,6 +580,15 @@ struct ovs_action_hash {
>       uint32_t  hash_basis;
> };
> 
> +#define TNL_PUSH_HEADER_SIZE 128
> +struct ovs_action_push_tnl {
> +     uint32_t tnl_port;
> +     uint32_t out_port;
> +     uint32_t header_len;
> +     uint32_t tnl_type;     /* For logging. */
> +     uint8_t  header[TNL_PUSH_HEADER_SIZE];
> +};
> +

Is this action always fixed length, or is the real length determined by the 
‘header_len’ (+ NL padding)?

> /**
>  * enum ovs_action_attr - Action types.
>  *
> @@ -633,6 +642,10 @@ enum ovs_action_attr {
>                                      * data immediately followed by a mask.
>                                      * The data must be zero for the unmasked
>                                      * bits. */
> 

(snip)

> diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
> index cea7c88..a60eab4 100644
> --- a/lib/dpif-netdev.c
> +++ b/lib/dpif-netdev.c
> @@ -62,6 +62,7 @@
> #include "shash.h"
> #include "sset.h"
> #include "timeval.h"
> +#include "tnl-arp-cache.h"
> #include "unixctl.h"
> #include "util.h"
> #include "vlog.h"
> @@ -204,6 +205,7 @@ struct dp_netdev {
>      * for pin of pmd threads. */
>     size_t n_dpdk_rxqs;
>     char *pmd_cmask;
> +    uint64_t last_tnl_conf_seq;
> };
> 
> static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev 
> *dp,
> @@ -582,6 +584,7 @@ create_dp_netdev(const char *name, const struct 
> dpif_class *class,
>         return error;
>     }
> 
> +    dp->last_tnl_conf_seq = seq_read(tnl_conf_seq);
>     *dpp = dp;
>     return 0;
> }
> @@ -2054,12 +2057,13 @@ dp_netdev_process_rxq_port(struct 
> dp_netdev_pmd_thread *pmd,
>     }
> }
> 
> -static void
> +static bool

Add a comment on the meaning of the return value.

> dpif_netdev_run(struct dpif *dpif)
> {
>     struct dp_netdev_port *port;
>     struct dp_netdev *dp = get_dp_netdev(dpif);
>     struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_nonpmd(dp);
> +    uint64_t new_tnl_seq;
> 
>     ovs_mutex_lock(&dp->non_pmd_mutex);
>     CMAP_FOR_EACH (port, node, &dp->ports) {
> @@ -2072,6 +2076,14 @@ dpif_netdev_run(struct dpif *dpif)
>         }
>     }
>     ovs_mutex_unlock(&dp->non_pmd_mutex);
> +    tnl_arp_cache_run();
> +    new_tnl_seq = seq_read(tnl_conf_seq);
> +
> +    if (dp->last_tnl_conf_seq != new_tnl_seq) {
> +        dp->last_tnl_conf_seq = new_tnl_seq;
> +        return true;
> +    }
> +    return false;
> }
> 
> static void
> @@ -2091,6 +2103,7 @@ dpif_netdev_wait(struct dpif *dpif)
>         }
>     }
>     ovs_mutex_unlock(&dp_netdev_mutex);
> +    seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq);
> }
> 
> struct rxq_poll {
> @@ -2785,15 +2798,48 @@ dpif_netdev_register_upcall_cb(struct dpif *dpif, 
> upcall_callback *cb,
> static void
> dp_netdev_drop_packets(struct dpif_packet ** packets, int cnt, bool may_steal)
> {
> -    int i;
> -
>     if (may_steal) {
> +        int i;
> +
>         for (i = 0; i < cnt; i++) {
>             dpif_packet_delete(packets[i]);
>         }
>     }
> }
> 
> +static int
> +push_tunnel_action(const struct dp_netdev *dp,
> +                   const struct nlattr *attr,
> +                   struct dpif_packet **packets, int cnt)
> +{
> +    struct dp_netdev_port *tun_port;
> +    struct ovs_action_push_tnl *data;
> +
> +    data = (struct ovs_action_push_tnl *) nl_attr_get(attr);
> +
> +    tun_port = dp_netdev_lookup_port(dp, data->tnl_port);
> +    if (!tun_port) {
> +        return -EINVAL;
> +    }
> +    netdev_push_header(tun_port->netdev, packets, cnt, data);
> +
> +    return 0;
> +}
> +
> +static void
> +dp_netdev_clone_pkt_batch(struct dpif_packet **tnl_pkt,
> +                          struct dpif_packet **packets, int cnt)
> +{
> +    int i;
> +
> +    for (i = 0; i < cnt; i++) {
> +        struct dpif_packet *inner_pkt;
> +
> +         inner_pkt = dpif_packet_clone(packets[i]);
> +         tnl_pkt[i] = inner_pkt;

Variable name “inner_pkt” is not meaningful here, could use just “pkt”?

> +    }
> +}
> +
> 

(snip)

> diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
> index 65cf505..4ae1658 100644
> --- a/lib/dpif-provider.h
> +++ b/lib/dpif-provider.h
> @@ -132,8 +132,9 @@ struct dpif_class {
>      * the 'close' member function. */
>     int (*destroy)(struct dpif *dpif);
> 
> -    /* Performs periodic work needed by 'dpif', if any is necessary. */
> -    void (*run)(struct dpif *dpif);
> +    /* Performs periodic work needed by 'dpif', if any is necessary.
> +     * Return true if need to revalidate. */

“Returns”

> +    bool (*run)(struct dpif *dpif);
> 
>     /* Arranges for poll_block() to wake up if the "run" member function needs
>      * to be called for 'dpif'. */
> diff --git a/lib/dpif.c b/lib/dpif.c
> index d088f68..b2f05c9 100644
> --- a/lib/dpif.c
> +++ b/lib/dpif.c
> @@ -41,6 +41,8 @@
> #include "shash.h"
> #include "sset.h"
> #include "timeval.h"
> +#include "tnl-arp-cache.h"
> +#include "tnl-ports.h"
> #include "util.h"
> #include "valgrind.h"
> #include "vlog.h"
> @@ -114,6 +116,8 @@ dp_initialize(void)
>         }
>         dpctl_unixctl_register();
>         ovsthread_once_done(&once);
> +        tnl_port_map_init();
> +        tnl_arp_cache_init();
>     }
> }
> 
> @@ -402,12 +406,13 @@ dpif_close(struct dpif *dpif)
> }
> 
> /* Performs periodic work needed by 'dpif'. */
> -void
> +bool
> dpif_run(struct dpif *dpif)
> {
>     if (dpif->dpif_class->run) {
> -        dpif->dpif_class->run(dpif);
> +        return dpif->dpif_class->run(dpif);
>     }
> +    return false;
> }
> 
> /* Arranges for poll_block() to wake up when dp_run() needs to be called for
> @@ -1002,6 +1007,8 @@ dpif_execute_helper_cb(void *aux_, struct dpif_packet 
> **packets, int cnt,
> 
>     switch ((enum ovs_action_attr)type) {
>     case OVS_ACTION_ATTR_OUTPUT:
> +    case OVS_ACTION_ATTR_TUNNEL_PUSH:
> +    case OVS_ACTION_ATTR_TUNNEL_POP:
>     case OVS_ACTION_ATTR_USERSPACE:
>     case OVS_ACTION_ATTR_RECIRC: {
>         struct dpif_execute execute;
> @@ -1592,3 +1599,9 @@ log_flow_get_message(const struct dpif *dpif, const 
> struct dpif_flow_get *get,
>                          get->flow->actions, get->flow->actions_len);
>     }
> }
> +
> +bool dpif_support_tnl_push_pop(const struct dpif *dpif)

“dpif_supports_tnl_push_pop”

> +{
> +   return !strcmp(dpif->dpif_class->type, "netdev") ||
> +          !strcmp(dpif->dpif_class->type, "dummy");
> +}
> 

(snip)

> diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
> index b3b8a11..8885cbb 100644
> --- a/lib/netdev-vport.c
> +++ b/lib/netdev-vport.c
> @@ -25,21 +25,30 @@
> #include <sys/ioctl.h>
> 

(snip)

> +static int
> +parse_gre_header(struct ofpbuf *packet,
> +                 struct flow_tnl *tnl)
> +{
> +    const struct gre_base_hdr *greh;
> +    ovs_be32 *options;
> +
> +    greh = ip_extract_md(packet, tnl);
> +    if (!greh) {
> +        return -EINVAL;
> +    }
> +
> +    if (greh->flags & (GRE_VERSION | GRE_ROUTING)) {
> +        return -EINVAL;
> +    }
> +

Is there any guarantee that the packet is actually long enough to contain all 
of the GRE header? 

> +    options = (ovs_be32 *)(greh + 1);
> +    if (greh->flags & GRE_CSUM) {
> +        ovs_be32 pkt_csum;
> +
> +        pkt_csum = csum(greh, ofpbuf_size(packet) -
> +                              sizeof (struct eth_header) -
> +                              sizeof (struct ip_header));
> +        if (pkt_csum) {
> +            return -EINVAL;
> +        }
> +        tnl->flags = FLOW_TNL_F_CSUM;
> +        options++;
> +    }
> +
> +    if (greh->flags & GRE_KEY) {
> +        tnl->tun_id = (ovs_be64) ((uint64_t)(*(uint32_t*)options) << 32);
> +        tnl->flags |= FLOW_TNL_F_KEY;
> +        options++;
> +    }
> +
> +    if (greh->flags & GRE_SEQ) {
> +        options++;
> +    }
> +
> +    return ((uint8_t *)options - (uint8_t *) greh) ;
> +}
> +
> +static void
> +gre_extract_md(struct dpif_packet *dpif_pkt)
> +{
> +    struct ofpbuf *packet = &dpif_pkt->ofpbuf;
> +    struct pkt_metadata *md = &dpif_pkt->md;
> +    struct flow_tnl *tnl = &md->tunnel;
> +    int greh_len, tnl_hlen;
> +
> +    memset(md, 0, sizeof(*md));
> +
> +    greh_len = parse_gre_header(packet, tnl);
> +    if (greh_len <= 0) {
> +        memset(md, 0, sizeof(*md));

Would it be enough to reset the *tnl here?

> +        return;
> +    }
> +
> +    tnl_hlen = sizeof (struct eth_header) + sizeof (struct ip_header) + 
> greh_len;
> +
> +    if (tnl_hlen > ofpbuf_size(packet)) {
> +        memset(md, 0, sizeof(*md));

And here?

> +        return;
> +    }
> +
> +    ofpbuf_reset_packet(packet, tnl_hlen);
> +}
> +
> +static int
> +netdev_gre_pop_header(struct netdev *netdev_ OVS_UNUSED,
> +                      struct dpif_packet **pkt, int cnt)
> +{
> +    int i;
> +
> +    for (i = 0; i < cnt; i++) {
> +        gre_extract_md(pkt[i]);
> +    }
> +    return 0;
> +}
> +
> +static void
> +netdev_gre_push_header__(struct ofpbuf *packet,
> +                         const void *header, int size)
> +{
> +    struct gre_base_hdr *greh;
> +    int ip_tot_size;
> +
> +    greh = push_ip_header(packet, header, size,  &ip_tot_size);
> +
> +    if (greh->flags & GRE_CSUM) {
> +        ovs_be32 *options = (ovs_be32 *) (greh + 1);
> +
> +        *options = csum(greh, ip_tot_size - sizeof (struct ip_header));
> +    }
> +}
> +
> +static int
> +netdev_gre_push_header(const struct netdev *netdev OVS_UNUSED,
> +                       struct dpif_packet **packets, int cnt,
> +                       const struct ovs_action_push_tnl *data)
> +{
> +    int i;
> +
> +    for (i = 0; i < cnt; i++) {
> +        netdev_gre_push_header__(&packets[i]->ofpbuf,
> +                                   data->header, data->header_len);
> +        packets[i]->md = PKT_METADATA_INITIALIZER(data->out_port);
> +    }
> +    return 0;
> +}
> +
> +
> +static int
> +netdev_gre_build_header(const struct netdev *netdev,
> +                        struct ovs_action_push_tnl *data)
> +{
> +    struct netdev_vport *dev = netdev_vport_cast(netdev);
> +    struct netdev_tunnel_config *tnl_cfg;
> +    struct ip_header *ip;
> +    struct gre_base_hdr *greh;
> +    ovs_be32 *options;
> +    int hlen;
> +
> +    /* XXX: RCUfy tnl_cfg. */
> +    ovs_mutex_lock(&dev->mutex);
> +    tnl_cfg = &dev->tnl_cfg;
> +
> +    ip = ip_hdr(data->header);
> +    ip->ip_proto = IPPROTO_GRE;
> +
> +    greh = gre_hdr(ip);
> +    greh->protocol = htons(ETH_TYPE_TEB);
> +    greh->flags = 0;
> +
> +    options = (ovs_be32 *) (greh + 1);
> +    if (tnl_cfg->csum) {
> +        greh->flags |= GRE_CSUM;
> +        *options = 0;
> +        options++;
> +    }
> +
> +    if (tnl_cfg->out_key_present) {
> +        greh->flags = GRE_KEY;
> +        *options = (ovs_be32) (tnl_cfg->out_key >> 32);
> +        options++;
> +    }
> +
> +    ovs_mutex_unlock(&dev->mutex);
> +
> +    hlen = (uint8_t *) options - (uint8_t *) greh;
> +
> +    data->header_len = sizeof (struct eth_header) + sizeof *ip + hlen;
> +    data->tnl_type = OVS_VPORT_TYPE_GRE;
> +    return 0;
> +}
> +
> +static void
> +vxlan_extract_md(struct dpif_packet *dpif_pkt)
> +{
> +    struct ofpbuf *packet = &dpif_pkt->ofpbuf;
> +    struct pkt_metadata *md = &dpif_pkt->md;
> +    struct flow_tnl *tnl = &md->tunnel;
> +    struct udp_header *udp;
> +    struct vxlanhdr *vxh;
> +
> +    memset(md, 0, sizeof(*md));
> +    if (VXLAN_HLEN > ofpbuf_size(packet)) {
> +        return;
> +    }
> +
> +    udp = ip_extract_md(packet, tnl);
> +    if (!udp) {
> +        return;
> +    }
> +    vxh = (struct vxlanhdr *) ((const char *)udp + sizeof(*udp));
> +
> +    if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
> +       (vxh->vx_vni & htonl(0xff))) {
> +        VLOG_ERR("invalid vxlan flags=%#x vni=%#x\n",
> +                 ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
> +        memset(md, 0, sizeof(*md));

Would it be enough to reset *tnl here?

> +        return;
> +    }
> +    tnl->tp_src = udp->udp_src;
> +    tnl->tp_dst = udp->udp_dst;
> +    tnl->tun_id = htonll(ntohl(vxh->vx_vni) >> 8);
> +
> +    ofpbuf_reset_packet(packet, VXLAN_HLEN);
> +}
> +
> +

(snip)

> diff --git a/lib/odp-util.c b/lib/odp-util.c
> index 8f5ed08..2db8a00 100644
> --- a/lib/odp-util.c
> +++ b/lib/odp-util.c
> @@ -24,18 +24,22 @@
> 

(snip)

> static int
> +ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data)
> +{
> +    struct eth_header *eth;
> +    struct ip_header *ip;
> +    struct udp_header *udp;
> +    struct gre_base_hdr *greh;
> +    uint16_t dl_type, udp_src, udp_dst;
> +    ovs_be32 sip, dip;
> +    uint32_t tnl_type = 0, header_len = 0;
> +    void *l3, *l4;
> +    int n = 0;
> +
> +    if (!ovs_scan(s, "tnl_push(tnl_port(%"SCNi32"),%n", &data->tnl_port, 
> &n)) {
> +        return 0;
> +    }
> +
> +    eth = (struct eth_header *) data->header;
> +    l3 = (data->header + sizeof *eth);
> +    l4 = ((uint8_t *) l3 + sizeof (struct ip_header));
> +    ip = (struct ip_header *) l3;
> +    if (!ovs_scan(&s[n], "header(size=%"SCNi32",type=%"SCNi32","
> +                         "eth(dst="ETH_ADDR_SCAN_FMT",%n", &data->tnl_type,
> +                         &data->header_len,
> +                         ETH_ADDR_SCAN_ARGS(eth->eth_dst), &n)) {
> +        return -EINVAL;
> +    }
> +
> +    if (!ovs_scan(&s[n], "src="ETH_ADDR_SCAN_FMT",%n",
> +                  ETH_ADDR_SCAN_ARGS(eth->eth_src), &n)) {
> +        return -EINVAL;
> +    }
> +    if (!ovs_scan(&s[n], "dl_type=%"SCNi16",%n", &dl_type, &n)) {
> +        return -EINVAL;
> +    }
> +    eth->eth_type = htons(dl_type);
> +
> +    /* IPv4 */
> +    if (!ovs_scan(&s[n], 
> "ipv4(src="IP_SCAN_FMT",dst="IP_SCAN_FMT",proto=%"SCNi8
> +                         ",tos=%"SCNi8",ttl=%"SCNi8",frag=%"SCNi16"),%n",
> +                         IP_SCAN_ARGS(&sip),
> +                         IP_SCAN_ARGS(&dip),
> +                         &ip->ip_proto, &ip->ip_tos,
> +                         &ip->ip_ttl, &ip->ip_frag_off, &n)) {
> +        return -EINVAL;
> +    }
> +    put_16aligned_be32(&ip->ip_src, sip);
> +    put_16aligned_be32(&ip->ip_dst, dip);
> +
> +    udp = (struct udp_header *) l4;
> +    greh = (struct gre_base_hdr *) l4;
> +
> +    if (ovs_scan(&s[n], "udp(src=%"SCNi16",dst=%"SCNi16"),%n",
> +                         &udp_src, &udp_dst, &n)) {
> +        struct vxlanhdr *vxh;
> +
> +        udp->udp_src = htons(udp_src);
> +        udp->udp_dst = htons(udp_dst);
> +        udp->udp_len = 0;
> +        udp->udp_csum = 0;
> +
> +        vxh = (struct vxlanhdr *) (udp + 1);
> +        if (!ovs_scan(&s[n], "vxlan(flags=%"SCNi32",vni=%"SCNi32")),%n",
> +                            &vxh->vx_flags, &vxh->vx_vni, &n)) {
> +            return -EINVAL;
> +        }
> +        tnl_type = OVS_VPORT_TYPE_VXLAN;
> +        header_len = sizeof *eth + sizeof *ip +
> +                     sizeof *udp + sizeof *vxh;
> +    } else if (ovs_scan(&s[n], "gre((flags=%"SCNi16",proto=%"SCNi16"),%n",
> +                         &greh->flags, &greh->protocol, &n)){
> +
> +        ovs_be32 *options = (ovs_be32 *) (greh + 1);
> +
> +        if (greh->flags & GRE_CSUM) {
> +            if (!ovs_scan(&s[n], ",csum=%"SCNi32"%n", options, &n)) {
> +                return -EINVAL;
> +            }
> +
> +            options++;
> +        }
> +        if (greh->flags & GRE_KEY) {
> +            uint32_t key;
> +
> +            if (!ovs_scan(&s[n], ",seq=%"SCNi32"%n", &key, &n)) {
> +                return -EINVAL;
> +            }
> +
> +            *options = htonl(key);
> +            options++;
> +        }
> +        if (greh->flags & GRE_SEQ) {
> +            if (!ovs_scan(&s[n], ",seq=%"SCNi32"%n", options, &n)) {
> +                return -EINVAL;
> +            }
> +            options++;
> +        }
> +        if (!ovs_scan(&s[n], ")%n", &n)) {
> +            return -EINVAL;
> +        }
> +
> +        tnl_type = OVS_VPORT_TYPE_GRE;
> +        header_len = sizeof *eth + sizeof *ip +
> +                     ((uint8_t *) options - (uint8_t *) greh);
> +    } else {
> +        return -EINVAL;
> +    }
> +
> +    /* check tune meta data. */

“Check tunnel metadata.”

> +    if (data->tnl_type != tnl_type) {
> +        return -EINVAL;
> +    }
> +    if (data->header_len != header_len) {
> +        return -EINVAL;
> +    }
> +
> +    return n;
> +}
> +
> +static int
> parse_odp_action(const char *s, const struct simap *port_names,
>                  struct ofpbuf *actions)
> {
> @@ -887,6 +1101,28 @@ parse_odp_action(const char *s, const struct simap 
> *port_names,
>         }
>     }
> 
> +    {
> +        uint32_t port;
> +        int n;
> +
> +        if (ovs_scan(s, "tnl_pop(%"SCNi32")%n", &port, &n)) {
> +            nl_msg_put_u32(actions, OVS_ACTION_ATTR_TUNNEL_POP, port);
> +            return n;
> +        }
> +    }
> +
> +    {
> +        struct ovs_action_push_tnl data;
> +        int n;
> +
> +        n = ovs_parse_tnl_push(s, &data);
> +        if (n > 0) {
> +            odp_put_tunnel_push_action(actions, &data);
> +            return n;
> +        } else if (n < 0) {
> +            return n;
> +        }
> +    }
>     return -EINVAL;
> }
> 
> @@ -3543,6 +3779,15 @@ odp_put_tunnel_action(const struct flow_tnl *tunnel,
>     tun_key_to_attr(odp_actions, tunnel);
>     nl_msg_end_nested(odp_actions, offset);
> }
> +
> +void
> +odp_put_tunnel_push_action(struct ofpbuf *odp_actions,
> +                           struct ovs_action_push_tnl *data)
> +{
> +    nl_msg_put_unspec(odp_actions, OVS_ACTION_ATTR_TUNNEL_PUSH,
> +                      data, sizeof (*data));

Could we not push the unused portion of the ovs_action_push_tnl here?

> +}
> +
> 

(snip)

> diff --git a/lib/tnl-arp-cache.c b/lib/tnl-arp-cache.c
> new file mode 100644
> index 0000000..6053eb6
> --- /dev/null
> +++ b/lib/tnl-arp-cache.c
> @@ -0,0 +1,212 @@
> 

(snip)

> +int
> +tnl_arp_snoop(const struct flow *flow, struct flow_wildcards *wc,
> +              const char name[IFNAMSIZ])
> +{
> +    struct tnl_arp_entry *arp;
> +
> +    if (flow->dl_type != htons(ETH_TYPE_ARP)) {
> +        return EINVAL;
> +    }
> +
> +    memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type);
> +    memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto);
> +    memset(&wc->masks.nw_src, 0xff, sizeof wc->masks.nw_src);
> +    memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst);
> +    memset(&wc->masks.arp_sha, 0xff, sizeof wc->masks.arp_sha);
> +    memset(&wc->masks.arp_tha, 0xff, sizeof wc->masks.arp_tha);
> +
> +    ovs_mutex_lock(&mutex);
> +    arp = __tnl_arp_lookup(name, flow->nw_src);
> +    if (arp) {
> +        if (!memcmp(arp->mac, flow->arp_sha, ETH_ADDR_LEN)) {
> +            arp->expires = time_now() + ARP_ENTRY_DEFAULT_IDLE_TIME;
> +            ovs_mutex_unlock(&mutex);
> +            return 0;
> +        }
> +        tnl_arp_delete(arp);
> +    }
> +

It should be pretty easy to make the successful lookup lockless, but that can 
be left as a future enhancement.

> +    arp = xmalloc(sizeof *arp);
> +
> +    arp->ip = flow->nw_src;
> +    memcpy(arp->mac, flow->arp_sha, ETH_ADDR_LEN);
> +    arp->expires = time_now() + ARP_ENTRY_DEFAULT_IDLE_TIME;
> +    strncpy(arp->br_name, name, IFNAMSIZ);
> +    cmap_insert(&table, &arp->cmap_node, arp->ip);
> +    ovs_mutex_unlock(&mutex);
> +    return 0;
> +}
> +
> +void
> +tnl_arp_cache_run(void)
> +{
> +    struct tnl_arp_entry *arp;
> +    bool changed = false;
> +
> +    ovs_mutex_lock(&mutex);
> +    CMAP_FOR_EACH(arp, cmap_node, &table) {
> +        if (arp->expires <= time_now()) {
> +             tnl_arp_delete(arp);
> +             changed = true;

It is likely that this will be periodically triggered also for active tunnels, 
as when nothing changes and the tunnel headers are pushed from a cached flow, 
there is nothing that keeps the ARP entry fresh. Maybe tap into 
xlate_cache_netdev() to reforest the ARP entry for a tunnel port when stats are 
pushed on it?

> +        }
> +    }
> +    ovs_mutex_unlock(&mutex);
> +
> +    if (changed) {
> +        seq_change(tnl_conf_seq);
> +    }
> +}
> +

(snip)

> diff --git a/lib/tnl-arp-cache.h b/lib/tnl-arp-cache.h
> new file mode 100644
> index 0000000..f68c3b4
> --- /dev/null
> +++ b/lib/tnl-arp-cache.h
> @@ -0,0 +1,47 @@
> +/*
> + * Copyright (c) 2014 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef OVS_TNL_ARP_CACHE_H
> +#define OVS_TNL_ARP_CACHE_H 1

Can drop the “OVS_” prefix here, so that the macro name matches the file name.

> +
> +#include <errno.h>
> +#include <inttypes.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +#include <string.h>
> +#include <net/if.h>
> +#include <sys/socket.h>
> +

Likely none of the above are needed.

> +#include "flow.h"
> +#include “netdev.h"

Likely not needed.

> +#include "packets.h"
> +#include "util.h"
> +
> +#ifdef  __cplusplus
> +extern "C" {
> +#endif
> +

I recall we do not do this unless someone requests it.

> +int tnl_arp_snoop(const struct flow *flow, struct flow_wildcards *wc,
> +                  const char dev_name[]);
> +int tnl_arp_lookup(const char dev_name[], ovs_be32 dst, uint8_t 
> mac[ETH_ADDR_LEN]);
> +void tnl_arp_cache_init(void);
> +void tnl_arp_cache_run(void);
> +
> +#ifdef  __cplusplus
> +}
> +#endif
> +
> +#endif

(snip)

> diff --git a/lib/tnl-ports.h b/lib/tnl-ports.h
> new file mode 100644
> index 0000000..bc4c954
> --- /dev/null
> +++ b/lib/tnl-ports.h
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright (c) 2014 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef OVS_TNL_PORT_H
> +#define OVS_TNL_PORT_H 1
> +
> +#include <errno.h>
> +#include <stddef.h>
> +#include <stdint.h>
> +#include <inttypes.h>
> +#include <net/if.h>
> +#include <sys/socket.h>
> +
> +#include "flow.h"
> +#include "packets.h"
> +#include "util.h"
> +
> +#ifdef  __cplusplus
> +extern "C" {
> +#endif
> +
> +odp_port_t tnl_port_map_lookup(const struct flow *flow,
> +                           struct flow_wildcards *wc);
> +
> +void tnl_port_map_insert(odp_port_t port, ovs_be32 ip_dst, ovs_be16 udp_port,
> +                         const char dev_name[]);
> +
> +void tnl_port_map_delete(ovs_be32 ip_dst OVS_UNUSED, ovs_be16 udp_port 
> OVS_UNUSED);
> +

Should not use OVS_UNUSED in function declarations.

> +void tnl_port_map_init(void);
> +
> +#ifdef  __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/lib/vxlan.h b/lib/vxlan.h
> new file mode 100644
> index 0000000..c847bdb
> --- /dev/null
> +++ b/lib/vxlan.h
> @@ -0,0 +1,50 @@
> +/*
> + * Copyright (c) 2014 Nicira, Inc.
> + *
> + * Licensed under the Apache License, Version 2.0 (the "License");
> + * you may not use this file except in compliance with the License.
> + * You may obtain a copy of the License at:
> + *
> + *     http://www.apache.org/licenses/LICENSE-2.0
> + *
> + * Unless required by applicable law or agreed to in writing, software
> + * distributed under the License is distributed on an "AS IS" BASIS,
> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
> + * See the License for the specific language governing permissions and
> + * limitations under the License.
> + */
> +
> +#ifndef NETDEV_VXLAN_H
> +#define NETDEV_VXLAN_H 1
> +

Should be just VXLAN_H to match the file name?

> +#include <stddef.h>
> +#include <stdint.h>
> +#include “list.h"

No list here.

> +#include "packets.h"
> +#include "util.h"
> +#include "netdev-dpdk.h”

I do not see anything DPDK dependent here.

> +
> +#ifdef  __cplusplus
> +extern "C" {
> +#endif
> +

Not needed.

> +
> +#define VXLAN_FLAGS 0x08000000  /* struct vxlanhdr.vx_flags required value. 
> */
> +
> +/* VXLAN protocol header */
> +OVS_PACKED(
> +struct vxlanhdr {
> +        ovs_be32 vx_flags;
> +        ovs_be32 vx_vni;
> +});
> +

Should use types from unaligned.h instead of packing?

> +#define VXLAN_HLEN (sizeof(struct eth_header) + \
> +                    sizeof(struct ip_header)  + \
> +                    sizeof(struct udp_header) + \
> +                    sizeof(struct vxlanhdr))
> +
> +#ifdef  __cplusplus
> +}
> +#endif
> +
> +#endif
> diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
> index 48576ad..ce0e482 100644
> --- a/ofproto/ofproto-dpif-xlate.c
> +++ b/ofproto/ofproto-dpif-xlate.c
> @@ -17,7 +17,12 @@
> #include "ofproto/ofproto-dpif-xlate.h”

(snip)

> static void
> compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
>                         bool check_stp)
> @@ -2619,9 +2740,15 @@ compose_output_action__(struct xlate_ctx *ctx, 
> ofp_port_t ofp_port,
>             entry->u.dev.tx = netdev_ref(xport->netdev);
>         }
>         out_port = odp_port;
> -        commit_odp_tunnel_action(flow, &ctx->base_flow,
> -                                 ctx->xout->odp_actions);
> -        flow->tunnel = flow_tnl; /* Restore tunnel metadata */
> +        if (ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) {
> +            build_tunnel_send(ctx, xport, flow, odp_port);

You need to also  commit_odp_actions() before calling build_tunnel_send().

> +            flow->tunnel = flow_tnl; /* Restore tunnel metadata */
> +            goto out;
> +        } else {
> +            commit_odp_tunnel_action(flow, &ctx->base_flow,
> +                                     ctx->xout->odp_actions);
> +            flow->tunnel = flow_tnl; /* Restore tunnel metadata */
> +        }
>     } else {
>         odp_port = xport->odp_port;
>         out_port = odp_port;
> @@ -2641,7 +2768,7 @@ compose_output_action__(struct xlate_ctx *ctx, 
> ofp_port_t ofp_port,
>     if (out_port != ODPP_NONE) {
>         ctx->xout->slow |= commit_odp_actions(flow, &ctx->base_flow,
>                                               ctx->xout->odp_actions,
> -                                              &ctx->xout->wc,
> +                                              wc,

Could move the lone “wc” to the previous line.

>                                               
> ctx->xbridge->masked_set_action);
> 
>         if (ctx->use_recirc) {
> @@ -2659,6 +2786,19 @@ compose_output_action__(struct xlate_ctx *ctx, 
> ofp_port_t ofp_port,
>             nl_msg_put_u32(ctx->xout->odp_actions, OVS_ACTION_ATTR_RECIRC,
>                            xr->recirc_id);
>         } else {
> +            /* XXX: Write better Filter for tunnel port. We can use inport
> +             * int tunnel-port flow to avoid these checks completely. */

Mention that this is about tunnel ingress, i.e. sending tunneled packet to a 
port that does tunnel decapsulation.

> +            if (ofp_port == OFPP_LOCAL &&
> +                ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) {
> +                odp_port_t odp_tnl_port;
> +
> +                odp_tnl_port = tnl_port_map_lookup(flow, wc);
> +                if (odp_tnl_port != ODPP_NONE) {
> +                    nl_msg_put_odp_port(ctx->xout->odp_actions, 
> OVS_ACTION_ATTR_TUNNEL_POP,
> +                                        odp_tnl_port);
> +                    goto out;
> +                }
> +            }
>             add_ipfix_output_action(ctx, out_port);
>             nl_msg_put_odp_port(ctx->xout->odp_actions, 
> OVS_ACTION_ATTR_OUTPUT,
>                                 out_port);
> diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
> index d965d38..8bb3fd3 100644
> --- a/ofproto/ofproto-dpif.c
> +++ b/ofproto/ofproto-dpif.c
> @@ -59,6 +59,7 @@
> #include "ofproto-dpif-upcall.h"
> #include "ofproto-dpif-xlate.h"
> #include "poll-loop.h"
> +#include "ovs-router.h"
> #include "seq.h"
> #include "simap.h"
> #include "smap.h"
> @@ -280,6 +281,10 @@ struct dpif_backer {
>     /* Maximum number of MPLS label stack entries that the datapath supports
>      * in a match */
>     size_t max_mpls_depth;
> +
> +    /* True if the datapath supports tnl_push and pop actions. */
> +    bool enable_tnl_push_pop;
> +    struct atomic_count tnl_count;
> };
> 

(snip)

> @@ -1759,26 +1787,33 @@ static void
> port_modified(struct ofport *port_)
> {
>     struct ofport_dpif *port = ofport_dpif_cast(port_);
> +    char namebuf[NETDEV_VPORT_NAME_BUFSIZE];
> +    struct netdev *netdev = port->up.netdev;
> 
>     if (port->bundle && port->bundle->bond) {
> -        bond_slave_set_netdev(port->bundle->bond, port, port->up.netdev);
> +        bond_slave_set_netdev(port->bundle->bond, port, netdev);
>     }
> 
>     if (port->cfm) {
> -        cfm_set_netdev(port->cfm, port->up.netdev);
> +        cfm_set_netdev(port->cfm, netdev);
>     }
> 
>     if (port->bfd) {
> -        bfd_set_netdev(port->bfd, port->up.netdev);
> +        bfd_set_netdev(port->bfd, netdev);
>     }
> 
>     ofproto_dpif_monitor_port_update(port, port->bfd, port->cfm,
>                                      port->up.pp.hw_addr);
> 
> -    if (port->is_tunnel && tnl_port_reconfigure(port, port->up.netdev,
> -                                                port->odp_port)) {
> -        ofproto_dpif_cast(port->up.ofproto)->backer->need_revalidate =
> -            REV_RECONFIGURE;
> +    netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf);
> +

Could move this inside the if statement below?

> +    if (port->is_tunnel) {
> +        struct ofproto_dpif *ofproto = ofproto_dpif_cast(port->up.ofproto);
> +
> +        if (tnl_port_reconfigure(port, netdev, port->odp_port,
> +                                 ovs_native_tunneling_is_on(ofproto), 
> namebuf)) {
> +            ofproto->backer->need_revalidate = REV_RECONFIGURE;
> +        }
>     }
> 
>     ofport_update_peer(port);
> @@ -3462,8 +3497,10 @@ ofproto_dpif_execute_actions(struct ofproto_dpif 
> *ofproto,
>     struct xlate_in xin;
>     ofp_port_t in_port;
>     struct dpif_execute execute;
> +    struct ds ds;
>     int error;
> 
> +    ds_init(&ds);
>     ovs_assert((rule != NULL) != (ofpacts != NULL));
> 
>     dpif_flow_stats_extract(flow, packet, time_msec(), &stats);
> @@ -3481,6 +3518,10 @@ ofproto_dpif_execute_actions(struct ofproto_dpif 
> *ofproto,
> 
>     execute.actions = ofpbuf_data(xout.odp_actions);
>     execute.actions_len = ofpbuf_size(xout.odp_actions);
> +
> +    format_odp_actions(&ds, ofpbuf_data(xout.odp_actions),
> +                           ofpbuf_size(xout.odp_actions));
> +

Seems like a remnant of debugging?

>     execute.packet = packet;
>     execute.md = pkt_metadata_from_flow(flow);
>     execute.needs_help = (xout.slow & SLOW_ACTION) != 0;
> @@ -4964,7 +5005,36 @@ ofproto_unixctl_dpif_dump_flows(struct unixctl_conn 
> *conn,
> }
> 
> static void
> -ofproto_dpif_unixctl_init(void)
> +ofproto_revalidate_all_backer(void)

Maybe “ofproto_revalidate_all_backers()”?

> +{
> +    const struct shash_node **backers;
> +    int i;
> +
> +    backers = shash_sort(&all_dpif_backers);
> +    for (i = 0; i < shash_count(&all_dpif_backers); i++) {
> +        struct dpif_backer *backer = backers[i]->data;
> +        backer->need_revalidate = REV_RECONFIGURE;
> +    }
> +    free(backers);
> +}
> +

(snip)

  Jarno

_______________________________________________
dev mailing list
dev@openvswitch.org
http://openvswitch.org/mailman/listinfo/dev

Reply via email to