With comments below: Acked-by: Jarno Rajahalme <jrajaha...@nicira.com>
On Oct 16, 2014, at 11:38 AM, Pravin B Shelar <pshe...@nicira.com> wrote: > diff --git a/README-native-tunneling b/README-native-tunneling > new file mode 100644 > index 0000000..5cc17ec > --- /dev/null > +++ b/README-native-tunneling > @@ -0,0 +1,83 @@ > + > +Open vSwitch support tunneling in userspace. Tunneling is implemented in “supports" > +platform independent way. > + > +Setup: > +====== > +Setup physical bridges for all physical interfaces. create integration > bridge. “Create" > +Add vxlan port to int-bridge. Assign IP address to physical bridge where > +vxlan traffic is expected. > + Should VXLAN be capitalized? > +Example: > +======== > +If you want to connect to vxlan tunnel endpoint (logical ip: 192.168.1.2 > +and remote physical ip: 172.168.1.2) that you want to connect to. > + The last “that you want to connect to” is redundant. > +In this case you need to configure OVS bridges as follows. > + > +1. Let assume 172.168.1.2/24 network is reachable via eth1 create physical > bridge br-eth1 “Lets” Put “create physical bridge br-eth1” on a line of its own. > + assign ip address (172.168.1.1/24) to br-eth1, Add eth1 to br-eth1 > +2. Check ovs cached routes using appctl command > + ovs-appctl ovs/route/show > + Add tunnel route if not present in OVS route table. > + ovs-appctl ovs/route/add 172.168.1.1 255.255.255.0 br-eth1 “172.168.1.1/24” would be preferable format, instead of “172.168.1.1 255.255.255.0”. > +3. Add integration brdge int-br and add tunnel port using standard syntax. > + ovs-vsctl add-port int-br vxlan0 -- set interface vxlan0 type=vxlan > options:remote_ip=172.168.1.2 > +4. Assign IP address to int-br, So final topology looks like: > + > + > + 192.168.1.1/24 > + +--------------+ > + | int-br | 192.168.1.2/24 > + +--------------+ +--------------+ > + | vxlan0 | | vxlan0 | > + +--------------+ +--------------+ > + | | > + | | > + | | > + 172.168.1.1/24 | > + +--------------+ | > + | br-eth1 | 172.168.1.2/24 > + +--------------+ +---------------+ > + | eth1 |----------------------------------| eth1 | > + +--------------+ +---------------- > + > + Host A with OVS. Remote host. > + > +With this setup, ping to vxlan target device (192.168.1.2) should work > +There are following commands that shows internal tables: > + > +Tunneling related commands: > +=========================== > +Tunnel routing table: > + To Add route: > + ovs-appctl ovs/route/add <IP address>/<prefix length> > <output-bridge-name> <gw> > + To see all routes configured: > + ovs-appctl ovs/route/show > + To del route: > + ovs-appctl ovs/route/del <IP address>/<prefix length> > + > +ARP: > + To see arp cache content: > + ovs-appctl tnl/arp/show > + To flush arp cache: > + ovs-appctl tnl/arp/flush > + > +To check tunnel ports listening in vswitchd: > + ovs-appctl tnl/ports/show > + > +To range for VxLan udp source port: “To set range" > + To set: > + ovs-appctl tnl/egress_port_range <num1> <num2> > + Shows Current range: > + ovs-appctl tnl/egress_port_range > + > +To check datapath ports: > + ovs-appctl dpif/show > + > +To check datapath flows: > + ovs-appctl dpif/dump-flows > + > +Contact > +======= > +b...@openvswitch.org > diff --git a/datapath/linux/compat/include/linux/openvswitch.h > b/datapath/linux/compat/include/linux/openvswitch.h > index b2257e6..ff7c9d9 100644 > --- a/datapath/linux/compat/include/linux/openvswitch.h > +++ b/datapath/linux/compat/include/linux/openvswitch.h > @@ -580,6 +580,15 @@ struct ovs_action_hash { > uint32_t hash_basis; > }; > > +#define TNL_PUSH_HEADER_SIZE 128 > +struct ovs_action_push_tnl { > + uint32_t tnl_port; > + uint32_t out_port; > + uint32_t header_len; > + uint32_t tnl_type; /* For logging. */ > + uint8_t header[TNL_PUSH_HEADER_SIZE]; > +}; > + Is this action always fixed length, or is the real length determined by the ‘header_len’ (+ NL padding)? > /** > * enum ovs_action_attr - Action types. > * > @@ -633,6 +642,10 @@ enum ovs_action_attr { > * data immediately followed by a mask. > * The data must be zero for the unmasked > * bits. */ > (snip) > diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c > index cea7c88..a60eab4 100644 > --- a/lib/dpif-netdev.c > +++ b/lib/dpif-netdev.c > @@ -62,6 +62,7 @@ > #include "shash.h" > #include "sset.h" > #include "timeval.h" > +#include "tnl-arp-cache.h" > #include "unixctl.h" > #include "util.h" > #include "vlog.h" > @@ -204,6 +205,7 @@ struct dp_netdev { > * for pin of pmd threads. */ > size_t n_dpdk_rxqs; > char *pmd_cmask; > + uint64_t last_tnl_conf_seq; > }; > > static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev > *dp, > @@ -582,6 +584,7 @@ create_dp_netdev(const char *name, const struct > dpif_class *class, > return error; > } > > + dp->last_tnl_conf_seq = seq_read(tnl_conf_seq); > *dpp = dp; > return 0; > } > @@ -2054,12 +2057,13 @@ dp_netdev_process_rxq_port(struct > dp_netdev_pmd_thread *pmd, > } > } > > -static void > +static bool Add a comment on the meaning of the return value. > dpif_netdev_run(struct dpif *dpif) > { > struct dp_netdev_port *port; > struct dp_netdev *dp = get_dp_netdev(dpif); > struct dp_netdev_pmd_thread *non_pmd = dp_netdev_get_nonpmd(dp); > + uint64_t new_tnl_seq; > > ovs_mutex_lock(&dp->non_pmd_mutex); > CMAP_FOR_EACH (port, node, &dp->ports) { > @@ -2072,6 +2076,14 @@ dpif_netdev_run(struct dpif *dpif) > } > } > ovs_mutex_unlock(&dp->non_pmd_mutex); > + tnl_arp_cache_run(); > + new_tnl_seq = seq_read(tnl_conf_seq); > + > + if (dp->last_tnl_conf_seq != new_tnl_seq) { > + dp->last_tnl_conf_seq = new_tnl_seq; > + return true; > + } > + return false; > } > > static void > @@ -2091,6 +2103,7 @@ dpif_netdev_wait(struct dpif *dpif) > } > } > ovs_mutex_unlock(&dp_netdev_mutex); > + seq_wait(tnl_conf_seq, dp->last_tnl_conf_seq); > } > > struct rxq_poll { > @@ -2785,15 +2798,48 @@ dpif_netdev_register_upcall_cb(struct dpif *dpif, > upcall_callback *cb, > static void > dp_netdev_drop_packets(struct dpif_packet ** packets, int cnt, bool may_steal) > { > - int i; > - > if (may_steal) { > + int i; > + > for (i = 0; i < cnt; i++) { > dpif_packet_delete(packets[i]); > } > } > } > > +static int > +push_tunnel_action(const struct dp_netdev *dp, > + const struct nlattr *attr, > + struct dpif_packet **packets, int cnt) > +{ > + struct dp_netdev_port *tun_port; > + struct ovs_action_push_tnl *data; > + > + data = (struct ovs_action_push_tnl *) nl_attr_get(attr); > + > + tun_port = dp_netdev_lookup_port(dp, data->tnl_port); > + if (!tun_port) { > + return -EINVAL; > + } > + netdev_push_header(tun_port->netdev, packets, cnt, data); > + > + return 0; > +} > + > +static void > +dp_netdev_clone_pkt_batch(struct dpif_packet **tnl_pkt, > + struct dpif_packet **packets, int cnt) > +{ > + int i; > + > + for (i = 0; i < cnt; i++) { > + struct dpif_packet *inner_pkt; > + > + inner_pkt = dpif_packet_clone(packets[i]); > + tnl_pkt[i] = inner_pkt; Variable name “inner_pkt” is not meaningful here, could use just “pkt”? > + } > +} > + > (snip) > diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h > index 65cf505..4ae1658 100644 > --- a/lib/dpif-provider.h > +++ b/lib/dpif-provider.h > @@ -132,8 +132,9 @@ struct dpif_class { > * the 'close' member function. */ > int (*destroy)(struct dpif *dpif); > > - /* Performs periodic work needed by 'dpif', if any is necessary. */ > - void (*run)(struct dpif *dpif); > + /* Performs periodic work needed by 'dpif', if any is necessary. > + * Return true if need to revalidate. */ “Returns” > + bool (*run)(struct dpif *dpif); > > /* Arranges for poll_block() to wake up if the "run" member function needs > * to be called for 'dpif'. */ > diff --git a/lib/dpif.c b/lib/dpif.c > index d088f68..b2f05c9 100644 > --- a/lib/dpif.c > +++ b/lib/dpif.c > @@ -41,6 +41,8 @@ > #include "shash.h" > #include "sset.h" > #include "timeval.h" > +#include "tnl-arp-cache.h" > +#include "tnl-ports.h" > #include "util.h" > #include "valgrind.h" > #include "vlog.h" > @@ -114,6 +116,8 @@ dp_initialize(void) > } > dpctl_unixctl_register(); > ovsthread_once_done(&once); > + tnl_port_map_init(); > + tnl_arp_cache_init(); > } > } > > @@ -402,12 +406,13 @@ dpif_close(struct dpif *dpif) > } > > /* Performs periodic work needed by 'dpif'. */ > -void > +bool > dpif_run(struct dpif *dpif) > { > if (dpif->dpif_class->run) { > - dpif->dpif_class->run(dpif); > + return dpif->dpif_class->run(dpif); > } > + return false; > } > > /* Arranges for poll_block() to wake up when dp_run() needs to be called for > @@ -1002,6 +1007,8 @@ dpif_execute_helper_cb(void *aux_, struct dpif_packet > **packets, int cnt, > > switch ((enum ovs_action_attr)type) { > case OVS_ACTION_ATTR_OUTPUT: > + case OVS_ACTION_ATTR_TUNNEL_PUSH: > + case OVS_ACTION_ATTR_TUNNEL_POP: > case OVS_ACTION_ATTR_USERSPACE: > case OVS_ACTION_ATTR_RECIRC: { > struct dpif_execute execute; > @@ -1592,3 +1599,9 @@ log_flow_get_message(const struct dpif *dpif, const > struct dpif_flow_get *get, > get->flow->actions, get->flow->actions_len); > } > } > + > +bool dpif_support_tnl_push_pop(const struct dpif *dpif) “dpif_supports_tnl_push_pop” > +{ > + return !strcmp(dpif->dpif_class->type, "netdev") || > + !strcmp(dpif->dpif_class->type, "dummy"); > +} > (snip) > diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c > index b3b8a11..8885cbb 100644 > --- a/lib/netdev-vport.c > +++ b/lib/netdev-vport.c > @@ -25,21 +25,30 @@ > #include <sys/ioctl.h> > (snip) > +static int > +parse_gre_header(struct ofpbuf *packet, > + struct flow_tnl *tnl) > +{ > + const struct gre_base_hdr *greh; > + ovs_be32 *options; > + > + greh = ip_extract_md(packet, tnl); > + if (!greh) { > + return -EINVAL; > + } > + > + if (greh->flags & (GRE_VERSION | GRE_ROUTING)) { > + return -EINVAL; > + } > + Is there any guarantee that the packet is actually long enough to contain all of the GRE header? > + options = (ovs_be32 *)(greh + 1); > + if (greh->flags & GRE_CSUM) { > + ovs_be32 pkt_csum; > + > + pkt_csum = csum(greh, ofpbuf_size(packet) - > + sizeof (struct eth_header) - > + sizeof (struct ip_header)); > + if (pkt_csum) { > + return -EINVAL; > + } > + tnl->flags = FLOW_TNL_F_CSUM; > + options++; > + } > + > + if (greh->flags & GRE_KEY) { > + tnl->tun_id = (ovs_be64) ((uint64_t)(*(uint32_t*)options) << 32); > + tnl->flags |= FLOW_TNL_F_KEY; > + options++; > + } > + > + if (greh->flags & GRE_SEQ) { > + options++; > + } > + > + return ((uint8_t *)options - (uint8_t *) greh) ; > +} > + > +static void > +gre_extract_md(struct dpif_packet *dpif_pkt) > +{ > + struct ofpbuf *packet = &dpif_pkt->ofpbuf; > + struct pkt_metadata *md = &dpif_pkt->md; > + struct flow_tnl *tnl = &md->tunnel; > + int greh_len, tnl_hlen; > + > + memset(md, 0, sizeof(*md)); > + > + greh_len = parse_gre_header(packet, tnl); > + if (greh_len <= 0) { > + memset(md, 0, sizeof(*md)); Would it be enough to reset the *tnl here? > + return; > + } > + > + tnl_hlen = sizeof (struct eth_header) + sizeof (struct ip_header) + > greh_len; > + > + if (tnl_hlen > ofpbuf_size(packet)) { > + memset(md, 0, sizeof(*md)); And here? > + return; > + } > + > + ofpbuf_reset_packet(packet, tnl_hlen); > +} > + > +static int > +netdev_gre_pop_header(struct netdev *netdev_ OVS_UNUSED, > + struct dpif_packet **pkt, int cnt) > +{ > + int i; > + > + for (i = 0; i < cnt; i++) { > + gre_extract_md(pkt[i]); > + } > + return 0; > +} > + > +static void > +netdev_gre_push_header__(struct ofpbuf *packet, > + const void *header, int size) > +{ > + struct gre_base_hdr *greh; > + int ip_tot_size; > + > + greh = push_ip_header(packet, header, size, &ip_tot_size); > + > + if (greh->flags & GRE_CSUM) { > + ovs_be32 *options = (ovs_be32 *) (greh + 1); > + > + *options = csum(greh, ip_tot_size - sizeof (struct ip_header)); > + } > +} > + > +static int > +netdev_gre_push_header(const struct netdev *netdev OVS_UNUSED, > + struct dpif_packet **packets, int cnt, > + const struct ovs_action_push_tnl *data) > +{ > + int i; > + > + for (i = 0; i < cnt; i++) { > + netdev_gre_push_header__(&packets[i]->ofpbuf, > + data->header, data->header_len); > + packets[i]->md = PKT_METADATA_INITIALIZER(data->out_port); > + } > + return 0; > +} > + > + > +static int > +netdev_gre_build_header(const struct netdev *netdev, > + struct ovs_action_push_tnl *data) > +{ > + struct netdev_vport *dev = netdev_vport_cast(netdev); > + struct netdev_tunnel_config *tnl_cfg; > + struct ip_header *ip; > + struct gre_base_hdr *greh; > + ovs_be32 *options; > + int hlen; > + > + /* XXX: RCUfy tnl_cfg. */ > + ovs_mutex_lock(&dev->mutex); > + tnl_cfg = &dev->tnl_cfg; > + > + ip = ip_hdr(data->header); > + ip->ip_proto = IPPROTO_GRE; > + > + greh = gre_hdr(ip); > + greh->protocol = htons(ETH_TYPE_TEB); > + greh->flags = 0; > + > + options = (ovs_be32 *) (greh + 1); > + if (tnl_cfg->csum) { > + greh->flags |= GRE_CSUM; > + *options = 0; > + options++; > + } > + > + if (tnl_cfg->out_key_present) { > + greh->flags = GRE_KEY; > + *options = (ovs_be32) (tnl_cfg->out_key >> 32); > + options++; > + } > + > + ovs_mutex_unlock(&dev->mutex); > + > + hlen = (uint8_t *) options - (uint8_t *) greh; > + > + data->header_len = sizeof (struct eth_header) + sizeof *ip + hlen; > + data->tnl_type = OVS_VPORT_TYPE_GRE; > + return 0; > +} > + > +static void > +vxlan_extract_md(struct dpif_packet *dpif_pkt) > +{ > + struct ofpbuf *packet = &dpif_pkt->ofpbuf; > + struct pkt_metadata *md = &dpif_pkt->md; > + struct flow_tnl *tnl = &md->tunnel; > + struct udp_header *udp; > + struct vxlanhdr *vxh; > + > + memset(md, 0, sizeof(*md)); > + if (VXLAN_HLEN > ofpbuf_size(packet)) { > + return; > + } > + > + udp = ip_extract_md(packet, tnl); > + if (!udp) { > + return; > + } > + vxh = (struct vxlanhdr *) ((const char *)udp + sizeof(*udp)); > + > + if (vxh->vx_flags != htonl(VXLAN_FLAGS) || > + (vxh->vx_vni & htonl(0xff))) { > + VLOG_ERR("invalid vxlan flags=%#x vni=%#x\n", > + ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); > + memset(md, 0, sizeof(*md)); Would it be enough to reset *tnl here? > + return; > + } > + tnl->tp_src = udp->udp_src; > + tnl->tp_dst = udp->udp_dst; > + tnl->tun_id = htonll(ntohl(vxh->vx_vni) >> 8); > + > + ofpbuf_reset_packet(packet, VXLAN_HLEN); > +} > + > + (snip) > diff --git a/lib/odp-util.c b/lib/odp-util.c > index 8f5ed08..2db8a00 100644 > --- a/lib/odp-util.c > +++ b/lib/odp-util.c > @@ -24,18 +24,22 @@ > (snip) > static int > +ovs_parse_tnl_push(const char *s, struct ovs_action_push_tnl *data) > +{ > + struct eth_header *eth; > + struct ip_header *ip; > + struct udp_header *udp; > + struct gre_base_hdr *greh; > + uint16_t dl_type, udp_src, udp_dst; > + ovs_be32 sip, dip; > + uint32_t tnl_type = 0, header_len = 0; > + void *l3, *l4; > + int n = 0; > + > + if (!ovs_scan(s, "tnl_push(tnl_port(%"SCNi32"),%n", &data->tnl_port, > &n)) { > + return 0; > + } > + > + eth = (struct eth_header *) data->header; > + l3 = (data->header + sizeof *eth); > + l4 = ((uint8_t *) l3 + sizeof (struct ip_header)); > + ip = (struct ip_header *) l3; > + if (!ovs_scan(&s[n], "header(size=%"SCNi32",type=%"SCNi32"," > + "eth(dst="ETH_ADDR_SCAN_FMT",%n", &data->tnl_type, > + &data->header_len, > + ETH_ADDR_SCAN_ARGS(eth->eth_dst), &n)) { > + return -EINVAL; > + } > + > + if (!ovs_scan(&s[n], "src="ETH_ADDR_SCAN_FMT",%n", > + ETH_ADDR_SCAN_ARGS(eth->eth_src), &n)) { > + return -EINVAL; > + } > + if (!ovs_scan(&s[n], "dl_type=%"SCNi16",%n", &dl_type, &n)) { > + return -EINVAL; > + } > + eth->eth_type = htons(dl_type); > + > + /* IPv4 */ > + if (!ovs_scan(&s[n], > "ipv4(src="IP_SCAN_FMT",dst="IP_SCAN_FMT",proto=%"SCNi8 > + ",tos=%"SCNi8",ttl=%"SCNi8",frag=%"SCNi16"),%n", > + IP_SCAN_ARGS(&sip), > + IP_SCAN_ARGS(&dip), > + &ip->ip_proto, &ip->ip_tos, > + &ip->ip_ttl, &ip->ip_frag_off, &n)) { > + return -EINVAL; > + } > + put_16aligned_be32(&ip->ip_src, sip); > + put_16aligned_be32(&ip->ip_dst, dip); > + > + udp = (struct udp_header *) l4; > + greh = (struct gre_base_hdr *) l4; > + > + if (ovs_scan(&s[n], "udp(src=%"SCNi16",dst=%"SCNi16"),%n", > + &udp_src, &udp_dst, &n)) { > + struct vxlanhdr *vxh; > + > + udp->udp_src = htons(udp_src); > + udp->udp_dst = htons(udp_dst); > + udp->udp_len = 0; > + udp->udp_csum = 0; > + > + vxh = (struct vxlanhdr *) (udp + 1); > + if (!ovs_scan(&s[n], "vxlan(flags=%"SCNi32",vni=%"SCNi32")),%n", > + &vxh->vx_flags, &vxh->vx_vni, &n)) { > + return -EINVAL; > + } > + tnl_type = OVS_VPORT_TYPE_VXLAN; > + header_len = sizeof *eth + sizeof *ip + > + sizeof *udp + sizeof *vxh; > + } else if (ovs_scan(&s[n], "gre((flags=%"SCNi16",proto=%"SCNi16"),%n", > + &greh->flags, &greh->protocol, &n)){ > + > + ovs_be32 *options = (ovs_be32 *) (greh + 1); > + > + if (greh->flags & GRE_CSUM) { > + if (!ovs_scan(&s[n], ",csum=%"SCNi32"%n", options, &n)) { > + return -EINVAL; > + } > + > + options++; > + } > + if (greh->flags & GRE_KEY) { > + uint32_t key; > + > + if (!ovs_scan(&s[n], ",seq=%"SCNi32"%n", &key, &n)) { > + return -EINVAL; > + } > + > + *options = htonl(key); > + options++; > + } > + if (greh->flags & GRE_SEQ) { > + if (!ovs_scan(&s[n], ",seq=%"SCNi32"%n", options, &n)) { > + return -EINVAL; > + } > + options++; > + } > + if (!ovs_scan(&s[n], ")%n", &n)) { > + return -EINVAL; > + } > + > + tnl_type = OVS_VPORT_TYPE_GRE; > + header_len = sizeof *eth + sizeof *ip + > + ((uint8_t *) options - (uint8_t *) greh); > + } else { > + return -EINVAL; > + } > + > + /* check tune meta data. */ “Check tunnel metadata.” > + if (data->tnl_type != tnl_type) { > + return -EINVAL; > + } > + if (data->header_len != header_len) { > + return -EINVAL; > + } > + > + return n; > +} > + > +static int > parse_odp_action(const char *s, const struct simap *port_names, > struct ofpbuf *actions) > { > @@ -887,6 +1101,28 @@ parse_odp_action(const char *s, const struct simap > *port_names, > } > } > > + { > + uint32_t port; > + int n; > + > + if (ovs_scan(s, "tnl_pop(%"SCNi32")%n", &port, &n)) { > + nl_msg_put_u32(actions, OVS_ACTION_ATTR_TUNNEL_POP, port); > + return n; > + } > + } > + > + { > + struct ovs_action_push_tnl data; > + int n; > + > + n = ovs_parse_tnl_push(s, &data); > + if (n > 0) { > + odp_put_tunnel_push_action(actions, &data); > + return n; > + } else if (n < 0) { > + return n; > + } > + } > return -EINVAL; > } > > @@ -3543,6 +3779,15 @@ odp_put_tunnel_action(const struct flow_tnl *tunnel, > tun_key_to_attr(odp_actions, tunnel); > nl_msg_end_nested(odp_actions, offset); > } > + > +void > +odp_put_tunnel_push_action(struct ofpbuf *odp_actions, > + struct ovs_action_push_tnl *data) > +{ > + nl_msg_put_unspec(odp_actions, OVS_ACTION_ATTR_TUNNEL_PUSH, > + data, sizeof (*data)); Could we not push the unused portion of the ovs_action_push_tnl here? > +} > + > (snip) > diff --git a/lib/tnl-arp-cache.c b/lib/tnl-arp-cache.c > new file mode 100644 > index 0000000..6053eb6 > --- /dev/null > +++ b/lib/tnl-arp-cache.c > @@ -0,0 +1,212 @@ > (snip) > +int > +tnl_arp_snoop(const struct flow *flow, struct flow_wildcards *wc, > + const char name[IFNAMSIZ]) > +{ > + struct tnl_arp_entry *arp; > + > + if (flow->dl_type != htons(ETH_TYPE_ARP)) { > + return EINVAL; > + } > + > + memset(&wc->masks.dl_type, 0xff, sizeof wc->masks.dl_type); > + memset(&wc->masks.nw_proto, 0xff, sizeof wc->masks.nw_proto); > + memset(&wc->masks.nw_src, 0xff, sizeof wc->masks.nw_src); > + memset(&wc->masks.nw_dst, 0xff, sizeof wc->masks.nw_dst); > + memset(&wc->masks.arp_sha, 0xff, sizeof wc->masks.arp_sha); > + memset(&wc->masks.arp_tha, 0xff, sizeof wc->masks.arp_tha); > + > + ovs_mutex_lock(&mutex); > + arp = __tnl_arp_lookup(name, flow->nw_src); > + if (arp) { > + if (!memcmp(arp->mac, flow->arp_sha, ETH_ADDR_LEN)) { > + arp->expires = time_now() + ARP_ENTRY_DEFAULT_IDLE_TIME; > + ovs_mutex_unlock(&mutex); > + return 0; > + } > + tnl_arp_delete(arp); > + } > + It should be pretty easy to make the successful lookup lockless, but that can be left as a future enhancement. > + arp = xmalloc(sizeof *arp); > + > + arp->ip = flow->nw_src; > + memcpy(arp->mac, flow->arp_sha, ETH_ADDR_LEN); > + arp->expires = time_now() + ARP_ENTRY_DEFAULT_IDLE_TIME; > + strncpy(arp->br_name, name, IFNAMSIZ); > + cmap_insert(&table, &arp->cmap_node, arp->ip); > + ovs_mutex_unlock(&mutex); > + return 0; > +} > + > +void > +tnl_arp_cache_run(void) > +{ > + struct tnl_arp_entry *arp; > + bool changed = false; > + > + ovs_mutex_lock(&mutex); > + CMAP_FOR_EACH(arp, cmap_node, &table) { > + if (arp->expires <= time_now()) { > + tnl_arp_delete(arp); > + changed = true; It is likely that this will be periodically triggered also for active tunnels, as when nothing changes and the tunnel headers are pushed from a cached flow, there is nothing that keeps the ARP entry fresh. Maybe tap into xlate_cache_netdev() to reforest the ARP entry for a tunnel port when stats are pushed on it? > + } > + } > + ovs_mutex_unlock(&mutex); > + > + if (changed) { > + seq_change(tnl_conf_seq); > + } > +} > + (snip) > diff --git a/lib/tnl-arp-cache.h b/lib/tnl-arp-cache.h > new file mode 100644 > index 0000000..f68c3b4 > --- /dev/null > +++ b/lib/tnl-arp-cache.h > @@ -0,0 +1,47 @@ > +/* > + * Copyright (c) 2014 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef OVS_TNL_ARP_CACHE_H > +#define OVS_TNL_ARP_CACHE_H 1 Can drop the “OVS_” prefix here, so that the macro name matches the file name. > + > +#include <errno.h> > +#include <inttypes.h> > +#include <stddef.h> > +#include <stdint.h> > +#include <string.h> > +#include <net/if.h> > +#include <sys/socket.h> > + Likely none of the above are needed. > +#include "flow.h" > +#include “netdev.h" Likely not needed. > +#include "packets.h" > +#include "util.h" > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + I recall we do not do this unless someone requests it. > +int tnl_arp_snoop(const struct flow *flow, struct flow_wildcards *wc, > + const char dev_name[]); > +int tnl_arp_lookup(const char dev_name[], ovs_be32 dst, uint8_t > mac[ETH_ADDR_LEN]); > +void tnl_arp_cache_init(void); > +void tnl_arp_cache_run(void); > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif (snip) > diff --git a/lib/tnl-ports.h b/lib/tnl-ports.h > new file mode 100644 > index 0000000..bc4c954 > --- /dev/null > +++ b/lib/tnl-ports.h > @@ -0,0 +1,49 @@ > +/* > + * Copyright (c) 2014 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef OVS_TNL_PORT_H > +#define OVS_TNL_PORT_H 1 > + > +#include <errno.h> > +#include <stddef.h> > +#include <stdint.h> > +#include <inttypes.h> > +#include <net/if.h> > +#include <sys/socket.h> > + > +#include "flow.h" > +#include "packets.h" > +#include "util.h" > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + > +odp_port_t tnl_port_map_lookup(const struct flow *flow, > + struct flow_wildcards *wc); > + > +void tnl_port_map_insert(odp_port_t port, ovs_be32 ip_dst, ovs_be16 udp_port, > + const char dev_name[]); > + > +void tnl_port_map_delete(ovs_be32 ip_dst OVS_UNUSED, ovs_be16 udp_port > OVS_UNUSED); > + Should not use OVS_UNUSED in function declarations. > +void tnl_port_map_init(void); > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif > diff --git a/lib/vxlan.h b/lib/vxlan.h > new file mode 100644 > index 0000000..c847bdb > --- /dev/null > +++ b/lib/vxlan.h > @@ -0,0 +1,50 @@ > +/* > + * Copyright (c) 2014 Nicira, Inc. > + * > + * Licensed under the Apache License, Version 2.0 (the "License"); > + * you may not use this file except in compliance with the License. > + * You may obtain a copy of the License at: > + * > + * http://www.apache.org/licenses/LICENSE-2.0 > + * > + * Unless required by applicable law or agreed to in writing, software > + * distributed under the License is distributed on an "AS IS" BASIS, > + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. > + * See the License for the specific language governing permissions and > + * limitations under the License. > + */ > + > +#ifndef NETDEV_VXLAN_H > +#define NETDEV_VXLAN_H 1 > + Should be just VXLAN_H to match the file name? > +#include <stddef.h> > +#include <stdint.h> > +#include “list.h" No list here. > +#include "packets.h" > +#include "util.h" > +#include "netdev-dpdk.h” I do not see anything DPDK dependent here. > + > +#ifdef __cplusplus > +extern "C" { > +#endif > + Not needed. > + > +#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. > */ > + > +/* VXLAN protocol header */ > +OVS_PACKED( > +struct vxlanhdr { > + ovs_be32 vx_flags; > + ovs_be32 vx_vni; > +}); > + Should use types from unaligned.h instead of packing? > +#define VXLAN_HLEN (sizeof(struct eth_header) + \ > + sizeof(struct ip_header) + \ > + sizeof(struct udp_header) + \ > + sizeof(struct vxlanhdr)) > + > +#ifdef __cplusplus > +} > +#endif > + > +#endif > diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c > index 48576ad..ce0e482 100644 > --- a/ofproto/ofproto-dpif-xlate.c > +++ b/ofproto/ofproto-dpif-xlate.c > @@ -17,7 +17,12 @@ > #include "ofproto/ofproto-dpif-xlate.h” (snip) > static void > compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port, > bool check_stp) > @@ -2619,9 +2740,15 @@ compose_output_action__(struct xlate_ctx *ctx, > ofp_port_t ofp_port, > entry->u.dev.tx = netdev_ref(xport->netdev); > } > out_port = odp_port; > - commit_odp_tunnel_action(flow, &ctx->base_flow, > - ctx->xout->odp_actions); > - flow->tunnel = flow_tnl; /* Restore tunnel metadata */ > + if (ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) { > + build_tunnel_send(ctx, xport, flow, odp_port); You need to also commit_odp_actions() before calling build_tunnel_send(). > + flow->tunnel = flow_tnl; /* Restore tunnel metadata */ > + goto out; > + } else { > + commit_odp_tunnel_action(flow, &ctx->base_flow, > + ctx->xout->odp_actions); > + flow->tunnel = flow_tnl; /* Restore tunnel metadata */ > + } > } else { > odp_port = xport->odp_port; > out_port = odp_port; > @@ -2641,7 +2768,7 @@ compose_output_action__(struct xlate_ctx *ctx, > ofp_port_t ofp_port, > if (out_port != ODPP_NONE) { > ctx->xout->slow |= commit_odp_actions(flow, &ctx->base_flow, > ctx->xout->odp_actions, > - &ctx->xout->wc, > + wc, Could move the lone “wc” to the previous line. > > ctx->xbridge->masked_set_action); > > if (ctx->use_recirc) { > @@ -2659,6 +2786,19 @@ compose_output_action__(struct xlate_ctx *ctx, > ofp_port_t ofp_port, > nl_msg_put_u32(ctx->xout->odp_actions, OVS_ACTION_ATTR_RECIRC, > xr->recirc_id); > } else { > + /* XXX: Write better Filter for tunnel port. We can use inport > + * int tunnel-port flow to avoid these checks completely. */ Mention that this is about tunnel ingress, i.e. sending tunneled packet to a port that does tunnel decapsulation. > + if (ofp_port == OFPP_LOCAL && > + ovs_native_tunneling_is_on(ctx->xbridge->ofproto)) { > + odp_port_t odp_tnl_port; > + > + odp_tnl_port = tnl_port_map_lookup(flow, wc); > + if (odp_tnl_port != ODPP_NONE) { > + nl_msg_put_odp_port(ctx->xout->odp_actions, > OVS_ACTION_ATTR_TUNNEL_POP, > + odp_tnl_port); > + goto out; > + } > + } > add_ipfix_output_action(ctx, out_port); > nl_msg_put_odp_port(ctx->xout->odp_actions, > OVS_ACTION_ATTR_OUTPUT, > out_port); > diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c > index d965d38..8bb3fd3 100644 > --- a/ofproto/ofproto-dpif.c > +++ b/ofproto/ofproto-dpif.c > @@ -59,6 +59,7 @@ > #include "ofproto-dpif-upcall.h" > #include "ofproto-dpif-xlate.h" > #include "poll-loop.h" > +#include "ovs-router.h" > #include "seq.h" > #include "simap.h" > #include "smap.h" > @@ -280,6 +281,10 @@ struct dpif_backer { > /* Maximum number of MPLS label stack entries that the datapath supports > * in a match */ > size_t max_mpls_depth; > + > + /* True if the datapath supports tnl_push and pop actions. */ > + bool enable_tnl_push_pop; > + struct atomic_count tnl_count; > }; > (snip) > @@ -1759,26 +1787,33 @@ static void > port_modified(struct ofport *port_) > { > struct ofport_dpif *port = ofport_dpif_cast(port_); > + char namebuf[NETDEV_VPORT_NAME_BUFSIZE]; > + struct netdev *netdev = port->up.netdev; > > if (port->bundle && port->bundle->bond) { > - bond_slave_set_netdev(port->bundle->bond, port, port->up.netdev); > + bond_slave_set_netdev(port->bundle->bond, port, netdev); > } > > if (port->cfm) { > - cfm_set_netdev(port->cfm, port->up.netdev); > + cfm_set_netdev(port->cfm, netdev); > } > > if (port->bfd) { > - bfd_set_netdev(port->bfd, port->up.netdev); > + bfd_set_netdev(port->bfd, netdev); > } > > ofproto_dpif_monitor_port_update(port, port->bfd, port->cfm, > port->up.pp.hw_addr); > > - if (port->is_tunnel && tnl_port_reconfigure(port, port->up.netdev, > - port->odp_port)) { > - ofproto_dpif_cast(port->up.ofproto)->backer->need_revalidate = > - REV_RECONFIGURE; > + netdev_vport_get_dpif_port(netdev, namebuf, sizeof namebuf); > + Could move this inside the if statement below? > + if (port->is_tunnel) { > + struct ofproto_dpif *ofproto = ofproto_dpif_cast(port->up.ofproto); > + > + if (tnl_port_reconfigure(port, netdev, port->odp_port, > + ovs_native_tunneling_is_on(ofproto), > namebuf)) { > + ofproto->backer->need_revalidate = REV_RECONFIGURE; > + } > } > > ofport_update_peer(port); > @@ -3462,8 +3497,10 @@ ofproto_dpif_execute_actions(struct ofproto_dpif > *ofproto, > struct xlate_in xin; > ofp_port_t in_port; > struct dpif_execute execute; > + struct ds ds; > int error; > > + ds_init(&ds); > ovs_assert((rule != NULL) != (ofpacts != NULL)); > > dpif_flow_stats_extract(flow, packet, time_msec(), &stats); > @@ -3481,6 +3518,10 @@ ofproto_dpif_execute_actions(struct ofproto_dpif > *ofproto, > > execute.actions = ofpbuf_data(xout.odp_actions); > execute.actions_len = ofpbuf_size(xout.odp_actions); > + > + format_odp_actions(&ds, ofpbuf_data(xout.odp_actions), > + ofpbuf_size(xout.odp_actions)); > + Seems like a remnant of debugging? > execute.packet = packet; > execute.md = pkt_metadata_from_flow(flow); > execute.needs_help = (xout.slow & SLOW_ACTION) != 0; > @@ -4964,7 +5005,36 @@ ofproto_unixctl_dpif_dump_flows(struct unixctl_conn > *conn, > } > > static void > -ofproto_dpif_unixctl_init(void) > +ofproto_revalidate_all_backer(void) Maybe “ofproto_revalidate_all_backers()”? > +{ > + const struct shash_node **backers; > + int i; > + > + backers = shash_sort(&all_dpif_backers); > + for (i = 0; i < shash_count(&all_dpif_backers); i++) { > + struct dpif_backer *backer = backers[i]->data; > + backer->need_revalidate = REV_RECONFIGURE; > + } > + free(backers); > +} > + (snip) Jarno _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev