From 131d3e1292a598c06c9a4382f88e8261730ec391 Mon Sep 17 00:00:00 2001
From: Joseph Glanville <joseph.glanville@orionvm.com.au>
Date: Tue, 17 Jan 2012 03:43:15 +1100
Subject: [PATCH] Implement VXLAN support

---
 datapath/Modules.mk              |    1 +
 datapath/tunnel.h                |    1 +
 datapath/vport-vxlan.c           |  234 ++++++++++++++++++++++++++++++++++++++
 datapath/vport.c                 |    1 +
 datapath/vport.h                 |    1 +
 debian/openvswitch-switch.init   |    1 +
 include/linux/openvswitch.h      |    1 +
 lib/netdev-vport.c               |    7 +
 rhel/etc_init.d_openvswitch      |    2 +
 vswitchd/vswitch.xml             |   27 ++++-
 xenserver/etc_init.d_openvswitch |    2 +
 11 files changed, 273 insertions(+), 5 deletions(-)
 create mode 100644 datapath/vport-vxlan.c

diff --git a/datapath/Modules.mk b/datapath/Modules.mk
index 087cf44..5221862 100644
--- a/datapath/Modules.mk
+++ b/datapath/Modules.mk
@@ -25,6 +25,7 @@ openvswitch_sources = \
 	vport-gre.c \
 	vport-internal_dev.c \
 	vport-netdev.c \
+	vport-vxlan.c \
 	vport-patch.c
 
 openvswitch_headers = \
diff --git a/datapath/tunnel.h b/datapath/tunnel.h
index 9211740..20885cc 100644
--- a/datapath/tunnel.h
+++ b/datapath/tunnel.h
@@ -29,6 +29,7 @@
  */
 #define TNL_T_PROTO_GRE		0
 #define TNL_T_PROTO_CAPWAP	1
+#define TNL_T_PROTO_VXLAN	2
 
 /* These flags are only needed when calling tnl_find_port(). */
 #define TNL_T_KEY_EXACT		(1 << 10)
diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
new file mode 100644
index 0000000..6ff87e8
--- /dev/null
+++ b/datapath/vport-vxlan.c
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2011 Nicira Networks.
+ * Distributed under the terms of the GNU GPL version 2.
+ *
+ * Significant portions of this file may be copied from parts of the Linux
+ * kernel, by Linus Torvalds and others.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/version.h>
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
+
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/net.h>
+#include <linux/udp.h>
+
+#include <net/icmp.h>
+#include <net/ip.h>
+
+#include "tunnel.h"
+#include "vport.h"
+#include "vport-generic.h"
+
+#define VXLAN_DST_PORT 4563
+
+#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
+
+/**
+ * struct vxlanhdr - VXLAN header, located directly after the UDP header
+ * @vx_flags: Must have the exact value %VXLAN_FLAGS (in network byte order).
+ * @vx_vni: VXLAN Network Identifier (VNI) in top 24 bits, low 8 bits zeroed.
+ */
+struct vxlanhdr {
+	__be32 vx_flags;
+	__be32 vx_vni;
+};
+
+static struct vxlanhdr *vxlan_hdr(const struct sk_buff *skb)
+{
+	return (struct vxlanhdr *)(udp_hdr(skb) + 1);
+}
+
+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
+
+static struct socket *vxlan_rcv_socket;
+static int vxlan_n_tunnels;
+
+static int vxlan_hdr_len(const struct tnl_mutable_config *mutable)
+{
+	return VXLAN_HLEN;	/* Fixed size: UDP header + VXLAN header. */
+}
+
+static __be16 get_src_port(const struct sk_buff *skb,
+			   const struct tnl_mutable_config *mutable)
+{
+	/* Convert hash into a port between 32768 and 65535. */
+	return (__force __be16)OVS_CB(skb)->flow->hash | htons(32768);
+}
+static void vxlan_build_header(const struct vport *vport,
+			       const struct tnl_mutable_config *mutable,
+			       void *header)
+{
+	struct udphdr *udph = header;
+	struct vxlanhdr *vxh = (struct vxlanhdr *)(udph + 1);
+
+	udph->dest = htons(VXLAN_DST_PORT);
+	udph->check = 0;
+
+	vxh->vx_flags = htonl(VXLAN_FLAGS);
+	vxh->vx_vni = htonl(be64_to_cpu(mutable->out_key) << 8);
+}
+
+static struct sk_buff *vxlan_update_header(const struct vport *vport,
+					   const struct tnl_mutable_config *mutable,
+					   struct dst_entry *dst,
+					   struct sk_buff *skb)
+{
+	struct udphdr *udph = udp_hdr(skb);
+	struct vxlanhdr *vxh = (struct vxlanhdr *)(udph + 1);
+
+	if (mutable->flags & TNL_F_OUT_KEY_ACTION)
+		vxh->vx_vni = htonl(be64_to_cpu(OVS_CB(skb)->tun_id) << 8);
+
+	udph->source = get_src_port(skb, mutable);
+	udph->len = htons(skb->len - skb_transport_offset(skb));
+
+	/*
+	 * Allow our local IP stack to fragment the outer packet even if the
+	 * DF bit is set as a last resort.  We also need to force selection of
+	 * an IP ID here because Linux will otherwise leave it at 0 if the
+	 * packet originally had DF set.
+	 */
+	skb->local_df = 1;
+	__ip_select_ident(ip_hdr(skb), dst, 0);
+
+	return skb;
+}
+
+/* UDP encap_rcv hook.  Called with rcu_read_lock and BH disabled. */
+static int vxlan_rcv(struct sock *sk, struct sk_buff *skb)
+{
+	struct vport *vport;
+	struct vxlanhdr *vxh;
+	const struct tnl_mutable_config *mutable;
+	struct iphdr *iph;
+	__be64 key;
+
+	if (unlikely(!pskb_may_pull(skb, VXLAN_HLEN + ETH_HLEN)))
+		goto error;
+
+	vxh = vxlan_hdr(skb);
+	if (unlikely(vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+		     vxh->vx_vni & htonl(0xff)))
+		goto error;	/* Bad flags or nonzero low VNI bits. */
+
+	__skb_pull(skb, VXLAN_HLEN);
+
+	key = cpu_to_be64(ntohl(vxh->vx_vni) >> 8);	/* 24-bit VNI. */
+
+	iph = ip_hdr(skb);
+	vport = tnl_find_port(iph->daddr, iph->saddr, key, TNL_T_PROTO_VXLAN,
+			      &mutable);
+	if (unlikely(!vport)) {
+		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
+		goto error;	/* No tunnel port matches this packet. */
+	}
+
+	skb_postpull_rcsum(skb, skb_transport_header(skb), VXLAN_HLEN + ETH_HLEN);
+
+	if (mutable->flags & TNL_F_IN_KEY_MATCH)
+		OVS_CB(skb)->tun_id = key;
+	else
+		OVS_CB(skb)->tun_id = 0;
+
+	tnl_rcv(vport, skb, iph->tos);
+	goto out;
+
+error:
+	kfree_skb(skb);		/* Invalid or unmatched packet: drop it. */
+out:
+	return 0;
+}
+
+static const struct tnl_ops vxlan_tnl_ops = {
+	.tunnel_type	= TNL_T_PROTO_VXLAN,
+	.ipproto	= IPPROTO_UDP,
+	.hdr_len	= vxlan_hdr_len,
+	.build_header	= vxlan_build_header,
+	.update_header	= vxlan_update_header,
+};
+
+/* Random value.  Irrelevant as long as it's not 0 since we set the handler. */
+#define UDP_ENCAP_VXLAN 10
+static int vxlan_init(void)
+{
+	struct sockaddr_in sin = {	/* Initializer also zeroes sin_zero. */
+		.sin_family = AF_INET,
+		.sin_addr.s_addr = htonl(INADDR_ANY),
+		.sin_port = htons(VXLAN_DST_PORT),
+	};
+	int err;
+
+	if (vxlan_n_tunnels++)
+		return 0;		/* Not the first tunnel: nothing to set up. */
+
+	err = sock_create(AF_INET, SOCK_DGRAM, 0, &vxlan_rcv_socket);
+	if (err)
+		goto error;
+
+	err = kernel_bind(vxlan_rcv_socket, (struct sockaddr *)&sin,
+			  sizeof(struct sockaddr_in));
+	if (err)
+		goto error_sock;
+
+	udp_sk(vxlan_rcv_socket->sk)->encap_type = UDP_ENCAP_VXLAN;
+	udp_sk(vxlan_rcv_socket->sk)->encap_rcv = vxlan_rcv;
+
+	return 0;
+
+error_sock:
+	sock_release(vxlan_rcv_socket);
+error:
+	pr_warn("cannot register vxlan protocol handler\n");
+	vxlan_n_tunnels--;
+	return err;
+}
+
+static void vxlan_uninit(void)
+{
+	if (--vxlan_n_tunnels == 0)	/* Last tunnel is gone. */
+		sock_release(vxlan_rcv_socket);
+}
+
+static struct vport *vxlan_create(const struct vport_parms *parms)
+{
+	struct vport *port;
+	int err;
+
+	err = vxlan_init();
+	if (err)
+		return ERR_PTR(err);
+
+	port = tnl_create(parms, &vxlan_vport_ops, &vxlan_tnl_ops);
+	if (IS_ERR(port))
+		vxlan_uninit();	/* Undo vxlan_init() for the failed port. */
+	return port;
+}
+
+static void vxlan_destroy(struct vport *vport)
+{
+	vxlan_uninit();	/* Drop this port's receive-socket reference. */
+	tnl_destroy(vport);
+}
+
+const struct vport_ops vxlan_vport_ops = {
+	.type		= OVS_VPORT_TYPE_VXLAN,
+	.flags		= VPORT_F_TUN_ID,
+	.create		= vxlan_create,
+	.destroy	= vxlan_destroy,
+	.set_addr	= tnl_set_addr,
+	.get_name	= tnl_get_name,
+	.get_addr	= tnl_get_addr,
+	.get_options	= tnl_get_options,
+	.set_options	= tnl_set_options,
+	.get_dev_flags	= vport_gen_get_dev_flags,
+	.is_running	= vport_gen_is_running,
+	.get_operstate	= vport_gen_get_operstate,
+	.send		= tnl_send,
+};
+#else
+#warning VXLAN tunneling will not be available on kernels before 2.6.26
+#endif /* Linux kernel < 2.6.26 */
diff --git a/datapath/vport.c b/datapath/vport.c
index 2e36803..263ae05 100644
--- a/datapath/vport.c
+++ b/datapath/vport.c
@@ -31,6 +31,7 @@ static const struct vport_ops *base_vport_ops_list[] = {
 	&gre_vport_ops,
 #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26)
 	&capwap_vport_ops,
+	&vxlan_vport_ops,
 #endif
 };
 
diff --git a/datapath/vport.h b/datapath/vport.h
index 2c9c4aa..e3a1c54 100644
--- a/datapath/vport.h
+++ b/datapath/vport.h
@@ -252,5 +252,6 @@ extern const struct vport_ops internal_vport_ops;
 extern const struct vport_ops patch_vport_ops;
 extern const struct vport_ops gre_vport_ops;
 extern const struct vport_ops capwap_vport_ops;
+extern const struct vport_ops vxlan_vport_ops;
 
 #endif /* vport.h */
diff --git a/debian/openvswitch-switch.init b/debian/openvswitch-switch.init
index 3d187a0..98dd2f5 100755
--- a/debian/openvswitch-switch.init
+++ b/debian/openvswitch-switch.init
@@ -64,6 +64,7 @@ start () {
     "$@" || exit $?
 
     ovs_ctl --protocol=gre enable-protocol
+    ovs_ctl --protocol=udp --dport=4563  enable-protocol
 }
 
 stop () {
diff --git a/include/linux/openvswitch.h b/include/linux/openvswitch.h
index 3b1dafc..2f4fca3 100644
--- a/include/linux/openvswitch.h
+++ b/include/linux/openvswitch.h
@@ -185,6 +185,7 @@ enum ovs_vport_type {
 	OVS_VPORT_TYPE_PATCH,    /* virtual tunnel connecting two vports */
 	OVS_VPORT_TYPE_GRE,      /* GRE tunnel */
 	OVS_VPORT_TYPE_CAPWAP,   /* CAPWAP tunnel */
+	OVS_VPORT_TYPE_VXLAN,   /* VXLAN tunnel */
 	__OVS_VPORT_TYPE_MAX
 };
 
diff --git a/lib/netdev-vport.c b/lib/netdev-vport.c
index 620c22e..29e2a9f 100644
--- a/lib/netdev-vport.c
+++ b/lib/netdev-vport.c
@@ -165,6 +165,9 @@ netdev_vport_get_netdev_type(const struct dpif_linux_vport *vport)
     case OVS_VPORT_TYPE_CAPWAP:
         return "capwap";
 
+    case OVS_VPORT_TYPE_VXLAN:
+        return "vxlan";
+
     case __OVS_VPORT_TYPE_MAX:
         break;
     }
@@ -956,6 +959,10 @@ netdev_vport_register(void)
           { "capwap", VPORT_FUNCTIONS(netdev_vport_get_status) },
           parse_tunnel_config, unparse_tunnel_config },
 
+        { OVS_VPORT_TYPE_VXLAN,
+          { "vxlan", VPORT_FUNCTIONS(netdev_vport_get_status) },
+          parse_tunnel_config, unparse_tunnel_config },
+
         { OVS_VPORT_TYPE_PATCH,
           { "patch", VPORT_FUNCTIONS(NULL) },
           parse_patch_config, unparse_patch_config }
diff --git a/rhel/etc_init.d_openvswitch b/rhel/etc_init.d_openvswitch
index 5501d18..884950d 100755
--- a/rhel/etc_init.d_openvswitch
+++ b/rhel/etc_init.d_openvswitch
@@ -47,7 +47,9 @@ start () {
     fi
     "$@"
 
+    # Allow tunnel traffic
     $ovs_ctl --protocol=gre enable-protocol
+    $ovs_ctl --protocol=udp --dport=4563 enable-protocol
 
     touch /var/lock/subsys/openvswitch
 }
diff --git a/vswitchd/vswitch.xml b/vswitchd/vswitch.xml
index e2e73cf..c945ec5 100644
--- a/vswitchd/vswitch.xml
+++ b/vswitchd/vswitch.xml
@@ -1109,6 +1109,23 @@
             with the Linux kernel datapath with kernel version 2.6.26 or later.
           </dd>
 
+          <dt><code>vxlan</code></dt>
+          <dd>
+	    <p>
+	      An Ethernet tunnel over the experimental, UDP-based VXLAN
+	      protocol described at
+	      <code>http://tools.ietf.org/html/draft-mahalingam-dutt-dcops-vxlan-00</code>.
+	      VXLAN is currently supported only with the Linux kernel datapath
+	      with kernel version 2.6.26 or later.
+	    </p>
+	    <p>
+	      As an experimental protocol, VXLAN has no officially assigned UDP
+	      port.  Open vSwitch currently uses UDP destination port 4563.
+	      The source port used for VXLAN traffic varies on a per-flow basis
+	      between 32768 and 65535 to allow load balancing.
+	    </p>
+          </dd>
+
           <dt><code>patch</code></dt>
           <dd>
             A pair of virtual devices that act as a patch cable.
@@ -1123,7 +1140,7 @@
     <group title="Tunnel Options">
       <p>
         These options apply to interfaces with <ref column="type"/> of
-        <code>gre</code>, <code>ipsec_gre</code>, and <code>capwap</code>.
+        <code>gre</code>, <code>ipsec_gre</code>, <code>capwap</code>, and <code>vxlan</code>.
       </p>
 
       <p>
@@ -1156,7 +1173,7 @@
             key="in_key"/> at all.
           </li>
           <li>
-            A positive 32-bit (for GRE) or 64-bit (for CAPWAP) number.  The
+            A positive 24-bit (for VXLAN), 32-bit (for GRE) or 64-bit (for CAPWAP) number.  The
             tunnel receives only packets with the specified key.
           </li>
           <li>
@@ -1182,7 +1199,7 @@
             key="out_key"/> at all.
           </li>
           <li>
-            A positive 32-bit (for GRE) or 64-bit (for CAPWAP) number.  Packets
+            A positive 24-bit (for VXLAN), 32-bit (for GRE) or 64-bit (for CAPWAP) number.  Packets
             sent through the tunnel will have the specified key.
           </li>
           <li>
@@ -1241,9 +1258,9 @@
         enabled; set to <code>false</code> to disable.
       </column>
 
-      <group title="Tunnel Options: gre only">
+      <group title="Tunnel Options: gre and vxlan only">
         <p>
-          Only <code>gre</code> interfaces support these options.
+          Only <code>gre</code> and <code>vxlan</code> interfaces support these options.
         </p>
 
         <column name="options" key="header_cache" type='{"type": "boolean"}'>
diff --git a/xenserver/etc_init.d_openvswitch b/xenserver/etc_init.d_openvswitch
index 8ba8aee..7e9f63b 100755
--- a/xenserver/etc_init.d_openvswitch
+++ b/xenserver/etc_init.d_openvswitch
@@ -76,7 +76,9 @@ start () {
             --log-file --pidfile --detach --monitor unix:/var/run/openvswitch/db.sock
     fi
 
+    # Allow tunnel traffic
     $ovs_ctl --protocol=gre enable-protocol
+    $ovs_ctl --protocol=udp --dport=4563 enable-protocol
 
     touch /var/lock/subsys/openvswitch
 }
-- 
1.7.3.4

