This patch implements vxlan-soe: http://tools.ietf.org/html/draft-zhou-li-vxlan-soe-01
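In short, instead of doing software GSO on the sending hypervisor, the sender stores the inner MSS in two otherwise-reserved VXLAN header bytes (vx_mss_hi/vx_mss_lo in the patch below) and sets a GSO flag, so the receiving side can rebuild the skb's GSO state and segment later. A minimal user-space sketch of just that MSS split (the uint*_t stand-ins and the 1448-byte MSS are my assumptions, not from the patch):

	#include <assert.h>
	#include <stdint.h>

	int main(void)
	{
		/* Sender side: split the 16-bit inner MSS across two reserved
		 * header bytes, as handle_offloads() does in the patch. */
		uint16_t gso_size = 1448;                      /* assumed inner MSS */
		uint8_t  vx_mss_hi = (uint8_t)(gso_size >> 8); /* 0x05 */
		uint8_t  vx_mss_lo = (uint8_t)gso_size;        /* 0xa8 */

		/* Receiver side: reassemble, as vxlan_handle_soe() does. */
		uint16_t mss = ((uint16_t)vx_mss_hi << 8) + vx_mss_lo;
		assert(mss == 1448);
		return 0;
	}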
Tested VXLAN throughput between two hypervisors; the performance gain from
vxlan-soe is significant. netperf TCP_STREAM results:

  Before the change: 2.62 Gbits/sec
  After the change:  6.68 Gbits/sec

That is roughly a 2.5x improvement (6.68 / 2.62 ≈ 2.55). Hope this feature is
useful for those who rely on VXLAN. Let me know your thoughts; any comments
are welcome!

Signed-off-by: Han Zhou <zhou...@gmail.com>
---
 datapath/linux/compat/include/net/vxlan.h |  28 +++++-
 datapath/linux/compat/vxlan.c             | 153 ++++++++++++++++++++++++++----
 datapath/vport-vxlan.c                    |   9 +-
 3 files changed, 165 insertions(+), 25 deletions(-)

diff --git a/datapath/linux/compat/include/net/vxlan.h b/datapath/linux/compat/include/net/vxlan.h
index 414a497..7ba5291 100644
--- a/datapath/linux/compat/include/net/vxlan.h
+++ b/datapath/linux/compat/include/net/vxlan.h
@@ -10,8 +10,32 @@
 #include_next <net/vxlan.h>
 #else
 
+#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
+
+#define VXLAN_FLAG_GSO 0x80 /* VXLAN-SOE */
+#define VXLAN_FLAGS 0x08 /* struct vxlanhdr.vx_flags required value. */
+
+/* VXLAN protocol header */
+struct vxlanhdr {
+	__u8 vx_flags;
+	__u8 vx_mss_hi;
+	__be16 vx_protocol; /* VXLAN-GPE */
+	__u8 vx_vni[3];
+	__u8 vx_mss_lo;
+};
+
+static inline void vxh_set_vni(struct vxlanhdr *vxh, __u32 vni)
+{
+	*((__u32 *)&vxh->vx_vni) = htonl(vxh->vx_mss_lo | (vni << 8));
+}
+
+static inline __u32 vxh_get_vni(struct vxlanhdr *vxh)
+{
+	return (ntohl(*(__u32 *)&vxh->vx_vni) & 0xffffff00) >> 8;
+}
+
 struct vxlan_sock;
-typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff *skb, __be32 key);
+typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff *skb, __u32 key);
 
 /* per UDP socket information */
 struct vxlan_sock {
@@ -32,7 +56,7 @@ void vxlan_sock_release(struct vxlan_sock *vs);
 int vxlan_xmit_skb(struct vxlan_sock *vs,
 		   struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
-		   __be16 src_port, __be16 dst_port, __be32 vni);
+		   __be16 src_port, __be16 dst_port, __u32 vni);
 
 __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb);
 
diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c
index b8b8fa7..80fa233 100644
--- a/datapath/linux/compat/vxlan.c
+++ b/datapath/linux/compat/vxlan.c
@@ -59,15 +59,102 @@
 #include "gso.h"
 #include "vlan.h"
 
-#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr))
-#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags required value. */
+static inline int vxlan_parse_inner_hdr(struct sk_buff *skb, u16 *l3_type, u8 *l4_type, u16 *l4_offset)
+{
+	struct ethhdr *ethh = (struct ethhdr *)skb->data;
+	unsigned char *p = (unsigned char *)(ethh + 1);
+	u16 l2_hdr_size, l3_hdr_size;
+	u16 ethertype;
+	u8 l4_proto;
+	struct iphdr *iph;
+	struct ipv6hdr *ipv6;
+
+	ethertype = ntohs(ethh->h_proto);
+	if (ethertype == ETH_P_8021Q) {
+		ethertype = ntohs(*(__be16 *)(p + 2));
+		p += 4;
+	}
+	l2_hdr_size = p - skb->data;
+
+	if (ethertype == ETH_P_IP) {
+		iph = (struct iphdr *)p;
+		l3_hdr_size = iph->ihl << 2;
+		l4_proto = iph->protocol;
+	} else if (ethertype == ETH_P_IPV6) {
+		ipv6 = (struct ipv6hdr *)p;
+		l3_hdr_size = sizeof(struct ipv6hdr);
+		l4_proto = ipv6->nexthdr;
+	} else {
+		return -1;
+	}
+	if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) {
+		return -1;
+	}
+
+	*l3_type = ethertype;
+	*l4_type = l4_proto;
+	*l4_offset = l2_hdr_size + l3_hdr_size;
+	return 0;
+}
+
+static inline int vxlan_handle_soe(struct sk_buff *skb, struct vxlanhdr *vxh)
+{
+	u16 ethertype;
+	u8 ipproto;
+	u16 csum_offset, l4_offset;
+	unsigned short gso_type;
+
+	if (unlikely(skb_unclone(skb, GFP_ATOMIC))) {
+		return -1;
+	}
+
+	skb_shinfo(skb)->gso_size = (((__u16)vxh->vx_mss_hi) << 8) +
+				    vxh->vx_mss_lo;
+	skb_shinfo(skb)->gso_segs = 0;
 
-/* VXLAN protocol header */
-struct vxlanhdr {
-	__be32 vx_flags;
-	__be32 vx_vni;
-};
+
+	if (unlikely(skb_linearize(skb)))
+		return -1;
+
+	if (unlikely(vxlan_parse_inner_hdr(skb, &ethertype, &ipproto, &l4_offset))) {
+		return -1;
+	}
+	if (ethertype == ETH_P_IP) {
+		if (ipproto == IPPROTO_TCP) {
+			gso_type = SKB_GSO_TCPV4;
+			csum_offset = offsetof(struct tcphdr, check);
+		} else if (ipproto == IPPROTO_UDP) {
+			gso_type = SKB_GSO_UDP;
+			csum_offset = offsetof(struct udphdr, check);
+		} else {
+			BUG();
+		}
+	} else if (ethertype == ETH_P_IPV6) {
+		if (ipproto == IPPROTO_TCP) {
+			gso_type = SKB_GSO_TCPV6;
+			csum_offset = offsetof(struct tcphdr, check);
+		} else if (ipproto == IPPROTO_UDP) {
+			gso_type = SKB_GSO_UDP;
+			csum_offset = offsetof(struct udphdr, check);
+		} else {
+			BUG();
+		}
+	} else {
+		BUG();
+	}
+
+	skb_shinfo(skb)->gso_type = gso_type;
+	skb->ip_summed = CHECKSUM_PARTIAL;
+	skb->csum_start = skb_headroom(skb) + l4_offset;
+	skb->csum_offset = csum_offset;
+
+	return 0;
+}
 
 /* Callback from net/ipv4/udp.c to receive packets */
 static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
@@ -81,13 +168,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 
 	/* Return packets with reserved bits set */
 	vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1);
-	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
+/*	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
 	    (vxh->vx_vni & htonl(0xff))) {
 		pr_warn("invalid vxlan flags=%#x vni=%#x\n",
 			ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
 		goto error;
 	}
-
+*/
 	if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB)))
 		goto drop;
 
@@ -95,7 +182,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
 	if (!vs)
 		goto drop;
 
-	vs->rcv(vs, skb, vxh->vx_vni);
+	if (vxh->vx_flags & VXLAN_FLAG_GSO) {
+		if (unlikely(vxlan_handle_soe(skb, vxh)))
+			goto drop;
+	}
+
+	vs->rcv(vs, skb, vxh_get_vni(vxh));
+
 	return 0;
 
 drop:
@@ -153,10 +246,10 @@ static void vxlan_gso(struct sk_buff *skb)
 		struct iphdr *iph = ip_hdr(skb);
 
 		uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
-				skb->len - udp_offset,
-				IPPROTO_UDP, 0);
+					       skb->len - udp_offset,
+					       IPPROTO_UDP, 0);
 		uh->check = csum_fold(skb_checksum(skb, udp_offset,
-				skb->len - udp_offset, 0));
+						   skb->len - udp_offset, 0));
 
 		if (uh->check == 0)
 			uh->check = CSUM_MANGLED_0;
@@ -165,10 +258,31 @@ static void vxlan_gso(struct sk_buff *skb)
 	skb->ip_summed = CHECKSUM_NONE;
 }
 
-static int handle_offloads(struct sk_buff *skb)
+
+static int handle_offloads(struct sk_buff *skb, struct vxlanhdr *vxh)
 {
+	int err;
 	if (skb_is_gso(skb)) {
-		OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
+		/* Offload with vxlan-soe if the encapsulated packet
+		 * fits in the max IP packet size; otherwise fall back
+		 * to local GSO. */
+		if (skb->len + sizeof(struct iphdr) > 65535) {
+			OVS_GSO_CB(skb)->fix_segment = vxlan_gso;
+		} else {
+			vxh->vx_flags |= VXLAN_FLAG_GSO;
+			vxh->vx_mss_hi = (__u8)(skb_shinfo(skb)->gso_size >> 8);
+			vxh->vx_mss_lo = (__u8)skb_shinfo(skb)->gso_size;
+
+			err = skb_unclone(skb, GFP_ATOMIC);
+			if (unlikely(err))
+				return err;
+
+			skb_shinfo(skb)->gso_type = 0;
+			skb_shinfo(skb)->gso_size = 0;
+			skb_shinfo(skb)->gso_segs = 0;
+		}
+
 	} else {
 		if (skb->ip_summed != CHECKSUM_PARTIAL)
 			skb->ip_summed = CHECKSUM_NONE;
@@ -179,7 +293,7 @@ static int handle_offloads(struct sk_buff *skb)
 int vxlan_xmit_skb(struct vxlan_sock *vs,
 		   struct rtable *rt, struct sk_buff *skb,
 		   __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df,
-		   __be16 src_port, __be16 dst_port, __be32 vni)
+		   __be16 src_port, __be16 dst_port, __u32 vni)
 {
 	struct vxlanhdr *vxh;
 	struct udphdr *uh;
@@ -207,8 +321,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 	skb_reset_inner_headers(skb);
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
-	vxh->vx_flags = htonl(VXLAN_FLAGS);
-	vxh->vx_vni = vni;
+	memset(vxh, 0, sizeof(*vxh));
+	vxh->vx_flags = VXLAN_FLAGS;
+	vxh_set_vni(vxh, vni);
 
 	__skb_push(skb, sizeof(*uh));
 	skb_reset_transport_header(skb);
@@ -222,7 +337,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs,
 
 	vxlan_set_owner(vs->sock->sk, skb);
 
-	err = handle_offloads(skb);
+	err = handle_offloads(skb, vxh);
 	if (err)
 		return err;
 
diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c
index cc9477d..ef0cd06 100644
--- a/datapath/vport-vxlan.c
+++ b/datapath/vport-vxlan.c
@@ -58,16 +58,17 @@ static inline struct vxlan_port *vxlan_vport(const struct vport *vport)
 }
 
 /* Called with rcu_read_lock and BH disabled. */
-static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __be32 vx_vni)
+static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __u32 vx_vni)
 {
 	struct ovs_key_ipv4_tunnel tun_key;
 	struct vport *vport = vs->data;
 	struct iphdr *iph;
 	__be64 key;
-
+
+
 	/* Save outer tunnel values */
 	iph = ip_hdr(skb);
-	key = cpu_to_be64(ntohl(vx_vni) >> 8);
+	key = cpu_to_be64(vx_vni);
 	ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY);
 
 	ovs_vport_receive(vport, skb, &tun_key);
@@ -181,7 +182,7 @@ static int vxlan_tnl_send(struct vport *vport, struct sk_buff *skb)
 			     OVS_CB(skb)->tun_key->ipv4_tos,
 			     OVS_CB(skb)->tun_key->ipv4_ttl, df,
 			     src_port, dst_port,
-			     htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8));
+			     (__u32)be64_to_cpu(OVS_CB(skb)->tun_key->tun_id));
 	if (err < 0)
 		ip_rt_put(rt);
 error:
--
1.9.0
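P.S. The VNI helpers above overlay a single 32-bit load/store on vx_vni[3] plus vx_mss_lo, which is easy to get wrong. Here is a small stand-alone check of that byte layout (a user-space sketch with my own uint*_t stand-ins for the kernel types; not part of the patch):

	#include <arpa/inet.h>
	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Same layout as the patched struct vxlanhdr, kernel types swapped
	 * for user-space equivalents. */
	struct vxlanhdr {
		uint8_t  vx_flags;
		uint8_t  vx_mss_hi;
		uint16_t vx_protocol;
		uint8_t  vx_vni[3];
		uint8_t  vx_mss_lo;
	};

	static void vxh_set_vni(struct vxlanhdr *vxh, uint32_t vni)
	{
		/* One 32-bit store covers vx_vni[0..2] and vx_mss_lo; the OR
		 * keeps whatever already sits in the low (mss_lo) byte. */
		*(uint32_t *)&vxh->vx_vni = htonl(vxh->vx_mss_lo | (vni << 8));
	}

	static uint32_t vxh_get_vni(struct vxlanhdr *vxh)
	{
		return (ntohl(*(uint32_t *)&vxh->vx_vni) & 0xffffff00) >> 8;
	}

	int main(void)
	{
		struct vxlanhdr vxh = { .vx_mss_lo = 0x34 };

		vxh_set_vni(&vxh, 0xabcdef);
		/* The VNI lands in the three bytes after vx_protocol, in
		 * network byte order, and the MSS low byte is preserved. */
		assert(vxh.vx_vni[0] == 0xab && vxh.vx_vni[1] == 0xcd &&
		       vxh.vx_vni[2] == 0xef);
		assert(vxh.vx_mss_lo == 0x34);
		assert(vxh_get_vni(&vxh) == 0xabcdef);
		printf("vxlanhdr layout ok\n");
		return 0;
	}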