Reposting this since the original was blocked by some spam filters. @Jesse, could you help review it from a technical point of view? If this looks good, I can then make it optional and configurable. Thanks a lot!
-Han On Mon, May 12, 2014 at 4:04 PM, Han Zhou <[email protected]> wrote: > This patch implements vxlan-soe: > http://tools.ietf.org/html/draft-zhou-li-vxlan-soe-01 > > Tested VXLAN throughput between two hypervisors, and the performance > gain of vxlan-soe is significant. > netperf TCP_STREAM test result: > > Before the change: 2.62 Gbits/sec > After the change: 6.68 Gbits/sec > Speedup is ~250%. > > Hope this feature is useful for those who rely on VXLAN. > > > Signed-off-by: Han Zhou <[email protected]> > --- > datapath/linux/compat/include/net/vxlan.h | 28 +++++- > datapath/linux/compat/vxlan.c | 153 > ++++++++++++++++++++++++++---- > datapath/vport-vxlan.c | 9 +- > 3 files changed, 165 insertions(+), 25 deletions(-) > > diff --git a/datapath/linux/compat/include/net/vxlan.h > b/datapath/linux/compat/include/net/vxlan.h > index 414a497..7ba5291 100644 > --- a/datapath/linux/compat/include/net/vxlan.h > +++ b/datapath/linux/compat/include/net/vxlan.h > @@ -10,8 +10,32 @@ > #include_next <net/vxlan.h> > #else > > +#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) > + > +#define VXLAN_FLAG_GSO 0x80 /* VXLAN-SOE */ > +#define VXLAN_FLAGS 0x08 /* struct vxlanhdr.vx_flags required value. 
*/ > + > +/* VXLAN protocol header */ > +struct vxlanhdr { > + __u8 vx_flags; > + __u8 vx_mss_hi; > + __be16 vx_protocol; /* VXLAN-GPE */ > + __u8 vx_vni[3]; > + __u8 vx_mss_lo; > +}; > + > +static inline void vxh_set_vni(struct vxlanhdr *vxh, __u32 vni) > +{ > + *((__u32*)&vxh->vx_vni) = htonl(vxh->vx_mss_lo | (vni << 8)); > +} > + > +static inline __u32 vxh_get_vni(struct vxlanhdr *vxh) > +{ > + return ((ntohl(*(__u32*)&vxh->vx_vni) & 0xffffff00) >> 8); > +} > + > struct vxlan_sock; > -typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff > *skb, __be32 key); > +typedef void (vxlan_rcv_t)(struct vxlan_sock *vs, struct sk_buff > *skb, __u32 key); > > /* per UDP socket information */ > struct vxlan_sock { > @@ -32,7 +56,7 @@ void vxlan_sock_release(struct vxlan_sock *vs); > int vxlan_xmit_skb(struct vxlan_sock *vs, > struct rtable *rt, struct sk_buff *skb, > __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, > - __be16 src_port, __be16 dst_port, __be32 vni); > + __be16 src_port, __be16 dst_port, __u32 vni); > > __be16 vxlan_src_port(__u16 port_min, __u16 port_max, struct sk_buff *skb); > > diff --git a/datapath/linux/compat/vxlan.c b/datapath/linux/compat/vxlan.c > index b8b8fa7..80fa233 100644 > --- a/datapath/linux/compat/vxlan.c > +++ b/datapath/linux/compat/vxlan.c > @@ -59,15 +59,102 @@ > #include "gso.h" > #include "vlan.h" > > -#define VXLAN_HLEN (sizeof(struct udphdr) + sizeof(struct vxlanhdr)) > > -#define VXLAN_FLAGS 0x08000000 /* struct vxlanhdr.vx_flags > required value. 
*/ > +static inline int vxlan_parse_inner_hdr(struct sk_buff *skb, u16 > *l3_type, u8 *l4_type, u16 *l4_offset) > +{ > + struct ethhdr *ethh = (struct ethhdr*)skb->data; > + unsigned char *p = (unsigned char *)(ethh + 1); > + u16 l2_hdr_size, l3_hdr_size; > + u16 ethertype; > + u8 l4_proto; > + struct iphdr *iph; > + struct ipv6hdr *ipv6; > + > + > + ethertype = ntohs(ethh->h_proto); > + if (ethertype == ETH_P_8021Q) { > + ethertype = ntohs(*(__be16*)(p + 2)); > + p += 4; > + } > + l2_hdr_size = p - skb->data; > + > + if (ethertype == ETH_P_IP) { > + iph = (struct iphdr *)p; > + l3_hdr_size = iph->ihl << 2; > + l4_proto = iph->protocol; > + > + } else if (ethertype == ETH_P_IPV6) { > + ipv6 = (struct ipv6hdr *)p; > + l3_hdr_size = sizeof(struct ipv6hdr); > + l4_proto = ipv6->nexthdr; > + > + } else { > + return -1; > + } > + if (l4_proto != IPPROTO_TCP && l4_proto != IPPROTO_UDP) { > + return -1; > + } > + > + *l3_type = ethertype; > + *l4_type = l4_proto; > + *l4_offset = l2_hdr_size + l3_hdr_size; > + return 0; > + > +} > + > +static inline int vxlan_handle_soe(struct sk_buff *skb, struct vxlanhdr *vxh) > +{ > + u16 ethertype; > + u8 ipproto; > + u16 csum_offset, l4_offset; > + unsigned short gso_type; > + > + if (unlikely(skb_unclone(skb, GFP_ATOMIC))) { > + return -1; > + } > + > + skb_shinfo(skb)->gso_size = (((__u16)vxh->vx_mss_hi) << 8) + > + vxh->vx_mss_lo; > + skb_shinfo(skb)->gso_segs = 0; > > -/* VXLAN protocol header */ > -struct vxlanhdr { > - __be32 vx_flags; > - __be32 vx_vni; > -}; > + > + if (unlikely(skb_linearize(skb))) > + return -1; > + > + if (unlikely(vxlan_parse_inner_hdr(skb, ðertype, &ipproto, > &l4_offset))) { > + return -1; > + } > + if (ethertype == ETH_P_IP) { > + if (ipproto == IPPROTO_TCP) { > + gso_type = SKB_GSO_TCPV4; > + csum_offset = offsetof(struct tcphdr, check); > + } else if (ipproto == IPPROTO_UDP) { > + gso_type = SKB_GSO_UDP; > + csum_offset = offsetof(struct udphdr, check); > + } else { > + BUG(); > + } > + } else if 
(ethertype == ETH_P_IPV6) { > + if (ipproto == IPPROTO_TCP) { > + gso_type = SKB_GSO_TCPV6; > + csum_offset = offsetof(struct tcphdr, check); > + } else if (ipproto == IPPROTO_UDP) { > + gso_type = SKB_GSO_UDP; > + csum_offset = offsetof(struct udphdr, check); > + } else { > + BUG(); > + } > + } else { > + BUG(); > + } > + > + skb_shinfo(skb)->gso_type = gso_type; > + skb->ip_summed = CHECKSUM_PARTIAL; > + skb->csum_start = skb_headroom(skb) + l4_offset; > + skb->csum_offset = offsetof(struct tcphdr, check); > + > + return 0; > +} > > /* Callback from net/ipv4/udp.c to receive packets */ > static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb) > @@ -81,13 +168,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, > struct sk_buff *skb) > > /* Return packets with reserved bits set */ > vxh = (struct vxlanhdr *)(udp_hdr(skb) + 1); > - if (vxh->vx_flags != htonl(VXLAN_FLAGS) || > +/* if (vxh->vx_flags != htonl(VXLAN_FLAGS) || > (vxh->vx_vni & htonl(0xff))) { > pr_warn("invalid vxlan flags=%#x vni=%#x\n", > ntohl(vxh->vx_flags), ntohl(vxh->vx_vni)); > goto error; > } > - > +*/ > if (iptunnel_pull_header(skb, VXLAN_HLEN, htons(ETH_P_TEB))) > goto drop; > > @@ -95,7 +182,13 @@ static int vxlan_udp_encap_recv(struct sock *sk, > struct sk_buff *skb) > if (!vs) > goto drop; > > - vs->rcv(vs, skb, vxh->vx_vni); > + if (vxh->vx_flags & VXLAN_FLAG_GSO) { > + if (unlikely(vxlan_handle_soe(skb, vxh))) > + goto drop; > + } > + > + vs->rcv(vs, skb, vxh_get_vni(vxh)); > + > return 0; > > drop: > @@ -153,10 +246,10 @@ static void vxlan_gso(struct sk_buff *skb) > struct iphdr *iph = ip_hdr(skb); > > uh->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, > - skb->len - udp_offset, > - IPPROTO_UDP, 0); > + skb->len - udp_offset, > + IPPROTO_UDP, 0); > uh->check = csum_fold(skb_checksum(skb, udp_offset, > - skb->len - udp_offset, 0)); > + skb->len - udp_offset, 0)); > > if (uh->check == 0) > uh->check = CSUM_MANGLED_0; > @@ -165,10 +258,31 @@ static void vxlan_gso(struct 
sk_buff *skb) > skb->ip_summed = CHECKSUM_NONE; > } > > -static int handle_offloads(struct sk_buff *skb) > + > +static int handle_offloads(struct sk_buff *skb, struct vxlanhdr* vxh) > { > + int err; > if (skb_is_gso(skb)) { > - OVS_GSO_CB(skb)->fix_segment = vxlan_gso; > + /* offload with vxlan-soe if encapsulated packet > + fits in MAX IP packet size, otherwise fallback to > + local GSO */ > + if (skb->len + sizeof(struct iphdr) > 65535) { > + OVS_GSO_CB(skb)->fix_segment = vxlan_gso; > + } else { > + > + vxh->vx_flags |= VXLAN_FLAG_GSO; > + vxh->vx_mss_hi = (__u8)(skb_shinfo(skb)->gso_size >> 8); > + vxh->vx_mss_lo = (__u8)skb_shinfo(skb)->gso_size; > + > + err = skb_unclone(skb, GFP_ATOMIC); > + if (unlikely(err)) > + return err; > + > + skb_shinfo(skb)->gso_type = 0; > + skb_shinfo(skb)->gso_size = 0; > + skb_shinfo(skb)->gso_segs = 0; > + } > + > } else { > if (skb->ip_summed != CHECKSUM_PARTIAL) > skb->ip_summed = CHECKSUM_NONE; > @@ -179,7 +293,7 @@ static int handle_offloads(struct sk_buff *skb) > int vxlan_xmit_skb(struct vxlan_sock *vs, > struct rtable *rt, struct sk_buff *skb, > __be32 src, __be32 dst, __u8 tos, __u8 ttl, __be16 df, > - __be16 src_port, __be16 dst_port, __be32 vni) > + __be16 src_port, __be16 dst_port, __u32 vni) > { > struct vxlanhdr *vxh; > struct udphdr *uh; > @@ -207,8 +321,9 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, > skb_reset_inner_headers(skb); > > vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh)); > - vxh->vx_flags = htonl(VXLAN_FLAGS); > - vxh->vx_vni = vni; > + memset(vxh, 0, sizeof(*vxh)); > + vxh->vx_flags = VXLAN_FLAGS; > + vxh_set_vni(vxh, vni); > > __skb_push(skb, sizeof(*uh)); > skb_reset_transport_header(skb); > @@ -222,7 +337,7 @@ int vxlan_xmit_skb(struct vxlan_sock *vs, > > vxlan_set_owner(vs->sock->sk, skb); > > - err = handle_offloads(skb); > + err = handle_offloads(skb, vxh); > if (err) > return err; > > diff --git a/datapath/vport-vxlan.c b/datapath/vport-vxlan.c > index cc9477d..ef0cd06 100644 > --- 
a/datapath/vport-vxlan.c > +++ b/datapath/vport-vxlan.c > @@ -58,16 +58,17 @@ static inline struct vxlan_port *vxlan_vport(const > struct vport *vport) > } > > /* Called with rcu_read_lock and BH disabled. */ > -static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, > __be32 vx_vni) > +static void vxlan_rcv(struct vxlan_sock *vs, struct sk_buff *skb, __u32 > vx_vni) > { > struct ovs_key_ipv4_tunnel tun_key; > struct vport *vport = vs->data; > struct iphdr *iph; > __be64 key; > - > + > + > /* Save outer tunnel values */ > iph = ip_hdr(skb); > - key = cpu_to_be64(ntohl(vx_vni) >> 8); > + key = cpu_to_be64(vx_vni); > ovs_flow_tun_key_init(&tun_key, iph, key, TUNNEL_KEY); > > ovs_vport_receive(vport, skb, &tun_key); > @@ -181,7 +182,7 @@ static int vxlan_tnl_send(struct vport *vport, > struct sk_buff *skb) > OVS_CB(skb)->tun_key->ipv4_tos, > OVS_CB(skb)->tun_key->ipv4_ttl, df, > src_port, dst_port, > - htonl(be64_to_cpu(OVS_CB(skb)->tun_key->tun_id) << 8)); > + (__u32)be64_to_cpu(OVS_CB(skb)->tun_key->tun_id)); > if (err < 0) > ip_rt_put(rt); > error: > -- > 1.9.0 _______________________________________________ dev mailing list [email protected] http://openvswitch.org/mailman/listinfo/dev
