On Thu, Aug 30, 2012 at 6:12 AM, John Basila <jbas...@checkpoint.com> wrote: > When running multiple instances of QEMU from the same image file > (using -snapshot) and connecting each instance to a dedicated TAP > device, the Guest OS will most likely not be able to communicate > with the outside world as all packets leave the Guest OS from the > same IP and thus the Host OS will have difficulty returning the > packets to the correct TAP device/Guest OS. Stateless Static > Network Address Translation or SSNAT allows the QEMU to map the > network of the Guest OS to the network of the TAP device allowing > a unique IP address for each Guest OS that ease such case. > The only mandatory argument to the SSNAT is the Guest OS network > IP, the rest will be figured out from the underlying TAP device. > > Signed-off-by: John Basila <jbas...@checkpoint.com> > --- > net/tap.c | 369 > +++++++++++++++++++++++++++++++++++++++++++++++++++++- > qapi-schema.json | 5 +- > qemu-options.hx | 10 ++- > 3 files changed, 381 insertions(+), 3 deletions(-) > > diff --git a/net/tap.c b/net/tap.c > index 1971525..2408a49 100644 > --- a/net/tap.c > +++ b/net/tap.c > @@ -39,16 +39,88 @@ > #include "qemu-char.h" > #include "qemu-common.h" > #include "qemu-error.h" > +#include "qemu_socket.h" > > #include "net/tap-linux.h" > > #include "hw/vhost_net.h" > > +#include "checksum.h" > + > +#define ETH_P_ARP 0x0806 /* Address Resolution packet */ > +#define ETH_P_IP 0x0800 /* Internet Protocol packet */ > +#define ETH_P_IPV6 0x86DD /* IPv6 over blueblook */
Probably not used. Would it be hard to add NAT for IPv6 too? > + > +#define ETH_ALEN 6 > + > +struct ethhdr { > + unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ > + unsigned char h_source[ETH_ALEN]; /* source ether addr */ > + unsigned short h_proto; /* packet type ID field */ > +}; The definitions probably conflict with system headers. Can you use those or slirp headers instead? > + > +#define IP_PROTO_TCP 6 > +#define IP_PROTO_UDP 17 > +#define IPV4_ADRESS_LENGTH 4 > + > +struct arphdr { > + unsigned short ar_hrd; /* format of hardware address */ > + unsigned short ar_pro; /* format of protocol address */ > + unsigned char ar_hln; /* length of hardware address */ > + unsigned char ar_pln; /* length of protocol address */ > + unsigned short ar_op; /* ARP opcode (command) */ > + > + /* > + * Ethernet looks like this : This bit is variable sized however... > + */ > + unsigned char ar_sha[ETH_ALEN]; /* sender hardware > address */ > + unsigned char ar_sip[IPV4_ADRESS_LENGTH]; /* sender IP address */ > + unsigned char ar_tha[ETH_ALEN]; /* target hardware > address */ > + unsigned char ar_tip[IPV4_ADRESS_LENGTH]; /* target IP address */ > +}; > + > +#define IP_HEADER_LENGTH(ip) (((ip->ip_hlv)&0xf) << 2) Please add spaces around operators, like '&' here. I think checkpatch.pl should catch this. Macro arguments ('ip' here) should be surrounded by parentheses when used. > + > +/** An IPv4 packet header */ > +struct iphdr { > + uint8_t ip_hlv; /**< Header length and version of the header > */ Remove '*<' > + uint8_t ip_tos; /**< Type of Service > */ > + uint16_t ip_len; /**< Length in octets, inlc. this header and > data */ incl > + uint16_t ip_id; /**< ID is used to aid in assembling framents > */ fragments > + uint16_t ip_off; /**< Info about fragmentation (control, offset) > */ > + uint8_t ip_ttl; /**< Time to Live > */ > + uint8_t ip_p; /**< Next level protocol type > */ > + uint16_t ip_sum; /**< Header checksum > */ > + uint32_t ip_src; /**< Source IP address > */ > + uint32_t ip_dst; /**< Destination IP address > */ > +}; > + > +/** UDP packet header */ > +typedef struct udphdr { > + uint16_t uh_sport; /* source port */ > + uint16_t uh_dport; /* destination port */ > + uint16_t uh_ulen; /* udp length */ > + uint16_t uh_chksum;/* udp checksum */ > +} udp_header; > + > + > /* Maximum GSO packet size (64k) plus plenty of room for > * the ethernet and virtio_net headers > */ > #define TAP_BUFSIZE (4096 + 65536) > > +typedef struct SSNATInfo { > + unsigned int ssnat_active : 1; bool > + > + struct in_addr ssnat_ifaddr; > + struct in_addr ssnat_ifmask; > + uint8_t ssnat_hwaddr[ETH_ALEN]; > + > + struct in_addr ssnat_guest; > + struct in_addr ssnat_host; > + struct in_addr ssnat_mask; > +} SSNATInfo; > + > typedef struct TAPState { > NetClientState nc; > int fd; > @@ -59,6 +131,9 @@ typedef struct TAPState { > unsigned int write_poll : 1; > unsigned int using_vnet_hdr : 1; > unsigned int has_ufo: 1; > + > + SSNATInfo ssnat_info; This only accommodates a single rule, but it would be nice to support several. > + > VHostNetState *vhost_net; > unsigned host_vnet_hdr_len; > } TAPState; > @@ -154,11 +229,154 @@ static ssize_t tap_receive_raw(NetClientState *nc, > const uint8_t *buf, size_t si > return tap_write_packet(s, iov, iovcnt); > } > > +#define SSNAT_MAP_IP(_orig, _to, _mask) ( (_orig.s_addr & ~_mask.s_addr) | > (_to.s_addr & _mask.s_addr) ) > +#define SSNAT_IS_MATCH(_orig, _from, _mask) ( (_orig.s_addr & > _mask.s_addr) == (_from.s_addr & _mask.s_addr) ) > + > +static void tap_ssnat_translate_arp(uint8_t* buf, size_t size, const struct > in_addr from, const struct in_addr to, const struct in_addr mask) > +{ > + size_t packetSize = size; Please don't use Hungarian notation. Only struct and typedef names should use CamelCase. The default indent is 4 and maximum line length 80 chars. Tabs may not be used. Maybe you have not read CODING_STYLE? > + uint8_t* pPacket = buf; > + > + if(packetSize >= sizeof(struct arphdr)) Space after 'if', please. > + { Braces must cuddle with the line with 'if', 'else' etc. > + struct arphdr* pArpHeader = (struct arphdr*)pPacket; > + struct in_addr sourceAddress; > + struct in_addr targetAddress; > + > + memcpy(&sourceAddress, pArpHeader->ar_sip, > sizeof(sourceAddress)); > + if( SSNAT_IS_MATCH(sourceAddress, from, mask) ) > + { > + sourceAddress.s_addr = SSNAT_MAP_IP(sourceAddress, > to, mask); > + memcpy(pArpHeader->ar_sip, &sourceAddress, > sizeof(sourceAddress)); > + } > + > + memcpy(&targetAddress, pArpHeader->ar_tip, > sizeof(targetAddress)); > + if( SSNAT_IS_MATCH(targetAddress, from, mask) ) > + { > + targetAddress.s_addr = SSNAT_MAP_IP(targetAddress, > to, mask); > + memcpy(pArpHeader->ar_tip, &targetAddress, > sizeof(targetAddress)); > + } > + } > +} > + > +static void tap_ssnat_adjust_ip_checksums(uint8_t* pBuffer, const size_t > size) > +{ > + size_t packetSize = size; > + uint8_t* pPacket = pBuffer; > + > + if(packetSize >= sizeof(struct iphdr)) > + { > + struct iphdr* pIpHeader = (struct iphdr*)pPacket; > + uint16_t* pCheckSumField = NULL; > + uint32_t uiCheckSum = 0; > + > + pIpHeader->ip_sum = 0; > + if(IP_HEADER_LENGTH(pIpHeader) <= packetSize) > + { > + /*pIpHeader->ip_sum = > tap_ssnat_calculate_checksum(pIpHeader, IP_HEADER_LENGTH(pIpHeader));*/ Why commented out code? Remove. > + > + uiCheckSum = > net_checksum_add(IP_HEADER_LENGTH(pIpHeader), (uint8_t*)pIpHeader); > + pIpHeader->ip_sum = > htons(net_checksum_finish(uiCheckSum)); > + } > + > + switch(pIpHeader->ip_p) > + { > + case IP_PROTO_TCP: Case labels should align with switch statement, the code block is indented. > + { > + struct tcphdr* pTcpHeader = (struct > tcphdr*)(pPacket + IP_HEADER_LENGTH(pIpHeader)); > + pCheckSumField = &pTcpHeader->check; > + break; > + } > + > + case IP_PROTO_UDP: > + { > + struct udphdr* pUdpHeader = (struct > udphdr*)(pPacket + IP_HEADER_LENGTH(pIpHeader)); > + pCheckSumField = &pUdpHeader->uh_chksum; > + break; > + } > + > + default: > + return; > + } > + > + *pCheckSumField = 0; > + uiCheckSum = net_checksum_add(ntohs(pIpHeader->ip_len) - > IP_HEADER_LENGTH(pIpHeader), pPacket + IP_HEADER_LENGTH(pIpHeader)); > + uiCheckSum += net_checksum_add(sizeof(pIpHeader->ip_src) + > sizeof(pIpHeader->ip_dst), (uint8_t*)&pIpHeader->ip_src); > + uiCheckSum += pIpHeader->ip_p + ntohs(pIpHeader->ip_len) - > IP_HEADER_LENGTH(pIpHeader); > + *pCheckSumField = htons(net_checksum_finish(uiCheckSum)); > + } > +} > + > +static void tap_ssnat_translate_ip(uint8_t* buf, size_t size, const struct > in_addr from, const struct in_addr to, const struct in_addr mask) > +{ > + size_t packetSize = size; > + uint8_t* pPacket = buf; > + > + if(packetSize >= sizeof(struct iphdr)) > + { > + struct iphdr* pIpHeader = (struct iphdr*)pPacket; > + struct in_addr sourceAddress; > + struct in_addr targetAddress; > + int iCalculateCheckSum = 0; 'bool' > + > + sourceAddress.s_addr = pIpHeader->ip_src; > + targetAddress.s_addr = pIpHeader->ip_dst; > + if( SSNAT_IS_MATCH(sourceAddress, from, mask) ) > + { > + pIpHeader->ip_src = SSNAT_MAP_IP(sourceAddress, to, > mask); > + iCalculateCheckSum = 1; 'true' > + } > + > + if( SSNAT_IS_MATCH(targetAddress, from, mask) ) > + { > + pIpHeader->ip_dst = SSNAT_MAP_IP(targetAddress, to, > mask); > + iCalculateCheckSum = 1; > + } > + > + if(iCalculateCheckSum) > + { > + tap_ssnat_adjust_ip_checksums(pPacket, packetSize); > + } > + } > +} > + > +static void tap_ssnat_translate(uint8_t* buf, size_t size, const struct > in_addr from, const struct in_addr to, const struct in_addr mask) > +{ > + size_t packetSize = size; > + uint8_t* pPacket = buf; > + > + if(packetSize >= sizeof(struct ethhdr)) > + { > + struct ethhdr *pEthernetHeader = (struct ethhdr*)pPacket; > + pPacket += sizeof(struct ethhdr); > + packetSize -= sizeof(struct ethhdr); > + switch(htons(pEthernetHeader->h_proto)) > + { > + case ETH_P_ARP: > + tap_ssnat_translate_arp(pPacket, > packetSize, from, to, mask); > + break; The whole code block should use the same indent (except for case label). > + > + case ETH_P_IP: > + tap_ssnat_translate_ip(pPacket, > packetSize, from, to, mask); > + break; > + } > + } > +} > + > +static void tap_ssnat_reveive(TAPState *s, uint8_t* buf, size_t size) 'receive', but I'd just put the check to tap_receive() and call tap_ssnat_translate() from there. > +{ > + if(s->ssnat_info.ssnat_active) > + { > + tap_ssnat_translate(buf, size, s->ssnat_info.ssnat_guest, > s->ssnat_info.ssnat_host,s->ssnat_info.ssnat_mask); > + } > +} > + > static ssize_t tap_receive(NetClientState *nc, const uint8_t *buf, size_t > size) > { > TAPState *s = DO_UPCAST(TAPState, nc, nc); > struct iovec iov[1]; > > + tap_ssnat_reveive(s, (uint8_t*)buf, size); > if (s->host_vnet_hdr_len && !s->using_vnet_hdr) { > return tap_receive_raw(nc, buf, size); > } > @@ -189,6 +407,14 @@ static void tap_send_completed(NetClientState *nc, > ssize_t len) > tap_read_poll(s, 1); > } > > +static void tap_ssnat_send(TAPState *s, uint8_t *buf, size_t size) > +{ > + if(s->ssnat_info.ssnat_active) > + { > + tap_ssnat_translate(buf, size, s->ssnat_info.ssnat_host, > s->ssnat_info.ssnat_guest, s->ssnat_info.ssnat_mask); > + } > +} > + > static void tap_send(void *opaque) > { > TAPState *s = opaque; > @@ -207,6 +433,8 @@ static void tap_send(void *opaque) > size -= s->host_vnet_hdr_len; > } > > + tap_ssnat_send(s, buf, size); > + > size = qemu_send_packet_async(&s->nc, buf, size, tap_send_completed); > if (size == 0) { > tap_read_poll(s, 0); > @@ -586,6 +814,145 @@ static int net_tap_init(const NetdevTapOptions *tap, > int *vnet_hdr, > return fd; > } > > +static int get_str_sep(char *buf, int buf_size, const char **pp, int sep) Probably other net parsing functions, strtok() or GLib string functions can be reused instead. > +{ > + const char *p, *p1; Odd indentation. > + int len; > + p = *pp; > + p1 = strchr(p, sep); > + if (!p1) Missing braces. > + return -1; > + len = p1 - p; > + p1++; > + if (buf_size > 0) { > + if (len > buf_size - 1) Ditto. > + len = buf_size - 1; > + memcpy(buf, p, len); > + buf[len] = '\0'; > + } > + *pp = p1; > + return 0; > +} > + > +static int ssnat_net_tap_set_ifinfo(const char* ifname, TAPState* s) Return bool? 'int' could be useful for errno passing. > +{ > + int iReturnValue = -1; > + struct ifreq ifr; > + int fd = -1; > + > + fd = socket(AF_INET, SOCK_DGRAM, 0); > + if(fd > 0) >= > + { > + strncpy(ifr.ifr_name, ifname, IFNAMSIZ-1); Missing NUL termination, please use pstrcpy(). > + > + if(ioctl(fd, SIOCGIFHWADDR, &ifr) == 0) > + { > + memcpy(s->ssnat_info.ssnat_hwaddr, > (void*)ifr.ifr_hwaddr.sa_data, sizeof(s->ssnat_info.ssnat_hwaddr)); > + > + if(ioctl(fd, SIOCGIFADDR, &ifr) == 0) > + { > + memcpy(&s->ssnat_info.ssnat_ifaddr, > &(((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr), > sizeof(s->ssnat_info.ssnat_ifaddr)); > + > + if(ioctl(fd, SIOCGIFNETMASK, &ifr) == 0) > + { > + memcpy(&s->ssnat_info.ssnat_ifmask, > &(((struct sockaddr_in *)&ifr.ifr_addr)->sin_addr), > sizeof(s->ssnat_info.ssnat_ifmask)); > + iReturnValue = 0; > + } > + } > + else 'else' must cuddle with the previous '}', missing braces for the block below. > + error_report("failed to fetch the ip address > of interface '%s'", ifname); > + } > + else > + error_report("failed to fetch the hardware address of > interface '%s'", ifname); > + > + close(fd); > + } > + > + return iReturnValue; > +} > + > +static int ssnat_net_tap_init(TAPState* s, const NetdevTapOptions *tap) > +{ > + int returnValue = -1; > + > + if(tap->has_ssnat) > + { > + const char *ifname = NULL; > + ifname = tap->ifname; > + if(ifname) > + { > + const char* ssnat_value = tap->ssnat; > + if(ssnat_net_tap_set_ifinfo(ifname, s) >= 0) > + { > + char ssnat_str[1024] = { 0 }; > + const char* p = ssnat_value; > + > + pstrcpy(ssnat_str, sizeof(ssnat_str), > ssnat_value); > + if(get_str_sep(ssnat_str, sizeof(ssnat_str), > &p, ':') >= 0) > + { > + if(ssnat_str[0]) > + { > + if(inet_aton(ssnat_str, > &s->ssnat_info.ssnat_guest)) Please use getaddrinfo() instead. > + { > + > if(get_str_sep(ssnat_str, sizeof(ssnat_str), &p, ':') >= 0) > + { > + > if(ssnat_str[0]) > + { > + > if(inet_aton(ssnat_str, &s->ssnat_info.ssnat_mask)) It would be nice to support /24 style masks too. > + { > + > if(p[0]) > + > { > + > if(inet_aton(p, &s->ssnat_info.ssnat_host)) > + > { Pretty deep nesting. Please combine some expressions with '&&'. > + > returnValue = 0; > + > } > + > else > + > error_report("invalid stateless static nat rule '%s', > invalid host-side-ip", ssnat_value); > + > } > + > else > + > error_report("invalid stateless static nat rule '%s', empty > host-side-ip", ssnat_value); > + } > + else > + > error_report("invalid stateless static nat rule '%s', invalid netmask", > ssnat_value); > + } > + else > + > error_report("invalid stateless static nat rule '%s', empty netmask length", > ssnat_value); > + } > + else > + > error_report("invalid stateless static nat rule '%s', incomplete rule", > ssnat_value); > + } > + else > + error_report("invalid > stateless static nat rule '%s', invalid guest-side-ip", ssnat_value); > + } > + else > + error_report("invalid > stateless static nat rule '%s', empty guest-side-ip", ssnat_value); > + } > + else > + { > + if(inet_aton(p, > &s->ssnat_info.ssnat_guest)) > + { > + s->ssnat_info.ssnat_host = > s->ssnat_info.ssnat_ifaddr; > + s->ssnat_info.ssnat_mask = > s->ssnat_info.ssnat_ifmask; > + > + s->ssnat_info.ssnat_active = > 1; > + returnValue = 0; > + } > + else > + error_report("invalid > stateless static nat rule '%s'", ssnat_value); > + } > + } > + else > + error_report("failed to fetch the interface > '%s' information", ifname); > + } > + else > + error_report("could not retrieve ifname attribute"); > + } > + else > + returnValue = 0; > + > + return returnValue; > +} > + > int net_init_tap(const NetClientOptions *opts, const char *name, > NetClientState *peer) > { > @@ -705,7 +1072,7 @@ int net_init_tap(const NetClientOptions *opts, const > char *name, > return -1; > } > > - return 0; > + return ssnat_net_tap_init(s, tap); > } > > VHostNetState *tap_get_vhost_net(NetClientState *nc) > diff --git a/qapi-schema.json b/qapi-schema.json > index bd8ad74..59aa127 100644 > --- a/qapi-schema.json > +++ b/qapi-schema.json > @@ -2105,6 +2105,8 @@ > # > # @vhostforce: #optional vhost on for non-MSIX virtio guests > # > +# @ssnat: #optional stateless static nat NAT > +# > # Since 1.2 > ## > { 'type': 'NetdevTapOptions', > @@ -2118,7 +2120,8 @@ > '*vnet_hdr': 'bool', > '*vhost': 'bool', > '*vhostfd': 'str', > - '*vhostforce': 'bool' } } > + '*vhostforce': 'bool', > + '*ssnat': 'str' } } > > ## > # @NetdevSocketOptions > diff --git a/qemu-options.hx b/qemu-options.hx > index 3c411c4..c0aa852 100644 > --- a/qemu-options.hx > +++ b/qemu-options.hx > @@ -1268,7 +1268,9 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > "-net tap[,vlan=n][,name=str],ifname=name\n" > " connect the host TAP network interface to VLAN 'n'\n" > #else > - "-net > tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile][,helper=helper][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h][,vhostforce=on|off]\n" > + "-net > tap[,vlan=n][,name=str][,fd=h][,ifname=name][,script=file][,downscript=dfile]\n" > + " > [,helper=helper][,sndbuf=nbytes][,vnet_hdr=on|off][,vhost=on|off][,vhostfd=h]\n" > + " [,vhostforce=on|off][,ssnat=rule]\n" > " connect the host TAP network interface to VLAN 'n' \n" > " use network scripts 'file' (default=" > DEFAULT_NETWORK_SCRIPT ")\n" > " to configure it and 'dfile' (default=" > DEFAULT_NETWORK_DOWN_SCRIPT ")\n" > @@ -1285,6 +1287,12 @@ DEF("net", HAS_ARG, QEMU_OPTION_net, > " (only has effect for virtio guests which use > MSIX)\n" > " use vhostforce=on to force vhost on for non-MSIX virtio > guests\n" > " use 'vhostfd=h' to connect to an already opened vhost > net device\n" > + " use 'ssnat=rule' to create stateless static nat\n" > + " rule: <guest-side-ip>[:<netmask>:<host-side-ip>]\n" > + " for example: > 'ssnat=172.16.0.0:255.255.255.0:192.168.1.0' will result in translating\n" > + " the Guest machine IP addresses from > 172.16.0.x to 192.168.1.x\n" guest > + " If only the guest-side-ip parameter is passed, > the netmask and host-side-ip\n" > + " will be taken from the interface passed via > ifname\n" > "-net bridge[,vlan=n][,name=str][,br=bridge][,helper=helper]\n" > " connects a host TAP network interface to a host bridge > device 'br'\n" > " (default=" DEFAULT_BRIDGE_INTERFACE ") using the > program 'helper'\n" > -- > 1.7.2.5 > >