[Qemu-devel] [PATCH 00/16] slirp: Adding IPv6 support to Qemu -net user mode
We have developed IPv6 support for Qemu -net user mode. These patches add ICMPv6 and NDP, and make UDP and TCP compatible with IPv6. We have also refactored parts of the current code to make it compatible with IPv6. Some patches, such as 2 and 13, mostly change indentation; they are easier to review using interdiff -w /dev/null patchfile, which hides the whitespace-only changes. [PATCH 01/16] slirp: goto bad in udp_input if sosendto fails [PATCH 02/16] slirp: Generalizing and neutralizing code before adding [PATCH 03/16] qemu/timer.h : Adding function to second scale [PATCH 04/16] slirp: Adding IPv6, ICMPv6 Echo and NDP [PATCH 05/16] slirp: Adding ICMPv6 error sending [PATCH 06/16] slirp: Make Socket structure IPv6 compatible [PATCH 07/16] slirp: Factorizing address translation [PATCH 08/16] slirp: Factorizing and cleaning solookup() [PATCH 09/16] slirp: Make udp_attach IPv6 compatible [PATCH 10/16] slirp: Adding IPv6 UDP support [PATCH 11/16] slirp: Adding family argument to tcp_fconnect() [PATCH 12/16] slirp: Factorizing tcpiphdr structure with an union [PATCH 13/16] slirp: Generalizing and neutralizing various TCP [PATCH 14/16] slirp: Handle IPv6 in TCP functions [PATCH 15/16] slirp: Adding IPv6 address for DNS relay [PATCH 16/16] qapi-schema, qemu-options slirp: Adding Qemu options
[Qemu-devel] [PATCH 03/16] qemu/timer.h : Adding function to second scale
This patch adds SCALE_S, timer_new_s(), and qemu_clock_get_s in qemu/timer.h to manage second-scale timers. Signed-off-by: Guillaume Subiron maet...@subiron.org Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org --- include/qemu/timer.h | 32 1 file changed, 32 insertions(+) diff --git a/include/qemu/timer.h b/include/qemu/timer.h index b58903b..f71553d 100644 --- a/include/qemu/timer.h +++ b/include/qemu/timer.h @@ -7,6 +7,7 @@ /* timers */ +#define SCALE_S 10 #define SCALE_MS 100 #define SCALE_US 1000 #define SCALE_NS 1 @@ -81,6 +82,20 @@ extern QEMUTimerListGroup main_loop_tlg; int64_t qemu_clock_get_ns(QEMUClockType type); /** + * qemu_clock_get_s; + * @type: the clock type + * + * Get the second value of a clock with + * type @type + * + * Returns: the clock value in seconds + */ +static inline int64_t qemu_clock_get_s(QEMUClockType type) +{ +return qemu_clock_get_ns(type) / SCALE_S; +} + +/** * qemu_clock_get_ms; * @type: the clock type * @@ -508,6 +523,23 @@ static inline QEMUTimer *timer_new_ms(QEMUClockType type, QEMUTimerCB *cb, } /** + * timer_new_s: + * @clock: the clock to associate with the timer + * @callback: the callback to call when the timer expires + * @opaque: the opaque pointer to pass to the callback + * + * Create a new timer with second scale on the default timer list + * associated with the clock. + * + * Returns: a pointer to the newly created timer + */ +static inline QEMUTimer *timer_new_s(QEMUClockType type, QEMUTimerCB *cb, + void *opaque) +{ +return timer_new(type, SCALE_S, cb, opaque); +} + +/** * timer_free: * @ts: the timer * -- 1.8.4.rc3
[Qemu-devel] [PATCH 15/16] slirp: Adding IPv6 address for DNS relay
This patch adds an IPv6 address for the DNS relay to the Slirp structure (vnameserver_addr6). in6_equal_dns() is implemented using this new attribute. sotranslate_in() and sotranslate_out() are also updated to handle the IPv6 case, so that the guest can reach the host using one of the Slirp addresses. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/ip6.h| 5 - slirp/slirp.c | 2 ++ slirp/slirp.h | 1 + slirp/socket.c | 26 -- 4 files changed, 31 insertions(+), 3 deletions(-) diff --git a/slirp/ip6.h b/slirp/ip6.h index 16124ec..b88456d 100644 --- a/slirp/ip6.h +++ b/slirp/ip6.h @@ -74,7 +74,10 @@ static inline int in6_equal_mach(struct in6_addr a, struct in6_addr b, || (in6_equal_net(a, (struct in6_addr)LINKLOCAL_ADDR, 64)\ in6_equal_mach(a, slirp-vhost_addr6, 64))) -#define in6_equal_dns(a) 0 +#define in6_equal_dns(a)\ +((in6_equal_net(a, slirp-vprefix_addr6, slirp-vprefix_len)\ + || in6_equal_net(a, (struct in6_addr)LINKLOCAL_ADDR, 64))\ + in6_equal_mach(a, slirp-vnameserver_addr6, slirp-vprefix_len)) #define in6_equal_host(a)\ (in6_equal_router(a) || in6_equal_dns(a)) diff --git a/slirp/slirp.c b/slirp/slirp.c index 0f6f006..695e8a6 100644 --- a/slirp/slirp.c +++ b/slirp/slirp.c @@ -236,6 +236,8 @@ Slirp *slirp_init(int restricted, struct in_addr vnetwork, slirp-bootp_filename = g_strdup(bootfile); slirp-vdhcp_startaddr = vdhcp_start; slirp-vnameserver_addr = vnameserver; +/* :TODO:maethor:130311: Use a parameter passed to the function */ +inet_pton(AF_INET6, fc00::2, slirp-vnameserver_addr6); if (vdnssearch) { translate_dnssearch(slirp, vdnssearch); diff --git a/slirp/slirp.h b/slirp/slirp.h index b6e805e..0688ea7 100644 --- a/slirp/slirp.h +++ b/slirp/slirp.h @@ -236,6 +236,7 @@ struct Slirp { struct in6_addr vhost_addr6; struct in_addr vdhcp_startaddr; struct in_addr vnameserver_addr; +struct in6_addr vnameserver_addr6; struct in_addr client_ipaddr; char client_hostname[33]; diff --git a/slirp/socket.c b/slirp/socket.c index 567f9bc..a9b3957 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -741,12 
+741,12 @@ sofwdrain(struct socket *so) /* * Translate addr in host addr when it is a virtual address - * :TODO:maethor:130314: Manage IPv6 */ void sotranslate_out(struct socket *so, struct sockaddr_storage *addr) { Slirp *slirp = so-slirp; struct sockaddr_in *sin = (struct sockaddr_in *)addr; +struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; switch (addr-ss_family) { case AF_INET: @@ -767,16 +767,29 @@ void sotranslate_out(struct socket *so, struct sockaddr_storage *addr) ntohs(sin-sin_port), inet_ntoa(sin-sin_addr))); break; +case AF_INET6: +if (in6_equal_net(so-so_faddr6, slirp-vprefix_addr6, +slirp-vprefix_len)) { +if (in6_equal(so-so_faddr6, slirp-vnameserver_addr6)) { +/*if (get_dns_addr(addr) 0) {*/ /* TODO */ +sin6-sin6_addr = in6addr_loopback; +/*}*/ +} else { +sin6-sin6_addr = in6addr_loopback; +} +} +break; + default: break; } } -/* :TODO:maethor:130314: IPv6 */ void sotranslate_in(struct socket *so, struct sockaddr_storage *addr) { Slirp *slirp = so-slirp; struct sockaddr_in *sin = (struct sockaddr_in *)addr; +struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr; switch (addr-ss_family) { case AF_INET: @@ -793,6 +806,15 @@ void sotranslate_in(struct socket *so, struct sockaddr_storage *addr) } break; +case AF_INET6: +if (in6_equal_net(so-so_faddr6, slirp-vprefix_addr6, +slirp-vprefix_len)) { +if (in6_equal(sin6-sin6_addr, in6addr_loopback) +|| !in6_equal(so-so_faddr6, slirp-vhost_addr6)) { +sin6-sin6_addr = so-so_faddr6; +} +} + default: break; } -- 1.8.4.rc3
[Qemu-devel] [PATCH 11/16] slirp: Adding family argument to tcp_fconnect()
This patch simply adds a sa_family_t argument to remove the hardcoded AF_INET in the call of qemu_socket(). Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/slirp.h | 2 +- slirp/tcp_input.c | 3 ++- slirp/tcp_subr.c | 5 +++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/slirp/slirp.h b/slirp/slirp.h index 0d1e14d..aa44055 100644 --- a/slirp/slirp.h +++ b/slirp/slirp.h @@ -367,7 +367,7 @@ void tcp_respond(struct tcpcb *, register struct tcpiphdr *, register struct mbu struct tcpcb * tcp_newtcpcb(struct socket *); struct tcpcb * tcp_close(register struct tcpcb *); void tcp_sockclosed(struct tcpcb *); -int tcp_fconnect(struct socket *); +int tcp_fconnect(struct socket *, sa_family_t af); void tcp_connect(struct socket *); int tcp_attach(struct socket *); uint8_t tcp_tos(struct socket *); diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c index f7a8d49..25929bd 100644 --- a/slirp/tcp_input.c +++ b/slirp/tcp_input.c @@ -581,7 +581,8 @@ findso: goto cont_input; } - if((tcp_fconnect(so) == -1) (errno != EINPROGRESS) (errno != EWOULDBLOCK)) { + if ((tcp_fconnect(so, so-so_ffamily) == -1) + (errno != EINPROGRESS) (errno != EWOULDBLOCK)) { u_char code=ICMP_UNREACH_NET; DEBUG_MISC((dfd, tcp fconnect errno = %d-%s\n, errno,strerror(errno))); diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c index 4791c0c..3558115 100644 --- a/slirp/tcp_subr.c +++ b/slirp/tcp_subr.c @@ -324,14 +324,15 @@ tcp_sockclosed(struct tcpcb *tp) * nonblocking. Connect returns after the SYN is sent, and does * not wait for ACK+SYN. */ -int tcp_fconnect(struct socket *so) +int tcp_fconnect(struct socket *so, sa_family_t af) { int ret=0; DEBUG_CALL(tcp_fconnect); DEBUG_ARG(so = %lx, (long )so); - if( (ret = so-s = qemu_socket(AF_INET,SOCK_STREAM,0)) = 0) { + ret = so-s = qemu_socket(af, SOCK_STREAM, 0); + if (ret = 0) { int opt, s=so-s; struct sockaddr_storage addr; -- 1.8.4.rc3
[Qemu-devel] [PATCH 01/16] slirp: goto bad in udp_input if sosendto fails
Before this patch, if sosendto() failed, udp_input() kept executing as if the packet had been sent. This could cause a memory leak. This patch adds a goto bad after the ICMP error is sent, so that the function stops processing the packet. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/udp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/slirp/udp.c b/slirp/udp.c index 8cc6cb6..fd2446a 100644 --- a/slirp/udp.c +++ b/slirp/udp.c @@ -218,6 +218,7 @@ udp_input(register struct mbuf *m, int iphlen) *ip=save_ip; DEBUG_MISC((dfd,udp tx errno = %d-%s\n,errno,strerror(errno))); icmp_error(m, ICMP_UNREACH,ICMP_UNREACH_NET, 0,strerror(errno)); + goto bad; } m_free(so-so_m); /* used for ICMP if error on sorecvfrom */ -- 1.8.4.rc3
[Qemu-devel] [PATCH 08/16] slirp: Factorizing and cleaning solookup()
This patch makes solookup() compatible with all address families. Previously, this function was only compatible with TCP; now that the socket list is passed as an argument, it is compatible with UDP too. Finally, the last-result optimization that callers used to perform is factorized into the function (it checks the last returned socket before walking the complete socket list). This patch also adds a sockaddr_equal() function to compare two sockaddr_storage structures. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/socket.c| 30 -- slirp/socket.h| 30 +- slirp/tcp_input.c | 27 +++ slirp/udp.c | 25 ++--- 4 files changed, 62 insertions(+), 50 deletions(-) diff --git a/slirp/socket.c b/slirp/socket.c index 375281c..f333fcf 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -15,24 +15,26 @@ static void sofcantrcvmore(struct socket *so); static void sofcantsendmore(struct socket *so); -struct socket * -solookup(struct socket *head, struct in_addr laddr, u_int lport, - struct in_addr faddr, u_int fport) +struct socket *solookup(struct socket **last, struct socket *head, +struct sockaddr_storage *lhost, struct sockaddr_storage *fhost) { - struct socket *so; +struct socket *so = *last; - for (so = head-so_next; so != head; so = so-so_next) { - if (so-so_lport == lport - so-so_laddr.s_addr == laddr.s_addr - so-so_faddr.s_addr == faddr.s_addr - so-so_fport == fport) - break; - } +/* Optimisation */ +if (sockaddr_equal((so-lhost.ss), lhost) + (!fhost || sockaddr_equal((so-fhost.ss), fhost))) { +return so; +} - if (so == head) - return (struct socket *)NULL; - return so; +for (so = head-so_next; so != head; so = so-so_next) { +if (sockaddr_equal((so-lhost.ss), lhost) + (!fhost || sockaddr_equal((so-fhost.ss), fhost))) { +*last = so; +return so; +} +} +return (struct socket *)NULL; } /* diff --git a/slirp/socket.h b/slirp/socket.h index 50059be..ad509b9 100644 --- a/slirp/socket.h +++ b/slirp/socket.h @@ -93,7 +93,35 @@ struct socket { #define SS_HOSTFWD 0x1000 /* Socket describes host-guest forwarding */ #define 
SS_INCOMING0x2000 /* Connection was initiated by a host on the internet */ -struct socket * solookup(struct socket *, struct in_addr, u_int, struct in_addr, u_int); +static inline int sockaddr_equal(struct sockaddr_storage *a, +struct sockaddr_storage *b) +{ +if (a-ss_family != b-ss_family) { +return 0; +} else { +switch (a-ss_family) { +case AF_INET: +{ +struct sockaddr_in *a4 = (struct sockaddr_in *) a; +struct sockaddr_in *b4 = (struct sockaddr_in *) b; +return (a4-sin_addr.s_addr == b4-sin_addr.s_addr + a4-sin_port == b4-sin_port); +} +case AF_INET6: +{ +struct sockaddr_in6 *a6 = (struct sockaddr_in6 *) a; +struct sockaddr_in6 *b6 = (struct sockaddr_in6 *) b; +return (in6_equal(a6-sin6_addr, b6-sin6_addr) + a6-sin6_port == b6-sin6_port); +} +default: +return 0; +} +} +} + +struct socket *solookup(struct socket **, struct socket *, +struct sockaddr_storage *, struct sockaddr_storage *); struct socket * socreate(Slirp *); void sofree(struct socket *); int soread(struct socket *); diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c index 70ef376..f7a8d49 100644 --- a/slirp/tcp_input.c +++ b/slirp/tcp_input.c @@ -227,6 +227,7 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) int iss = 0; u_long tiwin; int ret; + struct sockaddr_storage lhost, fhost; struct ex_list *ex_ptr; Slirp *slirp; @@ -320,16 +321,14 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) * Locate pcb for segment. 
*/ findso: - so = slirp-tcp_last_so; - if (so-so_fport != ti-ti_dport || - so-so_lport != ti-ti_sport || - so-so_laddr.s_addr != ti-ti_src.s_addr || - so-so_faddr.s_addr != ti-ti_dst.s_addr) { - so = solookup(slirp-tcb, ti-ti_src, ti-ti_sport, - ti-ti_dst, ti-ti_dport); - if (so) - slirp-tcp_last_so = so; - } + lhost.ss_family = AF_INET; + ((struct sockaddr_in *)lhost)-sin_addr = ti-ti_src; + ((struct sockaddr_in *)lhost)-sin_port = ti-ti_sport; + fhost.ss_family = AF_INET; + ((struct sockaddr_in *)fhost)-sin_addr = ti-ti_dst; + ((struct sockaddr_in *)fhost)-sin_port = ti-ti_dport; + + so = solookup(slirp-tcp_last_so, slirp-tcb, lhost, fhost); /* * If the state is
[Qemu-devel] [PATCH 05/16] slirp: Adding ICMPv6 error sending
Disambiguation : icmp_error is renamed into icmp_send_error, since it doesn't manage errors, but only sends ICMP Error messages. Adding icmp6_send_error to send ICMPv6 Error messages. This function is simpler than the v4 version. Adding some calls in various functions to send ICMP errors, when a received packet is too big, or when its hop limit is 0. Signed-off-by: Yann Bordenave m...@meowstars.org --- slirp/ip6_icmp.c | 60 +++ slirp/ip6_icmp.h | 10 ++ slirp/ip6_input.c | 16 --- slirp/ip_icmp.c | 12 +-- slirp/ip_icmp.h | 4 ++-- slirp/ip_input.c | 8 slirp/socket.c| 4 ++-- slirp/tcp_input.c | 2 +- slirp/udp.c | 3 ++- 9 files changed, 96 insertions(+), 23 deletions(-) diff --git a/slirp/ip6_icmp.c b/slirp/ip6_icmp.c index 32de0ba..706e430 100644 --- a/slirp/ip6_icmp.c +++ b/slirp/ip6_icmp.c @@ -65,6 +65,66 @@ static void icmp6_send_echoreply(struct mbuf *m, Slirp *slirp, struct ip6 *ip, ip6_output(NULL, t, 0); } +void icmp6_send_error(struct mbuf *m, uint8_t type, uint8_t code) +{ +Slirp *slirp = m-slirp; +struct mbuf *t = m_get(slirp); +struct ip6 *ip = mtod(m, struct ip6 *); + +char addrstr[INET6_ADDRSTRLEN]; +DEBUG_CALL(icmp_send_error); +DEBUG_ARGS((dfd, type = %d, code = %d\n, type, code)); + +/* IPv6 packet */ +struct ip6 *rip = mtod(t, struct ip6 *); +rip-ip_src = (struct in6_addr)LINKLOCAL_ADDR; +if (in6_multicast(ip-ip_src) || in6_unspecified(ip-ip_src)) { +/* :TODO:maethor:130317: icmp error? 
*/ +return; +} +rip-ip_dst = ip-ip_src; +inet_ntop(AF_INET6, rip-ip_dst, addrstr, INET6_ADDRSTRLEN); +DEBUG_ARG(target = %s, addrstr); + +rip-ip_nh = IPPROTO_ICMPV6; +const int error_data_len = min(m-m_len, +IF_MTU - (sizeof(struct ip6) + ICMP6_ERROR_MINLEN)); +rip-ip_pl = htons(ICMP6_ERROR_MINLEN + error_data_len); +t-m_len = sizeof(struct ip6) + ntohs(rip-ip_pl); + +/* ICMPv6 packet */ +t-m_data += sizeof(struct ip6); +struct icmp6 *ricmp = mtod(t, struct icmp6 *); +ricmp-icmp6_type = type; +ricmp-icmp6_code = code; +ricmp-icmp6_cksum = 0; + +switch (type) { +case ICMP6_UNREACH: +case ICMP6_TIMXCEED: +ricmp-icmp6_err.unused = 0; +break; +case ICMP6_TOOBIG: +ricmp-icmp6_err.mtu = htonl(IF_MTU); +break; +case ICMP6_PARAMPROB: +/* :TODO:Meow:130316: Handle this case */ +break; +default: +assert(0); +break; +} +t-m_data += ICMP6_ERROR_MINLEN; +memcpy(t-m_data, m-m_data, error_data_len); + +/* Checksum */ +t-m_data -= ICMP6_ERROR_MINLEN; +t-m_data -= sizeof(struct ip6); +ricmp-icmp6_cksum = ip6_cksum(t); + +ip6_output(NULL, t, 0); +} + /* * Process a NDP message */ diff --git a/slirp/ip6_icmp.h b/slirp/ip6_icmp.h index 2b21c84..7779964 100644 --- a/slirp/ip6_icmp.h +++ b/slirp/ip6_icmp.h @@ -22,6 +22,12 @@ struct icmp6_echo { /* Echo Messages */ uint16_t seq_num; }; +union icmp6_error_body { +uint32_t unused; +uint32_t pointer; +uint32_t mtu; +}; + /* * NDP Messages */ @@ -85,6 +91,7 @@ struct icmp6 { uint8_t icmp6_code; /* type sub code */ uint16_ticmp6_cksum;/* ones complement cksum of struct */ union { +union icmp6_error_body error_body; struct icmp6_echo echo; struct ndp_rs ndp_rs; struct ndp_ra ndp_ra; @@ -92,6 +99,7 @@ struct icmp6 { struct ndp_na ndp_na; struct ndp_redirect ndp_redirect; } icmp6_body; +#define icmp6_err icmp6_body.error_body #define icmp6_echo icmp6_body.echo #define icmp6_nrs icmp6_body.ndp_rs #define icmp6_nra icmp6_body.ndp_ra @@ -101,6 +109,7 @@ struct icmp6 { } QEMU_PACKED; #define ICMP6_MINLEN4 +#define ICMP6_ERROR_MINLEN 8 #define 
ICMP6_ECHO_MINLEN 8 #define ICMP6_NDP_RS_MINLEN 8 #define ICMP6_NDP_RA_MINLEN 16 @@ -241,6 +250,7 @@ void icmp6_input(struct mbuf *); void icmp6_error(struct mbuf *msrc, u_char type, u_char code, int minsize, const char *message); */ +void icmp6_send_error(struct mbuf *m, uint8_t type, uint8_t code); void ndp_send_ra(Slirp *slirp); void ndp_send_ns(Slirp *slirp, struct in6_addr addr); diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c index 9663c42..af098a5 100644 --- a/slirp/ip6_input.c +++ b/slirp/ip6_input.c @@ -33,7 +33,7 @@ void ip6_input(struct mbuf *m) DEBUG_ARG(m_len = %d, m-m_len); if (m-m_len sizeof(struct ip6)) { -return; +goto bad; } ip6 = mtod(m, struct ip6 *); @@ -42,10 +42,14 @@ void ip6_input(struct mbuf *m) goto bad; } +if (ntohs(ip6-ip_pl) IF_MTU) { +icmp6_send_error(m, ICMP6_TOOBIG, 0); +goto bad; +} + /* check ip_ttl for a correct ICMP reply */ if (ip6-ip_hl == 0) { -/* :TODO:maethor:130307: icmp6_error
[Qemu-devel] [PATCH 09/16] slirp: Make udp_attach IPv6 compatible
A sa_family_t is now passed as an argument to udp_attach(), instead of using a hardcoded AF_INET in the call to qemu_socket(). Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/ip_icmp.c | 2 +- slirp/udp.c | 7 --- slirp/udp.h | 2 +- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c index 8787aae..c896574 100644 --- a/slirp/ip_icmp.c +++ b/slirp/ip_icmp.c @@ -162,7 +162,7 @@ icmp_input(struct mbuf *m, int hlen) if (icmp_send(so, m, hlen) == 0) { return; } - if(udp_attach(so) == -1) { + if (udp_attach(so, AF_INET) == -1) { DEBUG_MISC((dfd,icmp_input udp_attach errno = %d-%s\n, errno,strerror(errno))); sofree(so); diff --git a/slirp/udp.c b/slirp/udp.c index 7e0f1b2..f53ee11 100644 --- a/slirp/udp.c +++ b/slirp/udp.c @@ -167,7 +167,7 @@ udp_input(register struct mbuf *m, int iphlen) if (!so) { goto bad; } - if(udp_attach(so) == -1) { + if (udp_attach(so, AF_INET) == -1) { DEBUG_MISC((dfd, udp_attach errno = %d-%s\n, errno,strerror(errno))); sofree(so); @@ -276,9 +276,10 @@ int udp_output(struct socket *so, struct mbuf *m, } int -udp_attach(struct socket *so) +udp_attach(struct socket *so, sa_family_t af) { - if((so-s = qemu_socket(AF_INET,SOCK_DGRAM,0)) != -1) { + so-s = qemu_socket(af, SOCK_DGRAM, 0); + if (so-s != -1) { so-so_expire = curtime + SO_EXPIRE; insque(so, so-slirp-udb); } diff --git a/slirp/udp.h b/slirp/udp.h index a04b8ce..15e73c1 100644 --- a/slirp/udp.h +++ b/slirp/udp.h @@ -76,7 +76,7 @@ struct mbuf; void udp_init(Slirp *); void udp_cleanup(Slirp *); void udp_input(register struct mbuf *, int); -int udp_attach(struct socket *); +int udp_attach(struct socket *, sa_family_t af); void udp_detach(struct socket *); struct socket * udp_listen(Slirp *, uint32_t, u_int, uint32_t, u_int, int); -- 1.8.4.rc3
[Qemu-devel] [PATCH 10/16] slirp: Adding IPv6 UDP support
This patch adds udp6_input() and udp6_output(). It also adds the IPv6 case in sorecvfrom(). Finally, udp6_input() is now called by ip6_input() for UDP packets. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/Makefile.objs | 2 +- slirp/ip6_input.c | 3 +- slirp/socket.c | 7 ++- slirp/udp.h | 5 ++ slirp/udp6.c| 149 5 files changed, 162 insertions(+), 4 deletions(-) create mode 100644 slirp/udp6.c diff --git a/slirp/Makefile.objs b/slirp/Makefile.objs index 2dfe8e0..faa32b6 100644 --- a/slirp/Makefile.objs +++ b/slirp/Makefile.objs @@ -1,3 +1,3 @@ common-obj-y = cksum.o if.o ip_icmp.o ip6_icmp.o ip6_input.o ip6_output.o ip_input.o ip_output.o dnssearch.o common-obj-y += slirp.o mbuf.o misc.o sbuf.o socket.o tcp_input.o tcp_output.o -common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o ndp_table.o +common-obj-y += tcp_subr.o tcp_timer.o udp.o udp6.o bootp.o tftp.o arp_table.o ndp_table.o diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c index af098a5..3290af8 100644 --- a/slirp/ip6_input.c +++ b/slirp/ip6_input.c @@ -62,8 +62,7 @@ void ip6_input(struct mbuf *m) icmp6_send_error(m, ICMP6_UNREACH, ICMP6_UNREACH_NO_ROUTE); break; case IPPROTO_UDP: -/* :TODO:maethor:130312: UDP */ -icmp6_send_error(m, ICMP6_UNREACH, ICMP6_UNREACH_NO_ROUTE); +udp6_input(m); break; case IPPROTO_ICMPV6: icmp6_input(m); diff --git a/slirp/socket.c b/slirp/socket.c index f333fcf..31bbb7e 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -540,8 +540,13 @@ sorecvfrom(struct socket *so) (struct sockaddr_in *) daddr, so-so_iptos); break; - default: + case AF_INET6: + udp6_output(so, m, (struct sockaddr_in6 *) saddr, + (struct sockaddr_in6 *) daddr); break; + default: + assert(0); + break; } } /* rx error */ } /* if ping packet */ diff --git a/slirp/udp.h b/slirp/udp.h index 15e73c1..8a4d9f5 100644 --- a/slirp/udp.h +++ b/slirp/udp.h @@ -83,4 +83,9 @@ struct socket * udp_listen(Slirp *, uint32_t, u_int, uint32_t, u_int, int udp_output(struct socket *so, struct mbuf *m, struct 
sockaddr_in *saddr, struct sockaddr_in *daddr, int iptos); + +void udp6_input(register struct mbuf *); +int udp6_output(struct socket *so, struct mbuf *m, +struct sockaddr_in6 *saddr, struct sockaddr_in6 *daddr); + #endif diff --git a/slirp/udp6.c b/slirp/udp6.c new file mode 100644 index 000..3940959 --- /dev/null +++ b/slirp/udp6.c @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron + * + * Please read the file COPYRIGHT for the + * terms and conditions of the copyright. + */ + +#include slirp.h +#include udp.h + +void udp6_input(struct mbuf *m) +{ +Slirp *slirp = m-slirp; +struct ip6 *ip, save_ip; +struct udphdr *uh; +int hlen = sizeof(struct ip6); +int len; +struct socket *so; +struct sockaddr_storage lhost; + +DEBUG_CALL(udp6_input); +DEBUG_ARG(m = %lx, (long)m); + +if (slirp-restricted) { +goto bad; +} + +ip = mtod(m, struct ip6 *); +m-m_len -= hlen; +m-m_data += hlen; +uh = mtod(m, struct udphdr *); +m-m_len += hlen; +m-m_data -= hlen; + +if (ip6_cksum(m)) { +goto bad; +} + +len = ntohs((uint16_t)uh-uh_ulen); + +/* + * Make mbuf data length reflect UDP length. + * If not enough data to reflect UDP length, drop. + */ +if (ntohs(ip-ip_pl) != len) { +if (len ntohs(ip-ip_pl)) { +goto bad; +} +m_adj(m, len - ntohs(ip-ip_pl)); +ip-ip_pl = htons(len); +} + +/* TODO handle DHCP/BOOTP */ +/* TODO handle TFTP */ + +/* Locate pcb for datagram. */ +lhost.ss_family = AF_INET6; +((struct sockaddr_in6 *)lhost)-sin6_addr = ip-ip_src; +((struct sockaddr_in6 *)lhost)-sin6_port = uh-uh_sport; + +so = solookup(slirp-udp_last_so, slirp-udb, lhost, NULL); + +if (so == NULL) { +/* If there's no socket for this packet, create one. 
*/ +so = socreate(slirp); +if (!so) { +goto bad; +} +if (udp_attach(so, AF_INET6) == -1) { +DEBUG_MISC((dfd, udp6_attach errno = %d-%s\n, +errno, strerror(errno))); +sofree(so); +goto bad; +} + +/* Setup fields */ +so-so_lfamily = AF_INET6; +so-so_laddr6 = ip-ip_src; +so-so_lport6 = uh-uh_sport; +} + +so-so_ffamily = AF_INET6; +so-so_faddr6 = ip-ip_dst; /* XXX */ +so-so_fport6 = uh-uh_dport; /* XXX */ + +hlen += sizeof(struct udphdr); +m-m_len -= hlen; +m-m_data += hlen; + +/* + * Now we sendto() the packet. + */ +
[Qemu-devel] [PATCH 13/16] slirp: Generalizing and neutralizing various TCP functions before adding IPv6 stuff
Basically, this patch adds some switch in various TCP functions to prepare them for the IPv6 case. To have something to switch in tcp_input() and tcp_respond(), a new argument is used to give them the sa_family of the addresses they are working on. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/ip_input.c | 2 +- slirp/slirp.c | 6 ++- slirp/slirp.h | 5 +- slirp/tcp_input.c | 142 + slirp/tcp_output.c | 43 +--- slirp/tcp_subr.c | 94 +-- slirp/tcp_timer.c | 3 +- 7 files changed, 181 insertions(+), 114 deletions(-) diff --git a/slirp/ip_input.c b/slirp/ip_input.c index 1925cdc..9aa8909 100644 --- a/slirp/ip_input.c +++ b/slirp/ip_input.c @@ -199,7 +199,7 @@ ip_input(struct mbuf *m) */ switch (ip-ip_p) { case IPPROTO_TCP: - tcp_input(m, hlen, (struct socket *)NULL); + tcp_input(m, hlen, (struct socket *)NULL, AF_INET); break; case IPPROTO_UDP: udp_input(m, hlen); diff --git a/slirp/slirp.c b/slirp/slirp.c index d0b8c79..0f6f006 100644 --- a/slirp/slirp.c +++ b/slirp/slirp.c @@ -576,7 +576,8 @@ void slirp_pollfds_poll(GArray *pollfds, int select_error) /* * Continue tcp_input */ -tcp_input((struct mbuf *)NULL, sizeof(struct ip), so); +tcp_input((struct mbuf *)NULL, sizeof(struct ip), so, +so-so_ffamily); /* continue; */ } else { ret = sowrite(so); @@ -625,7 +626,8 @@ void slirp_pollfds_poll(GArray *pollfds, int select_error) } } -tcp_input((struct mbuf *)NULL, sizeof(struct ip), so); +tcp_input((struct mbuf *)NULL, sizeof(struct ip), so, +so-so_ffamily); } /* SS_ISFCONNECTING */ #endif } diff --git a/slirp/slirp.h b/slirp/slirp.h index aa44055..b6e805e 100644 --- a/slirp/slirp.h +++ b/slirp/slirp.h @@ -352,7 +352,7 @@ void ip6_input(struct mbuf *); int ip6_output(struct socket *, struct mbuf *, int fast); /* tcp_input.c */ -void tcp_input(register struct mbuf *, int, struct socket *); +void tcp_input(register struct mbuf *, int, struct socket *, sa_family_t af); int tcp_mss(register struct tcpcb *, u_int); /* tcp_output.c */ @@ -363,7 +363,8 @@ void 
tcp_setpersist(register struct tcpcb *); void tcp_init(Slirp *); void tcp_cleanup(Slirp *); void tcp_template(struct tcpcb *); -void tcp_respond(struct tcpcb *, register struct tcpiphdr *, register struct mbuf *, tcp_seq, tcp_seq, int); +void tcp_respond(struct tcpcb *, register struct tcpiphdr *, +register struct mbuf *, tcp_seq, tcp_seq, int, sa_family_t); struct tcpcb * tcp_newtcpcb(struct socket *); struct tcpcb * tcp_close(register struct tcpcb *); void tcp_sockclosed(struct tcpcb *); diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c index dde89b6..3409557 100644 --- a/slirp/tcp_input.c +++ b/slirp/tcp_input.c @@ -213,7 +213,7 @@ present: * protocol specification dated September, 1981 very closely. */ void -tcp_input(struct mbuf *m, int iphlen, struct socket *inso) +tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af) { struct ip save_ip, *ip; register struct tcpiphdr *ti; @@ -254,46 +254,53 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) } slirp = m-slirp; - if (iphlen sizeof(struct ip )) { - ip_stripoptions(m, (struct mbuf *)0); - iphlen=sizeof(struct ip ); - } - /* XXX Check if too short */ + switch (af) { + case AF_INET: + if (iphlen sizeof(struct ip)) { + ip_stripoptions(m, (struct mbuf *)0); + iphlen = sizeof(struct ip); + } + /* XXX Check if too short */ - /* -* Save a copy of the IP header in case we want restore it -* for sending an ICMP error message in response. -*/ - ip=mtod(m, struct ip *); - save_ip = *ip; - save_ip.ip_len+= iphlen; + /* +* Save a copy of the IP header in case we want restore it +* for sending an ICMP error message in response. +*/ + ip = mtod(m, struct ip *); + save_ip = *ip; + save_ip.ip_len += iphlen; - /* -* Get IP and TCP header together in first mbuf. -* Note: IP leaves IP header in first mbuf. -*/ - m-m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip) -+ sizeof(struct tcphdr)); - m-m_len += sizeof(struct tcpiphdr) - (sizeof(struct ip) - + sizeof(struct
[Qemu-devel] [PATCH 12/16] slirp: Factorizing tcpiphdr structure with an union
This patch refactors the tcpiphdr structure to put the IPv4 fields in a union, in preparation for the addition of the IPv6 fields in a later patch. A few macros keep the existing code backward compatible. This patch also fixes the SLIRP_MSIZE and margin computations in various functions, and makes them compatible with the new tcpiphdr structure, whose size will be bigger than sizeof(struct tcphdr) + sizeof(struct ip). Signed-off-by: Guillaume Subiron maet...@subiron.org Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org --- slirp/if.h | 4 ++-- slirp/mbuf.c | 3 ++- slirp/slirp.c | 15 --- slirp/socket.c | 13 - slirp/tcp_input.c | 31 --- slirp/tcp_output.c | 18 +- slirp/tcp_subr.c | 31 ++- slirp/tcpip.h | 31 +++ 8 files changed, 102 insertions(+), 44 deletions(-) diff --git a/slirp/if.h b/slirp/if.h index 3327023..c7a5c57 100644 --- a/slirp/if.h +++ b/slirp/if.h @@ -17,7 +17,7 @@ #define IF_MRU 1500 #defineIF_COMP IF_AUTOCOMP /* Flags for compression */ -/* 2 for alignment, 14 for ethernet, 40 for TCP/IP */ -#define IF_MAXLINKHDR (2 + 14 + 40) +/* 2 for alignment, 14 for ethernet */ +#define IF_MAXLINKHDR (2 + ETH_HLEN) #endif diff --git a/slirp/mbuf.c b/slirp/mbuf.c index 92c429e..87ee550 100644 --- a/slirp/mbuf.c +++ b/slirp/mbuf.c @@ -23,7 +23,8 @@ * Find a nice value for msize * XXX if_maxlinkhdr already in mtu */ -#define SLIRP_MSIZE (IF_MTU + IF_MAXLINKHDR + offsetof(struct mbuf, m_dat) + 6) +#define SLIRP_MSIZE\ +(offsetof(struct mbuf, m_dat) + IF_MAXLINKHDR + TCPIPHDR_DELTA + IF_MTU) void m_init(Slirp *slirp) diff --git a/slirp/slirp.c b/slirp/slirp.c index 2caddcd..d0b8c79 100644 --- a/slirp/slirp.c +++ b/slirp/slirp.c @@ -756,15 +756,16 @@ void slirp_input(Slirp *slirp, const uint8_t *pkt, int pkt_len) m = m_get(slirp); if (!m) return; -/* Note: we add to align the IP header */ -if (M_FREEROOM(m) pkt_len + 2) { -m_inc(m, pkt_len + 2); +/* Note: we add 2 to align the IP header on 4 bytes, + * and add the margin for the tcpiphdr overhead */ +if (M_FREEROOM(m) 
pkt_len + TCPIPHDR_DELTA + 2) { +m_inc(m, pkt_len + TCPIPHDR_DELTA + 2); } -m-m_len = pkt_len + 2; -memcpy(m-m_data + 2, pkt, pkt_len); +m-m_len = pkt_len + TCPIPHDR_DELTA + 2; +memcpy(m-m_data + TCPIPHDR_DELTA + 2, pkt, pkt_len); -m-m_data += 2 + ETH_HLEN; -m-m_len -= 2 + ETH_HLEN; +m-m_data += TCPIPHDR_DELTA + 2 + ETH_HLEN; +m-m_len -= TCPIPHDR_DELTA + 2 + ETH_HLEN; if (proto == ETH_P_IP) { ip_input(m); diff --git a/slirp/socket.c b/slirp/socket.c index 31bbb7e..567f9bc 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -482,7 +482,18 @@ sorecvfrom(struct socket *so) if (!m) { return; } - m-m_data += IF_MAXLINKHDR; + switch (so-so_ffamily) { + case AF_INET: + m-m_data += IF_MAXLINKHDR + sizeof(struct udpiphdr); + break; + case AF_INET6: + m-m_data += IF_MAXLINKHDR + sizeof(struct ip6) ++ sizeof(struct udphdr); + break; + default: + assert(0); + break; + } /* * XXX Shouldn't FIONREAD packets destined for port 53, diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c index 25929bd..dde89b6 100644 --- a/slirp/tcp_input.c +++ b/slirp/tcp_input.c @@ -254,11 +254,6 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) } slirp = m-slirp; - /* -* Get IP and TCP header together in first mbuf. -* Note: IP leaves IP header in first mbuf. -*/ - ti = mtod(m, struct tcpiphdr *); if (iphlen sizeof(struct ip )) { ip_stripoptions(m, (struct mbuf *)0); iphlen=sizeof(struct ip ); @@ -275,14 +270,28 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso) save_ip.ip_len+= iphlen; /* +* Get IP and TCP header together in first mbuf. +* Note: IP leaves IP header in first mbuf. +*/ + m-m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip) ++ sizeof(struct tcphdr)); + m-m_len += sizeof(struct tcpiphdr) - (sizeof(struct ip) + + sizeof(struct tcphdr)); + ti = mtod(m, struct tcpiphdr *); + + /* * Checksum extended TCP header and data. 
*/ - tlen = ((struct ip *)ti)-ip_len; -tcpiphdr2qlink(ti)-next = tcpiphdr2qlink(ti)-prev = NULL; -memset(ti-ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr)); - ti-ti_x1 = 0; + tlen = ip-ip_len; + tcpiphdr2qlink(ti)-next =
[Qemu-devel] [PATCH 04/16] slirp: Adding IPv6, ICMPv6 Echo and NDP autoconfiguration
This patch adds the functions needed to handle IPv6 packets. ICMPv6 and NDP headers are implemented. Slirp is now able to send NDP Router or Neighbor Advertisement when it receives Router or Neighbor Solicitation. Using a 64bit-sized IPv6 prefix, the guest is now able to perform stateless autoconfiguration (SLAAC) and to compute its IPv6 address. This patch adds an ndp_table, mainly inspired by arp_table, to keep an NDP cache and manage network address resolution. Slirp regularly sends NDP Neighbor Advertisement, as recommended by the RFC, to make the guest refresh its route. This also adds ip6_cksum() to compute ICMPv6 checksums using IPv6 pseudo-header. Signed-off-by: Guillaume Subiron maet...@subiron.org Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org --- slirp/Makefile.objs | 4 +- slirp/cksum.c | 23 slirp/ip6.h | 139 + slirp/ip6_icmp.c| 350 slirp/ip6_icmp.h| 247 slirp/ip6_input.c | 75 +++ slirp/ip6_output.c | 41 ++ slirp/ndp_table.c | 87 + slirp/slirp.c | 47 +-- slirp/slirp.h | 33 + 10 files changed, 1036 insertions(+), 10 deletions(-) create mode 100644 slirp/ip6.h create mode 100644 slirp/ip6_icmp.c create mode 100644 slirp/ip6_icmp.h create mode 100644 slirp/ip6_input.c create mode 100644 slirp/ip6_output.c create mode 100644 slirp/ndp_table.c diff --git a/slirp/Makefile.objs b/slirp/Makefile.objs index 2daa9dc..2dfe8e0 100644 --- a/slirp/Makefile.objs +++ b/slirp/Makefile.objs @@ -1,3 +1,3 @@ -common-obj-y = cksum.o if.o ip_icmp.o ip_input.o ip_output.o dnssearch.o +common-obj-y = cksum.o if.o ip_icmp.o ip6_icmp.o ip6_input.o ip6_output.o ip_input.o ip_output.o dnssearch.o common-obj-y += slirp.o mbuf.o misc.o sbuf.o socket.o tcp_input.o tcp_output.o -common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o +common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o ndp_table.o diff --git a/slirp/cksum.c b/slirp/cksum.c index 6328660..f0a1398 100644 --- a/slirp/cksum.c +++ b/slirp/cksum.c @@ -137,3 +137,26 @@ cont: 
REDUCE; return (~sum 0x); } + +int ip6_cksum(struct mbuf *m) +{ +struct ip6 save_ip, *ip = mtod(m, struct ip6 *); +struct ip6_pseudohdr *ih = mtod(m, struct ip6_pseudohdr *); +int sum; + +save_ip = *ip; + +ih-ih_src = save_ip.ip_src; +ih-ih_dst = save_ip.ip_dst; +ih-ih_pl = htonl((uint32_t)ntohs(save_ip.ip_pl)); +ih-ih_zero_hi = 0; +ih-ih_zero_lo = 0; +ih-ih_nh = save_ip.ip_nh; + +sum = cksum(m, ((int)sizeof(struct ip6_pseudohdr)) ++ ntohl(ih-ih_pl)); + +*ip = save_ip; + +return sum; +} diff --git a/slirp/ip6.h b/slirp/ip6.h new file mode 100644 index 000..16124ec --- /dev/null +++ b/slirp/ip6.h @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2013 + * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne. + * + * Please read the file COPYRIGHT for the + * terms and conditions of the copyright. + */ + +#ifndef _IP6_H_ +#define _IP6_H_ + +#define in6_multicast(a) IN6_IS_ADDR_MULTICAST((a)) +#define in6_linklocal(a) IN6_IS_ADDR_LINKLOCAL((a)) +#define in6_unspecified(a) IN6_IS_ADDR_UNSPECIFIED((a)) + +#define ALLNODES_MULTICAST { .s6_addr = \ +{ 0xff, 0x02, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x01 } } + +#define SOLICITED_NODE_PREFIX { .s6_addr = \ +{ 0xff, 0x02, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x01,\ +0xff, 0x00, 0x00, 0x00 } } + +#define LINKLOCAL_ADDR { .s6_addr = \ +{ 0xfe, 0x80, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x00,\ +0x00, 0x00, 0x00, 0x01 } } + +static inline int in6_equal(struct in6_addr a, struct in6_addr b) +{ +return memcmp(a, b, sizeof(a)) == 0; +} + +static inline int in6_equal_net(struct in6_addr a, struct in6_addr b, +int prefix_len) +{ +if (memcmp(a, b, prefix_len / 8) != 0) { +return 0; +} + +if (prefix_len % 8 == 0) { +return 1; +} + +return (a.s6_addr[prefix_len / 8] (8 - (prefix_len % 8))) +== (b.s6_addr[prefix_len / 8] (8 - (prefix_len % 8))); +} + +static inline int in6_equal_mach(struct in6_addr a, struct in6_addr b, +int prefix_len) +{ +if 
(memcmp((a.s6_addr[(prefix_len + 7) / 8]), +(b.s6_addr[(prefix_len + 7) / 8]), +16 - (prefix_len + 7) / 8) != 0) { +return 0; +} + +if (prefix_len % 8 == 0) { +return 1; +} + +return (a.s6_addr[prefix_len / 8] ((1U (8 - (prefix_len % 8))) -
[Qemu-devel] [PATCH 06/16] slirp: Make Socket structure IPv6 compatible
This patch replaces foreign and local address/port couples in Socket structure by 2 sockaddr_storage which can be casted in sockaddr_in or sockaddr_in6. Direct access to address and port is still possible thanks to some \#define, so retrocompatibility of the existing code is assured. The ss_family field of sockaddr_storage is declared after each socket creation. The whole structure is also saved/restored when a Qemu session is saved/restored. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/ip_icmp.c | 2 ++ slirp/slirp.c | 48 slirp/socket.c| 14 +++--- slirp/socket.h| 25 + slirp/tcp_input.c | 2 ++ slirp/tcp_subr.c | 2 ++ slirp/udp.c | 4 7 files changed, 82 insertions(+), 15 deletions(-) diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c index 1808976..768ea4a 100644 --- a/slirp/ip_icmp.c +++ b/slirp/ip_icmp.c @@ -170,8 +170,10 @@ icmp_input(struct mbuf *m, int hlen) goto end_error; } so-so_m = m; + so-so_ffamily = AF_INET; so-so_faddr = ip-ip_dst; so-so_fport = htons(7); + so-so_lfamily = AF_INET; so-so_laddr = ip-ip_src; so-so_lport = htons(9); so-so_iptos = ip-ip_tos; diff --git a/slirp/slirp.c b/slirp/slirp.c index 1533d31..2caddcd 100644 --- a/slirp/slirp.c +++ b/slirp/slirp.c @@ -1056,10 +1056,26 @@ static void slirp_sbuf_save(QEMUFile *f, struct sbuf *sbuf) static void slirp_socket_save(QEMUFile *f, struct socket *so) { qemu_put_be32(f, so-so_urgc); -qemu_put_be32(f, so-so_faddr.s_addr); -qemu_put_be32(f, so-so_laddr.s_addr); -qemu_put_be16(f, so-so_fport); -qemu_put_be16(f, so-so_lport); +qemu_put_be16(f, so-so_ffamily); +switch (so-so_ffamily) { +case AF_INET: +qemu_put_be32(f, so-so_faddr.s_addr); +qemu_put_be16(f, so-so_fport); +break; +default: +fprintf(stderr, +so_ffamily unknown, unable to save so_faddr and so_fport\n); +} +qemu_put_be16(f, so-so_lfamily); +switch (so-so_lfamily) { +case AF_INET: +qemu_put_be32(f, so-so_laddr.s_addr); +qemu_put_be16(f, so-so_lport); +break; +default: +fprintf(stderr, +so_ffamily unknown, unable to save 
so_laddr and so_lport\n); +} qemu_put_byte(f, so-so_iptos); qemu_put_byte(f, so-so_emu); qemu_put_byte(f, so-so_type); @@ -1179,10 +1195,26 @@ static int slirp_socket_load(QEMUFile *f, struct socket *so) return -ENOMEM; so-so_urgc = qemu_get_be32(f); -so-so_faddr.s_addr = qemu_get_be32(f); -so-so_laddr.s_addr = qemu_get_be32(f); -so-so_fport = qemu_get_be16(f); -so-so_lport = qemu_get_be16(f); +so-so_ffamily = qemu_get_be16(f); +switch (so-so_ffamily) { +case AF_INET: +so-so_faddr.s_addr = qemu_get_be32(f); +so-so_fport = qemu_get_be16(f); +break; +default: +fprintf(stderr, +so_ffamily unknown, unable to restore so_faddr and so_lport\n); +} +so-so_lfamily = qemu_get_be16(f); +switch (so-so_lfamily) { +case AF_INET: +so-so_laddr.s_addr = qemu_get_be32(f); +so-so_lport = qemu_get_be16(f); +break; +default: +fprintf(stderr, +so_ffamily unknown, unable to restore so_laddr and so_lport\n); +} so-so_iptos = qemu_get_byte(f); so-so_emu = qemu_get_byte(f); so-so_type = qemu_get_byte(f); diff --git a/slirp/socket.c b/slirp/socket.c index e87c70e..2f166fb 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -437,8 +437,8 @@ sowrite(struct socket *so) void sorecvfrom(struct socket *so) { - struct sockaddr_in addr; - socklen_t addrlen = sizeof(struct sockaddr_in); + struct sockaddr_storage addr; + socklen_t addrlen = sizeof(struct sockaddr_storage); DEBUG_CALL(sorecvfrom); DEBUG_ARG(so = %lx, (long)so); @@ -527,7 +527,13 @@ sorecvfrom(struct socket *so) * If this packet was destined for CTL_ADDR, * make it look like that's where it came from, done by udp_output */ - udp_output(so, m, addr); + switch (so-so_ffamily) { + case AF_INET: + udp_output(so, m, (struct sockaddr_in *) addr); + break; + default: + break; + } } /* rx error */ } /* if ping packet */ } @@ -619,6 +625,7 @@ tcp_listen(Slirp *slirp, uint32_t haddr, u_int hport, uint32_t laddr, so-so_state = SS_PERSISTENT_MASK; so-so_state |= (SS_FACCEPTCONN | flags); + so-so_lfamily = AF_INET; so-so_lport = lport; /* Kept in 
network format */ so-so_laddr.s_addr = laddr; /* Ditto */ @@ -645,6 +652,7 @@ tcp_listen(Slirp *slirp, uint32_t haddr, u_int hport, uint32_t laddr, qemu_setsockopt(s, SOL_SOCKET, SO_OOBINLINE, opt, sizeof(int));
[Qemu-devel] [PATCH 14/16] slirp: Handle IPv6 in TCP functions
This patch adds IPv6 case in TCP functions refactored by the last patches. This also adds IPv6 pseudo-header in tcpiphdr structure. Finally, tcp_input() is called by ip6_input(). Signed-off-by: Guillaume Subiron maet...@subiron.org Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org --- slirp/ip6_input.c | 4 ++-- slirp/tcp.h| 2 ++ slirp/tcp_input.c | 58 +- slirp/tcp_output.c | 16 +++ slirp/tcp_subr.c | 36 + slirp/tcpip.h | 9 + 6 files changed, 105 insertions(+), 20 deletions(-) diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c index 3290af8..b03b795 100644 --- a/slirp/ip6_input.c +++ b/slirp/ip6_input.c @@ -58,8 +58,8 @@ void ip6_input(struct mbuf *m) */ switch (ip6-ip_nh) { case IPPROTO_TCP: -/* :TODO:maethor:130307: TCP */ -icmp6_send_error(m, ICMP6_UNREACH, ICMP6_UNREACH_NO_ROUTE); +NTOHS(ip6-ip_pl); +tcp_input(m, sizeof(struct ip6), (struct socket *)NULL, AF_INET6); break; case IPPROTO_UDP: udp6_input(m); diff --git a/slirp/tcp.h b/slirp/tcp.h index 2e2b403..61befcd 100644 --- a/slirp/tcp.h +++ b/slirp/tcp.h @@ -106,6 +106,8 @@ struct tcphdr { */ #undef TCP_MSS #defineTCP_MSS 1460 +#undef TCP6_MSS +#define TCP6_MSS 1440 #undef TCP_MAXWIN #defineTCP_MAXWIN 65535 /* largest value for (unscaled) window */ diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c index 3409557..e5056f8 100644 --- a/slirp/tcp_input.c +++ b/slirp/tcp_input.c @@ -215,7 +215,8 @@ present: void tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af) { - struct ip save_ip, *ip; + struct ip save_ip, *ip; + struct ip6 save_ip6, *ip6; register struct tcpiphdr *ti; caddr_t optp = NULL; int optlen = 0; @@ -254,6 +255,11 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af) } slirp = m-slirp; + ip = mtod(m, struct ip *); + ip6 = mtod(m, struct ip6 *); + save_ip = *ip; + save_ip6 = *ip6; + switch (af) { case AF_INET: if (iphlen sizeof(struct ip)) { @@ -262,13 +268,6 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af) } /* 
XXX Check if too short */ - - /* -* Save a copy of the IP header in case we want restore it -* for sending an ICMP error message in response. -*/ - ip = mtod(m, struct ip *); - save_ip = *ip; save_ip.ip_len += iphlen; /* @@ -293,16 +292,35 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af) ti-ti_dst = save_ip.ip_dst; ti-ti_pr = save_ip.ip_p; ti-ti_len = htons((uint16_t)tlen); - len = ((sizeof(struct tcpiphdr) - sizeof(struct tcphdr)) + tlen); - if (cksum(m, len)) { - goto drop; - } + break; + + case AF_INET6: + m-m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip6) ++ sizeof(struct tcphdr)); + m-m_len += sizeof(struct tcpiphdr) - (sizeof(struct ip6) ++ sizeof(struct tcphdr)); + ti = mtod(m, struct tcpiphdr *); + + tlen = ip6-ip_pl; + tcpiphdr2qlink(ti)-next = tcpiphdr2qlink(ti)-prev = NULL; + memset(ti-ih_mbuf, 0 , sizeof(struct mbuf_ptr)); + memset(ti-ti, 0, sizeof(ti-ti)); + ti-ti_x0 = 0; + ti-ti_src6 = save_ip6.ip_src; + ti-ti_dst6 = save_ip6.ip_dst; + ti-ti_nh6 = save_ip6.ip_nh; + ti-ti_len = htons((uint16_t)tlen); break; default: goto drop; } + len = ((sizeof(struct tcpiphdr) - sizeof(struct tcphdr)) + tlen); + if (cksum(m, len)) { + goto drop; + } + /* * Check that TCP offset makes sense, * pull out TCP options and adjust length. XXX @@ -346,6 +364,12 @@ findso: ((struct sockaddr_in *)fhost)-sin_addr = ti-ti_dst; ((struct sockaddr_in *)fhost)-sin_port = ti-ti_dport; break; + case AF_INET6: + ((struct sockaddr_in6 *)lhost)-sin6_addr = ti-ti_src6; + ((struct sockaddr_in6 *)lhost)-sin6_port = ti-ti_sport; + ((struct sockaddr_in6 *)fhost)-sin6_addr = ti-ti_dst6; + ((struct sockaddr_in6 *)fhost)-sin6_port = ti-ti_dport; + break; default: goto drop; } @@ -405,7 +429,6 @@ findso: so-so_iptos = ((struct ip *)ti)-ip_tos; break; default: - goto drop; break; } } @@ -634,6 +657,9 @@ findso: case AF_INET:
[Qemu-devel] [PATCH 02/16] slirp: Generalizing and neutralizing code before adding IPv6 stuff
Basically, this patch replaces arp by resolution every time arp means mac resolution and not specifically ARP. Some indentation problems are solved in functions that will be modified in the next patches (ip_input…). In if_encap, a switch is added to prepare for the IPv6 case. Some code is factorized. Some #define ETH_* are moved upper in slirp.h to make them accessible to other slirp/*.h Signed-off-by: Guillaume Subiron maet...@subiron.org Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org --- slirp/if.c| 2 +- slirp/mbuf.c | 2 +- slirp/mbuf.h | 2 +- slirp/slirp.c | 107 ++ slirp/slirp.h | 12 +++ 5 files changed, 71 insertions(+), 54 deletions(-) diff --git a/slirp/if.c b/slirp/if.c index 87ca8a5..c138ff4 100644 --- a/slirp/if.c +++ b/slirp/if.c @@ -193,7 +193,7 @@ void if_start(Slirp *slirp) /* Try to send packet unless it already expired */ if (ifm-expiration_date = now !if_encap(slirp, ifm)) { -/* Packet is delayed due to pending ARP resolution */ +/* Packet is delayed due to pending ARP or NDP resolution */ continue; } diff --git a/slirp/mbuf.c b/slirp/mbuf.c index 4fefb04..92c429e 100644 --- a/slirp/mbuf.c +++ b/slirp/mbuf.c @@ -91,7 +91,7 @@ m_get(Slirp *slirp) m-m_len = 0; m-m_nextpkt = NULL; m-m_prevpkt = NULL; -m-arp_requested = false; +m-resolution_requested = false; m-expiration_date = (uint64_t)-1; end_error: DEBUG_ARG(m = %lx, (long )m); diff --git a/slirp/mbuf.h b/slirp/mbuf.h index b144f1c..38fedf4 100644 --- a/slirp/mbuf.h +++ b/slirp/mbuf.h @@ -79,7 +79,7 @@ struct mbuf { int m_len; /* Amount of data in this mbuf */ Slirp *slirp; - boolarp_requested; + boolresolution_requested; uint64_t expiration_date; /* start of dynamic buffer area, must be last element */ union { diff --git a/slirp/slirp.c b/slirp/slirp.c index bad8dad..bfc4832 100644 --- a/slirp/slirp.c +++ b/slirp/slirp.c @@ -778,53 +778,70 @@ int if_encap(Slirp *slirp, struct mbuf *ifm) return 1; } -if (!arp_table_search(slirp, iph-ip_dst.s_addr, ethaddr)) { -uint8_t arp_req[ETH_HLEN 
+ sizeof(struct arphdr)]; -struct ethhdr *reh = (struct ethhdr *)arp_req; -struct arphdr *rah = (struct arphdr *)(arp_req + ETH_HLEN); - -if (!ifm-arp_requested) { -/* If the client addr is not known, send an ARP request */ -memset(reh-h_dest, 0xff, ETH_ALEN); -memcpy(reh-h_source, special_ethaddr, ETH_ALEN - 4); -memcpy(reh-h_source[2], slirp-vhost_addr, 4); -reh-h_proto = htons(ETH_P_ARP); -rah-ar_hrd = htons(1); -rah-ar_pro = htons(ETH_P_IP); -rah-ar_hln = ETH_ALEN; -rah-ar_pln = 4; -rah-ar_op = htons(ARPOP_REQUEST); - -/* source hw addr */ -memcpy(rah-ar_sha, special_ethaddr, ETH_ALEN - 4); -memcpy(rah-ar_sha[2], slirp-vhost_addr, 4); - -/* source IP */ -rah-ar_sip = slirp-vhost_addr.s_addr; - -/* target hw addr (none) */ -memset(rah-ar_tha, 0, ETH_ALEN); - -/* target IP */ -rah-ar_tip = iph-ip_dst.s_addr; -slirp-client_ipaddr = iph-ip_dst; -slirp_output(slirp-opaque, arp_req, sizeof(arp_req)); -ifm-arp_requested = true; - -/* Expire request and drop outgoing packet after 1 second */ -ifm-expiration_date = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + 10ULL; +switch (iph-ip_v) { +case IPVERSION: +if (!arp_table_search(slirp, iph-ip_dst.s_addr, ethaddr)) { +uint8_t arp_req[ETH_HLEN + sizeof(struct arphdr)]; +struct ethhdr *reh = (struct ethhdr *)arp_req; +struct arphdr *rah = (struct arphdr *)(arp_req + ETH_HLEN); + +if (!ifm-resolution_requested) { +/* If the client addr is not known, send an ARP request */ +memset(reh-h_dest, 0xff, ETH_ALEN); +memcpy(reh-h_source, special_ethaddr, ETH_ALEN - 4); +memcpy(reh-h_source[2], slirp-vhost_addr, 4); +reh-h_proto = htons(ETH_P_ARP); +rah-ar_hrd = htons(1); +rah-ar_pro = htons(ETH_P_IP); +rah-ar_hln = ETH_ALEN; +rah-ar_pln = 4; +rah-ar_op = htons(ARPOP_REQUEST); + +/* source hw addr */ +memcpy(rah-ar_sha, special_ethaddr, ETH_ALEN - 4); +memcpy(rah-ar_sha[2], slirp-vhost_addr, 4); + +/* source IP */ +rah-ar_sip = slirp-vhost_addr.s_addr; + +/* target hw addr (none) */ +memset(rah-ar_tha, 0, ETH_ALEN); + +/* target IP */ +
[Qemu-devel] [PATCH 16/16] qapi-schema, qemu-options slirp: Adding Qemu options for IPv6 addresses
This patch adds parameters to manage some new options in the qemu -net command. The Slirp IPv6 address, network prefix, and DNS IPv6 address can be given as arguments to the qemu command. The default values are, respectively, fc00::1, fc00::/64 and fc00::2. Signed-off-by: Yann Bordenave m...@meowstars.org --- net/slirp.c | 56 qapi-schema.json | 37 +++-- qemu-options.hx | 5 +++-- slirp/libslirp.h | 8 +--- slirp/slirp.c| 20 +--- 5 files changed, 88 insertions(+), 38 deletions(-) diff --git a/net/slirp.c b/net/slirp.c index 124e953..68f4aa9 100644 --- a/net/slirp.c +++ b/net/slirp.c @@ -134,17 +134,23 @@ static NetClientInfo net_slirp_info = { static int net_slirp_init(NetClientState *peer, const char *model, const char *name, int restricted, const char *vnetwork, const char *vhost, + const char *vprefix6, const char *vhost6, const char *vhostname, const char *tftp_export, const char *bootfile, const char *vdhcp_start, - const char *vnameserver, const char *smb_export, - const char *vsmbserver, const char **dnssearch) + const char *vnameserver, const char *vnameserver6, + const char *smb_export, const char *vsmbserver, + const char **dnssearch) { -/* default settings according to historic slirp */ +/* default settings according to historic slirp and updated for IPv6 */ struct in_addr net = { .s_addr = htonl(0x0a000200) }; /* 10.0.2.0 */ struct in_addr mask = { .s_addr = htonl(0xff00) }; /* 255.255.255.0 */ struct in_addr host = { .s_addr = htonl(0x0a000202) }; /* 10.0.2.2 */ +struct in6_addr ip6_prefix; +uint8_t ip6_prefix_len = 64; +struct in6_addr ip6_host; struct in_addr dhcp = { .s_addr = htonl(0x0a00020f) }; /* 10.0.2.15 */ struct in_addr dns = { .s_addr = htonl(0x0a000203) }; /* 10.0.2.3 */ +struct in6_addr ip6_dns; #ifndef _WIN32 struct in_addr smbsrv = { .s_addr = 0 }; #endif @@ -156,6 +162,11 @@ static int net_slirp_init(NetClientState *peer, const char *model, char *end; struct slirp_config_str *config; +/* IPv6 defaults initialisations */ +inet_pton(AF_INET6, 
fc00::0, ip6_prefix); +inet_pton(AF_INET6, fc00::1, ip6_host); +inet_pton(AF_INET6, fc00::2, ip6_dns); + if (!tftp_export) { tftp_export = legacy_tftp_prefix; } @@ -228,6 +239,32 @@ static int net_slirp_init(NetClientState *peer, const char *model, return -1; } +if (vprefix6) { +if (get_str_sep(buf, sizeof(buf), vprefix6, '/') 0) { +if (!inet_pton(AF_INET6, vprefix6, ip6_prefix)) { +return -1; +} +} else { +if (!inet_pton(AF_INET6, buf, ip6_prefix)) { +return -1; +} +shift = strtol(vprefix6, end, 10); +if (*end != '\0' || (shift 0 shift 129)) { +ip6_prefix_len = shift; +} else { +return -1; +} +} +} + +if (vhost6 !inet_pton(AF_INET6, vhost6, ip6_host)) { +return -1; +} + +if (vnameserver6 !inet_pton(AF_INET6, vnameserver6, ip6_dns)) { +return -1; +} + #ifndef _WIN32 if (vsmbserver !inet_aton(vsmbserver, smbsrv)) { return -1; @@ -242,8 +279,10 @@ static int net_slirp_init(NetClientState *peer, const char *model, s = DO_UPCAST(SlirpState, nc, nc); -s-slirp = slirp_init(restricted, net, mask, host, vhostname, - tftp_export, bootfile, dhcp, dns, dnssearch, s); +s-slirp = slirp_init(restricted, net, mask, host, + ip6_prefix, ip6_prefix_len, ip6_host, + vhostname, tftp_export, bootfile, dhcp, + dns, ip6_dns, dnssearch, s); QTAILQ_INSERT_TAIL(slirp_stacks, s, entry); for (config = slirp_configs; config; config = config-next) { @@ -750,9 +789,10 @@ int net_init_slirp(const NetClientOptions *opts, const char *name, net_init_slirp_configs(user-guestfwd, 0); ret = net_slirp_init(peer, user, name, user-q_restrict, vnet, - user-host, user-hostname, user-tftp, - user-bootfile, user-dhcpstart, user-dns, user-smb, - user-smbserver, dnssearch); + user-host, user-ip6_prefix, user-ip6_host, + user-hostname, user-tftp, user-bootfile, + user-dhcpstart, user-dns, user-ip6_dns, + user-smb, user-smbserver, dnssearch); while (slirp_configs) { config = slirp_configs; diff --git a/qapi-schema.json
[Qemu-devel] [PATCH 07/16] slirp: Factorizing address translation
This patch factorizes some duplicate code into a new function, sotranslate_out(). This function performs the address translation when a packet is transmitted to the host network. If the packet is destined for the host, the loopback address is used, and if the packet is destined for the virtual DNS, the real DNS address is used. This code is just a copy of the existing code, but factorized and ready to manage the IPv6 case. On the same model, the major part of udp_output() code is moved into a new sotranslate_in(). This function is directly used in sorecvfrom(), like sotranslate_out() in sosendto(). udp_output() becoming useless, it is removed and udp_output2() is renamed to udp_output(). This adds consistency with the udp6_output() function introduced by further patches. Signed-off-by: Guillaume Subiron maet...@subiron.org --- slirp/bootp.c| 2 +- slirp/ip_icmp.c | 19 +++- slirp/socket.c | 93 slirp/socket.h | 3 ++ slirp/tcp_subr.c | 24 +++ slirp/tftp.c | 6 ++-- slirp/udp.c | 27 +--- slirp/udp.h | 3 +- 8 files changed, 91 insertions(+), 86 deletions(-) diff --git a/slirp/bootp.c b/slirp/bootp.c index b7db9fa..03e2e42 100644 --- a/slirp/bootp.c +++ b/slirp/bootp.c @@ -319,7 +319,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp) m-m_len = sizeof(struct bootp_t) - sizeof(struct ip) - sizeof(struct udphdr); -udp_output2(NULL, m, saddr, daddr, IPTOS_LOWDELAY); +udp_output(NULL, m, saddr, daddr, IPTOS_LOWDELAY); } void bootp_input(struct mbuf *m) diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c index 768ea4a..8787aae 100644 --- a/slirp/ip_icmp.c +++ b/slirp/ip_icmp.c @@ -157,7 +157,7 @@ icmp_input(struct mbuf *m, int hlen) goto freeit; } else { struct socket *so; - struct sockaddr_in addr; + struct sockaddr_storage addr; if ((so = socreate(slirp)) == NULL) goto freeit; if (icmp_send(so, m, hlen) == 0) { return; @@ -181,20 +181,9 @@ icmp_input(struct mbuf *m, int hlen) so-so_state = SS_ISFCONNECTED; /* Send the packet */ - addr.sin_family = AF_INET; - if 
((so-so_faddr.s_addr slirp-vnetwork_mask.s_addr) == - slirp-vnetwork_addr.s_addr) { - /* It's an alias */ - if (so-so_faddr.s_addr == slirp-vnameserver_addr.s_addr) { - if (get_dns_addr(addr.sin_addr) 0) - addr.sin_addr = loopback_addr; - } else { - addr.sin_addr = loopback_addr; - } - } else { - addr.sin_addr = so-so_faddr; - } - addr.sin_port = so-so_fport; + addr = so-fhost.ss; + sotranslate_out(so, addr); + if(sendto(so-s, icmp_ping_msg, strlen(icmp_ping_msg), 0, (struct sockaddr *)addr, sizeof(addr)) == -1) { DEBUG_MISC((dfd,icmp_input udp sendto tx errno = %d-%s\n, diff --git a/slirp/socket.c b/slirp/socket.c index 2f166fb..375281c 100644 --- a/slirp/socket.c +++ b/slirp/socket.c @@ -438,6 +438,7 @@ void sorecvfrom(struct socket *so) { struct sockaddr_storage addr; + struct sockaddr_storage saddr, daddr; socklen_t addrlen = sizeof(struct sockaddr_storage); DEBUG_CALL(sorecvfrom); @@ -525,11 +526,17 @@ sorecvfrom(struct socket *so) /* * If this packet was destined for CTL_ADDR, -* make it look like that's where it came from, done by udp_output +* make it look like that's where it came from */ + saddr = addr; + sotranslate_in(so, saddr); + daddr = so-lhost.ss; + switch (so-so_ffamily) { case AF_INET: - udp_output(so, m, (struct sockaddr_in *) addr); + udp_output(so, m, (struct sockaddr_in *) saddr, + (struct sockaddr_in *) daddr, + so-so_iptos); break; default: break; @@ -544,33 +551,20 @@ sorecvfrom(struct socket *so) int sosendto(struct socket *so, struct mbuf *m) { - Slirp *slirp = so-slirp; int ret; - struct sockaddr_in addr; + struct sockaddr_storage addr; DEBUG_CALL(sosendto); DEBUG_ARG(so = %lx, (long)so); DEBUG_ARG(m = %lx, (long)m); -addr.sin_family = AF_INET; - if ((so-so_faddr.s_addr slirp-vnetwork_mask.s_addr) == - slirp-vnetwork_addr.s_addr) { - /* It's an alias */ - if (so-so_faddr.s_addr == slirp-vnameserver_addr.s_addr) { - if (get_dns_addr(addr.sin_addr) 0) - addr.sin_addr = loopback_addr; - } else { - addr.sin_addr = loopback_addr; - } - } 
else - addr.sin_addr = so-so_faddr; - addr.sin_port = so-so_fport; - - DEBUG_MISC((dfd, sendto()ing, addr.sin_port=%d, addr.sin_addr.s_addr=%.16s\n, ntohs(addr.sin_port), inet_ntoa(addr.sin_addr)));
[Qemu-devel] [PATCHv5 02/17] block: add flags to bdrv_*_write_zeroes
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block-migration.c |2 +- block.c | 20 +++- block/backup.c|3 ++- block/qcow2-cluster.c |2 +- block/qcow2.c |2 +- block/qed.c |3 ++- block/raw_bsd.c |5 +++-- block/vmdk.c |3 ++- include/block/block.h |4 ++-- include/block/block_int.h |2 +- qemu-io-cmds.c|2 +- 11 files changed, 27 insertions(+), 21 deletions(-) diff --git a/block-migration.c b/block-migration.c index daf9ec1..713a8e3 100644 --- a/block-migration.c +++ b/block-migration.c @@ -780,7 +780,7 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) } if (flags BLK_MIG_FLAG_ZERO_BLOCK) { -ret = bdrv_write_zeroes(bs, addr, nr_sectors); +ret = bdrv_write_zeroes(bs, addr, nr_sectors, 0); } else { buf = g_malloc(BLOCK_SIZE); qemu_get_buffer(f, buf, BLOCK_SIZE); diff --git a/block.c b/block.c index eb11a07..3259429 100644 --- a/block.c +++ b/block.c @@ -79,7 +79,7 @@ static BlockDriverAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, bool is_write); static void coroutine_fn bdrv_co_do_rw(void *opaque); static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, -int64_t sector_num, int nb_sectors); +int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); static QTAILQ_HEAD(, BlockDriverState) bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states); @@ -2384,10 +2384,11 @@ int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov) return bdrv_rwv_co(bs, sector_num, qiov, true, 0); } -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) { return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE); + BDRV_REQ_ZERO_WRITE | flags); } int bdrv_pread(BlockDriverState *bs, int64_t offset, @@ -2569,7 +2570,7 @@ static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, if (drv-bdrv_co_write_zeroes buffer_is_zero(bounce_buffer, 
iov.iov_len)) { ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors); + cluster_nb_sectors, 0); } else { /* This does not change the data on the disk, it is not necessary * to flush even in cache=writethrough mode. @@ -2703,7 +2704,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, } static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, -int64_t sector_num, int nb_sectors) +int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { BlockDriver *drv = bs-drv; QEMUIOVector qiov; @@ -2715,7 +2716,7 @@ static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, /* First try the efficient write zeroes operation */ if (drv-bdrv_co_write_zeroes) { -ret = drv-bdrv_co_write_zeroes(bs, sector_num, nb_sectors); +ret = drv-bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); if (ret != -ENOTSUP) { return ret; } @@ -2770,7 +2771,7 @@ static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, if (ret 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags BDRV_REQ_ZERO_WRITE) { -ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors); +ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); } else { ret = drv-bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } @@ -2804,12 +2805,13 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, } int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) { trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE); + BDRV_REQ_ZERO_WRITE | flags); } /** diff --git a/block/backup.c b/block/backup.c index cad14c9..830a179 100644 --- a/block/backup.c +++ b/block/backup.c @@ -138,7 +138,8 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = 
bdrv_co_write_zeroes(job-target, - start * BACKUP_SECTORS_PER_CLUSTER, n); +
[Qemu-devel] [PATCHv5 01/17] block: make BdrvRequestFlags public
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block.c |5 - include/block/block.h |5 + 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/block.c b/block.c index fd05a80..eb11a07 100644 --- a/block.c +++ b/block.c @@ -51,11 +51,6 @@ #define NOT_DONE 0x7fff /* used while emulated sync operation in progress */ -typedef enum { -BDRV_REQ_COPY_ON_READ = 0x1, -BDRV_REQ_ZERO_WRITE = 0x2, -} BdrvRequestFlags; - static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load); static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, diff --git a/include/block/block.h b/include/block/block.h index 3560deb..ba2082c 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -62,6 +62,11 @@ typedef struct BlockDevOps { void (*resize_cb)(void *opaque); } BlockDevOps; +typedef enum { +BDRV_REQ_COPY_ON_READ = 0x1, +BDRV_REQ_ZERO_WRITE = 0x2, +} BdrvRequestFlags; + #define BDRV_O_RDWR0x0002 #define BDRV_O_SNAPSHOT0x0008 /* open the file read only and save writes in a snapshot */ #define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */ -- 1.7.9.5
[Qemu-devel] [PATCHv5 03/17] block: introduce BDRV_REQ_MAY_UNMAP request flag
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block-migration.c |3 ++- block.c |4 block/backup.c|2 +- include/block/block.h |7 +++ 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/block-migration.c b/block-migration.c index 713a8e3..fc4ef93 100644 --- a/block-migration.c +++ b/block-migration.c @@ -780,7 +780,8 @@ static int block_load(QEMUFile *f, void *opaque, int version_id) } if (flags BLK_MIG_FLAG_ZERO_BLOCK) { -ret = bdrv_write_zeroes(bs, addr, nr_sectors, 0); +ret = bdrv_write_zeroes(bs, addr, nr_sectors, +BDRV_REQ_MAY_UNMAP); } else { buf = g_malloc(BLOCK_SIZE); qemu_get_buffer(f, buf, BLOCK_SIZE); diff --git a/block.c b/block.c index 3259429..0d97ce6 100644 --- a/block.c +++ b/block.c @@ -2810,6 +2810,10 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, { trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors); +if (!(bs-open_flags BDRV_O_UNMAP)) { +flags = ~BDRV_REQ_MAY_UNMAP; +} + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, BDRV_REQ_ZERO_WRITE | flags); } diff --git a/block/backup.c b/block/backup.c index 830a179..0198514 100644 --- a/block/backup.c +++ b/block/backup.c @@ -139,7 +139,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = bdrv_co_write_zeroes(job-target, start * BACKUP_SECTORS_PER_CLUSTER, - n, 0); + n, BDRV_REQ_MAY_UNMAP); } else { ret = bdrv_co_writev(job-target, start * BACKUP_SECTORS_PER_CLUSTER, n, diff --git a/include/block/block.h b/include/block/block.h index 8ba9f0c..1f30a56 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -65,6 +65,13 @@ typedef struct BlockDevOps { typedef enum { BDRV_REQ_COPY_ON_READ = 0x1, BDRV_REQ_ZERO_WRITE = 0x2, +/* The BDRV_REQ_MAY_UNMAP flag is used to indicate that the block driver + * is allowed to optimize a write zeroes request by unmapping (discarding) + * blocks if it is guaranteed that the result will read back as + * zeroes. 
The flag is only passed to the driver if the block device is + * opened with BDRV_O_UNMAP. + */ +BDRV_REQ_MAY_UNMAP= 0x4, } BdrvRequestFlags; #define BDRV_O_RDWR0x0002 -- 1.7.9.5
[Qemu-devel] [PATCHv5 00/17] block: logical block provisioning enhancements
this patch adds the ability for targets to stay sparse during block migration (if the zero_blocks capability is set) and qemu-img convert even if the target does not have has_zero_init = 1. the series was especially developed for iSCSI, but it should also work with other drivers with little or no adjustments. these adjustments should be limited to providing block provisioning information through get_block_info and/or honouring BDRV_REQ_MAY_UNMAP on writing zeroes. v4-v5: - new patches 4-6 to move the block provisioning information to the BlockDriverInfo. - kept 2 wrappers to read the information from the BDI and renamed them to make more clear what they do: bdrv_has_discard_zeroes -> bdrv_unallocated_blocks_are_zero bdrv_has_discard_write_zeroes -> bdrv_can_write_zeroes_with_unmap - added additional information about the 2 flags in the BDI struct in block.h v3-v4: - changed BlockLimits struct to typedef (Stefan, Eric) - renamed bdrv_zeroize to bdrv_make_zero (Stefan) - added comment about the -S flag of qemu-img convert in qemu-img.texi (Eric) - used struct assignment for bs->bl in raw_open (Stefan, Eric) - dropped 3 get_block_status fixes that are independent of this series and already partly merged. v2-v3: - fix merge conflict in block/qcow2_cluster.c - changed return type of bdrv_has_discard_zeroes and bdrv_has_discard_write_zeroes to bool. - moved alignment and limits info to a BlockLimits struct (Paolo). - added magic constants for default maximum in bdrv_co_do_write_zeroes and bdrv_co_discard (Eric). - bdrv_co_do_write_zeroes: allocating the bounce buffer only once (Eric), fixed bounce iov_len in the fall back path. - bdrv_zeroize: added inline docu (Eric) and do not mask flags passed to bdrv_write_zeroes (Eric). - qemu-img: changed the default hint for -S (min_sparse) in the usage help to 4k. not changing the default as it is unclear why this default was set. size suffixes are already supported (Eric). 
v1-v2: - moved block max_discard and max_write_zeroes to BlockDriverState - added discard_alignment and write_zeroes_alignment to BlockDriverState - added bdrv_has_discard_zeroes() and bdrv_has_discard_write_zeroes() - added logic to bdrv_co_discard and bdrv_co_do_write_zeroes to honour limit and alignment info. - added support for -S 0 in qemu-img convert. Peter Lieven (17): block: make BdrvRequestFlags public block: add flags to bdrv_*_write_zeroes block: introduce BDRV_REQ_MAY_UNMAP request flag block: add logical block provisioning info to BlockDriverInfo block: add wrappers for logical block provisioning information block/iscsi: add .bdrv_get_info block: add BlockLimits structure to BlockDriverState block: honour BlockLimits in bdrv_co_do_write_zeroes block: honour BlockLimits in bdrv_co_discard iscsi: simplify iscsi_co_discard iscsi: set limits in BlockDriverState iscsi: add bdrv_co_write_zeroes block: introduce bdrv_make_zero block/get_block_status: fix BDRV_BLOCK_ZERO for unallocated blocks qemu-img: add support for fully allocated images qemu-img: conditionally zero out target on convert block/raw: copy BlockLimits on raw_open block-migration.c |3 +- block.c | 200 + block/backup.c|3 +- block/iscsi.c | 145 +++- block/qcow2-cluster.c |2 +- block/qcow2.c |2 +- block/qed.c |3 +- block/raw_bsd.c |6 +- block/vmdk.c |3 +- include/block/block.h | 35 +++- include/block/block_int.h | 19 - qemu-img.c| 18 +++- qemu-img.texi |5 ++ qemu-io-cmds.c|2 +- 14 files changed, 358 insertions(+), 88 deletions(-) -- 1.7.9.5
[Qemu-devel] [PATCHv5 06/17] block/iscsi: add .bdrv_get_info
Signed-off-by: Peter Lieven p...@kamp.de --- block/iscsi.c |9 + 1 file changed, 9 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index a2a961e..1dbbcad 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1506,6 +1506,14 @@ out: return ret; } +static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ +IscsiLun *iscsilun = bs-opaque; +bdi-unallocated_blocks_are_zero = !!iscsilun-lbprz; +bdi-can_write_zeroes_with_unmap = iscsilun-lbprz iscsilun-lbp.lbpws; +return 0; +} + static QEMUOptionParameter iscsi_create_options[] = { { .name = BLOCK_OPT_SIZE, @@ -1527,6 +1535,7 @@ static BlockDriver bdrv_iscsi = { .create_options = iscsi_create_options, .bdrv_getlength = iscsi_getlength, +.bdrv_get_info = iscsi_get_info, .bdrv_truncate = iscsi_truncate, #if defined(LIBISCSI_FEATURE_IOVECTOR) -- 1.7.9.5
[Qemu-devel] [PATCHv5 08/17] block: honour BlockLimits in bdrv_co_do_write_zeroes
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block.c | 65 +++ 1 file changed, 49 insertions(+), 16 deletions(-) diff --git a/block.c b/block.c index 0601b02..0c0b0ac 100644 --- a/block.c +++ b/block.c @@ -2703,32 +2703,65 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, BDRV_REQ_COPY_ON_READ); } +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_WRITE_ZEROES_DEFAULT 32768 + static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { BlockDriver *drv = bs-drv; QEMUIOVector qiov; -struct iovec iov; -int ret; +struct iovec iov = {0}; +int ret = 0; -/* TODO Emulate only part of misaligned requests instead of letting block - * drivers return -ENOTSUP and emulate everything */ +int max_write_zeroes = bs-bl.max_write_zeroes ? + bs-bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT; -/* First try the efficient write zeroes operation */ -if (drv-bdrv_co_write_zeroes) { -ret = drv-bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); -if (ret != -ENOTSUP) { -return ret; +while (nb_sectors 0 !ret) { +int num = nb_sectors; + +/* align request */ +if (bs-bl.write_zeroes_alignment +num = bs-bl.write_zeroes_alignment +sector_num % bs-bl.write_zeroes_alignment) { +if (num bs-bl.write_zeroes_alignment) { +num = bs-bl.write_zeroes_alignment; +} +num -= sector_num % bs-bl.write_zeroes_alignment; } -} -/* Fall back to bounce buffer if write zeroes is unsupported */ -iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE; -iov.iov_base = qemu_blockalign(bs, iov.iov_len); -memset(iov.iov_base, 0, iov.iov_len); -qemu_iovec_init_external(qiov, iov, 1); +/* limit request size */ +if (num max_write_zeroes) { +num = max_write_zeroes; +} + +ret = -ENOTSUP; +/* First try the efficient write zeroes operation */ +if (drv-bdrv_co_write_zeroes) { +ret = drv-bdrv_co_write_zeroes(bs, sector_num, num, 
flags); +} + +if (ret == -ENOTSUP) { +/* Fall back to bounce buffer if write zeroes is unsupported */ +iov.iov_len = num * BDRV_SECTOR_SIZE; +if (iov.iov_base == NULL) { +/* allocate bounce buffer only once and ensure that it + * is big enough for this and all future requests. + */ +size_t bufsize = num = nb_sectors ? num : max_write_zeroes; +iov.iov_base = qemu_blockalign(bs, bufsize * BDRV_SECTOR_SIZE); +memset(iov.iov_base, 0, bufsize * BDRV_SECTOR_SIZE); +} +qemu_iovec_init_external(qiov, iov, 1); -ret = drv-bdrv_co_writev(bs, sector_num, nb_sectors, qiov); +ret = drv-bdrv_co_writev(bs, sector_num, num, qiov); +} + +sector_num += num; +nb_sectors -= num; +} qemu_vfree(iov.iov_base); return ret; -- 1.7.9.5
[Qemu-devel] [PATCHv5 04/17] block: add logical block provisioning info to BlockDriverInfo
Signed-off-by: Peter Lieven p...@kamp.de --- include/block/block.h | 16 1 file changed, 16 insertions(+) diff --git a/include/block/block.h b/include/block/block.h index 1f30a56..5fbab01 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -18,6 +18,22 @@ typedef struct BlockDriverInfo { /* offset at which the VM state can be saved (0 if not possible) */ int64_t vm_state_offset; bool is_dirty; +/* + * True if unallocated blocks read back as zeroes. This is equivalent + * to the LBPRZ flag in the SCSI logical block provisioning page. + */ +bool unallocated_blocks_are_zero; +/* + * True if the driver can optimize writing zeroes by unmapping + * sectors. This is equivalent to the BLKDISCARDZEROES ioctl in Linux + * with the difference that in qemu a discard is allowed to silently + * fail. Therefore we have to use bdrv_write_zeroes with the + * BDRV_REQ_MAY_UNMAP flag for an optimized zero write with unmapping. + * After this call the driver has to guarantee that the contents read + * back as zero. It is additionally required that the block device is + * opened with BDRV_O_UNMAP flag for this to work. + */ +bool can_write_zeroes_with_unmap; } BlockDriverInfo; typedef struct BlockFragInfo { -- 1.7.9.5
[Qemu-devel] [PATCHv5 07/17] block: add BlockLimits structure to BlockDriverState
this patch adds BlockLimits which introduces discard and write_zeroes limits and alignment information to the BlockDriverState. Signed-off-by: Peter Lieven p...@kamp.de --- include/block/block_int.h | 17 + 1 file changed, 17 insertions(+) diff --git a/include/block/block_int.h b/include/block/block_int.h index 9bbaa29..33be247 100644 --- a/include/block/block_int.h +++ b/include/block/block_int.h @@ -227,6 +227,20 @@ struct BlockDriver { QLIST_ENTRY(BlockDriver) list; }; +typedef struct BlockLimits { +/* maximum number of sectors that can be discarded at once */ +int max_discard; + +/* optimal alignment for discard requests in sectors */ +int64_t discard_alignment; + +/* maximum number of sectors that can be zeroized at once */ +int max_write_zeroes; + +/* optimal alignment for write zeroes requests in sectors */ +int64_t write_zeroes_alignment; +} BlockLimits; + /* * Note: the function bdrv_append() copies and swaps contents of * BlockDriverStates, so if you add new fields to this struct, please @@ -280,6 +294,9 @@ struct BlockDriverState { uint64_t total_time_ns[BDRV_MAX_IOTYPE]; uint64_t wr_highest_sector; +/* I/O Limits */ +BlockLimits bl; + /* Whether the disk can expand beyond total_sectors */ int growable; -- 1.7.9.5
[Qemu-devel] [PATCHv5 12/17] iscsi: add bdrv_co_write_zeroes
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block/iscsi.c | 59 + 1 file changed, 59 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index c0465aa..1845fc8 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -56,6 +56,7 @@ typedef struct IscsiLun { uint8_t lbprz; struct scsi_inquiry_logical_block_provisioning lbp; struct scsi_inquiry_block_limits bl; +unsigned char *zeroblock; } IscsiLun; typedef struct IscsiTask { @@ -959,6 +960,62 @@ retry: return 0; } + +static int +coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ +IscsiLun *iscsilun = bs-opaque; +struct IscsiTask iTask; +uint64_t lba; +uint32_t nb_blocks; + +if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { +return -EINVAL; +} + +if (!iscsilun-lbp.lbpws) { +/* WRITE SAME is not supported by the target */ +return -ENOTSUP; +} + +lba = sector_qemu2lun(sector_num, iscsilun); +nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + +if (iscsilun-zeroblock == NULL) { +iscsilun-zeroblock = g_malloc0(iscsilun-block_size); +} + +iscsi_co_init_iscsitask(iscsilun, iTask); +retry: +if (iscsi_writesame16_task(iscsilun-iscsi, iscsilun-lun, lba, + iscsilun-zeroblock, iscsilun-block_size, + nb_blocks, 0, !!(flags BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, iTask) == NULL) { +return -EIO; +} + +while (!iTask.complete) { +iscsi_set_events(iscsilun); +qemu_coroutine_yield(); +} + +if (iTask.task != NULL) { +scsi_free_scsi_task(iTask.task); +iTask.task = NULL; +} + +if (iTask.do_retry) { +goto retry; +} + +if (iTask.status != SCSI_STATUS_GOOD) { +return -EIO; +} + +return 0; +} + static int parse_chap(struct iscsi_context *iscsi, const char *target) { QemuOptsList *list; @@ -1421,6 +1478,7 @@ static void iscsi_close(BlockDriverState *bs) } qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL); iscsi_destroy_context(iscsi); +g_free(iscsilun-zeroblock); memset(iscsilun, 0, 
sizeof(IscsiLun)); } @@ -1539,6 +1597,7 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_get_block_status = iscsi_co_get_block_status, #endif .bdrv_co_discard = iscsi_co_discard, +.bdrv_co_write_zeroes = iscsi_co_write_zeroes, .bdrv_aio_readv = iscsi_aio_readv, .bdrv_aio_writev = iscsi_aio_writev, -- 1.7.9.5
[Qemu-devel] [PATCHv5 05/17] block: add wrappers for logical block provisioning information
This adds 2 wrappers to read the unallocated_blocks_are_zero and can_write_zeroes_with_unmap info from the BDI. The wrappers are required to check for the existence of a backing_hd and if the devices are opened with the correct flags. Signed-off-by: Peter Lieven p...@kamp.de --- block.c | 30 ++ include/block/block.h |2 ++ 2 files changed, 32 insertions(+) diff --git a/block.c b/block.c index 0d97ce6..0601b02 100644 --- a/block.c +++ b/block.c @@ -3094,6 +3094,36 @@ int bdrv_has_zero_init(BlockDriverState *bs) return 0; } +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs) +{ +BlockDriverInfo bdi; + +if (bs-backing_hd) { +return false; +} + +if (bdrv_get_info(bs, bdi) == 0) { +return bdi.unallocated_blocks_are_zero; +} + +return false; +} + +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs) +{ +BlockDriverInfo bdi; + +if (bs-backing_hd || !(bs-open_flags BDRV_O_UNMAP)) { +return false; +} + +if (bdrv_get_info(bs, bdi) == 0) { +return bdi.can_write_zeroes_with_unmap; +} + +return false; +} + typedef struct BdrvCoGetBlockStatusData { BlockDriverState *bs; BlockDriverState *base; diff --git a/include/block/block.h b/include/block/block.h index 5fbab01..de0d6ab 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -344,6 +344,8 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int bdrv_has_zero_init_1(BlockDriverState *bs); int bdrv_has_zero_init(BlockDriverState *bs); +bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs); +bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs); int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum); int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, -- 1.7.9.5
[Qemu-devel] [PATCHv5 09/17] block: honour BlockLimits in bdrv_co_discard
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block.c | 37 - 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/block.c b/block.c index 0c0b0ac..b28dd42 100644 --- a/block.c +++ b/block.c @@ -4234,6 +4234,11 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque) rwco-ret = bdrv_co_discard(rwco-bs, rwco-sector_num, rwco-nb_sectors); } +/* if no limit is specified in the BlockLimits use a default + * of 32768 512-byte sectors (16 MiB) per request. + */ +#define MAX_DISCARD_DEFAULT 32768 + int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { @@ -4255,7 +4260,37 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, } if (bs-drv-bdrv_co_discard) { -return bs-drv-bdrv_co_discard(bs, sector_num, nb_sectors); +int max_discard = bs-bl.max_discard ? + bs-bl.max_discard : MAX_DISCARD_DEFAULT; + +while (nb_sectors 0) { +int ret; +int num = nb_sectors; + +/* align request */ +if (bs-bl.discard_alignment +num = bs-bl.discard_alignment +sector_num % bs-bl.discard_alignment) { +if (num bs-bl.discard_alignment) { +num = bs-bl.discard_alignment; +} +num -= sector_num % bs-bl.discard_alignment; +} + +/* limit request size */ +if (num max_discard) { +num = max_discard; +} + +ret = bs-drv-bdrv_co_discard(bs, sector_num, num); +if (ret) { +return ret; +} + +sector_num += num; +nb_sectors -= num; +} +return 0; } else if (bs-drv-bdrv_aio_discard) { BlockDriverAIOCB *acb; CoroutineIOCompletion co = { -- 1.7.9.5
[Qemu-devel] [PATCHv5 16/17] qemu-img: conditionally zero out target on convert
If the target has_zero_init = 0, but supports efficiently writing zeroes by unmapping we call bdrv_make_zero to avoid fully allocating the target. This currently is designed especially for iscsi. Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- qemu-img.c | 10 +- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/qemu-img.c b/qemu-img.c index c6eff15..fe0bdb1 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1353,7 +1353,7 @@ static int img_convert(int argc, char **argv) } } -flags = BDRV_O_RDWR; +flags = min_sparse ? (BDRV_O_RDWR | BDRV_O_UNMAP) : BDRV_O_RDWR; ret = bdrv_parse_cache_flags(cache, flags); if (ret 0) { error_report(Invalid cache option: %s, cache); @@ -1469,6 +1469,14 @@ static int img_convert(int argc, char **argv) } else { int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; +if (!has_zero_init bdrv_can_write_zeroes_with_unmap(out_bs)) { +ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP); +if (ret 0) { +goto out; +} +has_zero_init = 1; +} + sector_num = 0; // total number of sectors converted so far nb_sectors = total_sectors - sector_num; if (nb_sectors != 0) { -- 1.7.9.5
[Qemu-devel] [PATCHv5 17/17] block/raw: copy BlockLimits on raw_open
Signed-off-by: Peter Lieven p...@kamp.de --- block/raw_bsd.c |1 + 1 file changed, 1 insertion(+) diff --git a/block/raw_bsd.c b/block/raw_bsd.c index b0dd23f..49ac18c 100644 --- a/block/raw_bsd.c +++ b/block/raw_bsd.c @@ -150,6 +150,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { bs-sg = bs-file-sg; +bs-bl = bs-file-bl; return 0; } -- 1.7.9.5
[Qemu-devel] [PATCHv5 13/17] block: introduce bdrv_make_zero
this patch adds a call to completely zero out a block device. the operation is sped up by checking the block status and only writing zeroes to the device if they currently do not return zeroes. optionally the zero writing can be sped up by setting the flag BDRV_REQ_MAY_UNMAP to emulate the zero write by unmapping if the driver supports it. Signed-off-by: Peter Lieven p...@kamp.de --- block.c | 37 + include/block/block.h |1 + 2 files changed, 38 insertions(+) diff --git a/block.c b/block.c index b28dd42..21a992a 100644 --- a/block.c +++ b/block.c @@ -2391,6 +2391,43 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, BDRV_REQ_ZERO_WRITE | flags); } +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). 
+ */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ +int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE; +int64_t ret, nb_sectors, sector_num = 0; +int n; + +for (;;) { +nb_sectors = target_size - sector_num; +if (nb_sectors = 0) { +return 0; +} +if (nb_sectors INT_MAX) { +nb_sectors = INT_MAX; +} +ret = bdrv_get_block_status(bs, sector_num, nb_sectors, n); +if (ret BDRV_BLOCK_ZERO) { +sector_num += n; +continue; +} +ret = bdrv_write_zeroes(bs, sector_num, n, flags); +if (ret 0) { +error_report(error writing zeroes at sector % PRId64 : %s, + sector_num, strerror(-ret)); +return ret; +} +sector_num += n; +} +} + int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count1) { diff --git a/include/block/block.h b/include/block/block.h index de0d6ab..8244adb 100644 --- a/include/block/block.h +++ b/include/block/block.h @@ -216,6 +216,7 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags); int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov); int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count); -- 1.7.9.5
[Qemu-devel] [PATCHv5 14/17] block/get_block_status: fix BDRV_BLOCK_ZERO for unallocated blocks
this patch does 2 things: a) only do additional call outs if BDRV_BLOCK_ZERO is not already set. b) use the newly introduced bdrv_unallocated_blocks_are_zero() to return the zero state of an unallocated block. the used callout to bdrv_has_zero_init() is only valid right after bdrv_create. Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block.c |4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/block.c b/block.c index 21a992a..69a2d2b 100644 --- a/block.c +++ b/block.c @@ -3263,8 +3263,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, *pnum, pnum); } -if (!(ret BDRV_BLOCK_DATA)) { -if (bdrv_has_zero_init(bs)) { +if (!(ret BDRV_BLOCK_DATA) !(ret BDRV_BLOCK_ZERO)) { +if (bdrv_unallocated_blocks_are_zero(bs)) { ret |= BDRV_BLOCK_ZERO; } else if (bs-backing_hd) { BlockDriverState *bs2 = bs-backing_hd; -- 1.7.9.5
[Qemu-devel] [PATCHv5 15/17] qemu-img: add support for fully allocated images
Signed-off-by: Peter Lieven p...@kamp.de --- qemu-img.c|8 +--- qemu-img.texi |5 + 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 926f0a0..c6eff15 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -100,8 +100,10 @@ static void help(void) '-h' with or without a command shows this help and lists the supported formats\n '-p' show progress of command (only certain commands)\n '-q' use Quiet mode - do not print any output (except errors)\n - '-S' indicates the consecutive number of bytes that must contain only zeros\n - for qemu-img to create a sparse image during conversion\n + '-S' indicates the consecutive number of bytes (defaults to 4k) that must\n + contain only zeros for qemu-img to create a sparse image during\n + conversion. if the number of bytes is 0 sparse files are disabled and\n + images will always be fully allocated\n '--output' takes the format in which the output must be done (human or json)\n '-n' skips the target volume creation (useful if the volume is created\n prior to running qemu-img)\n @@ -1465,7 +1467,7 @@ static int img_convert(int argc, char **argv) /* signal EOF to align */ bdrv_write_compressed(out_bs, 0, NULL, 0); } else { -int has_zero_init = bdrv_has_zero_init(out_bs); +int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0; sector_num = 0; // total number of sectors converted so far nb_sectors = total_sectors - sector_num; diff --git a/qemu-img.texi b/qemu-img.texi index 768054e..51a1ee5 100644 --- a/qemu-img.texi +++ b/qemu-img.texi @@ -193,6 +193,11 @@ Image conversion is also useful to get smaller image when using a growable format such as @code{qcow} or @code{cow}: the empty sectors are detected and suppressed from the destination image. +@var{sparse_size} indicates the consecutive number of bytes (defaults to 4k) +that must contain only zeros for qemu-img to create a sparse image during +conversion. 
If the number of bytes is 0 sparse files are disabled and +images will always be fully allocated. + You can use the @var{backing_file} option to force the output image to be created as a copy on write image of the specified base image; the @var{backing_file} should have the same content as the input's base image, -- 1.7.9.5
[Qemu-devel] [PATCHv5 10/17] iscsi: simplify iscsi_co_discard
now that bdrv_co_discard can handle limits we do not need the request split logic here anymore. Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block/iscsi.c | 67 + 1 file changed, 25 insertions(+), 42 deletions(-) diff --git a/block/iscsi.c b/block/iscsi.c index 1dbbcad..47b9cc9 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -87,7 +87,6 @@ typedef struct IscsiAIOCB { #define NOP_INTERVAL 5000 #define MAX_NOP_FAILURES 3 #define ISCSI_CMD_RETRIES 5 -#define ISCSI_MAX_UNMAP 131072 static void iscsi_bh_cb(void *p) @@ -912,8 +911,6 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, IscsiLun *iscsilun = bs-opaque; struct IscsiTask iTask; struct unmap_list list; -uint32_t nb_blocks; -uint32_t max_unmap; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -925,52 +922,38 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, } list.lba = sector_qemu2lun(sector_num, iscsilun); -nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); +list.num = sector_qemu2lun(nb_sectors, iscsilun); -max_unmap = iscsilun-bl.max_unmap; -if (max_unmap == 0x) { -max_unmap = ISCSI_MAX_UNMAP; -} - -while (nb_blocks 0) { -iscsi_co_init_iscsitask(iscsilun, iTask); -list.num = nb_blocks; -if (list.num max_unmap) { -list.num = max_unmap; -} +iscsi_co_init_iscsitask(iscsilun, iTask); retry: -if (iscsi_unmap_task(iscsilun-iscsi, iscsilun-lun, 0, 0, list, 1, - iscsi_co_generic_cb, iTask) == NULL) { -return -EIO; -} - -while (!iTask.complete) { -iscsi_set_events(iscsilun); -qemu_coroutine_yield(); -} +if (iscsi_unmap_task(iscsilun-iscsi, iscsilun-lun, 0, 0, list, 1, + iscsi_co_generic_cb, iTask) == NULL) { +return -EIO; +} -if (iTask.task != NULL) { -scsi_free_scsi_task(iTask.task); -iTask.task = NULL; -} +while (!iTask.complete) { +iscsi_set_events(iscsilun); +qemu_coroutine_yield(); +} -if (iTask.do_retry) { -goto retry; -} +if (iTask.task != NULL) { +scsi_free_scsi_task(iTask.task); 
+iTask.task = NULL; +} -if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { -/* the target might fail with a check condition if it - is not happy with the alignment of the UNMAP request - we silently fail in this case */ -return 0; -} +if (iTask.do_retry) { +goto retry; +} -if (iTask.status != SCSI_STATUS_GOOD) { -return -EIO; -} +if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { +/* the target might fail with a check condition if it + is not happy with the alignment of the UNMAP request + we silently fail in this case */ +return 0; +} -list.lba += list.num; -nb_blocks -= list.num; +if (iTask.status != SCSI_STATUS_GOOD) { +return -EIO; } return 0; -- 1.7.9.5
[Qemu-devel] [PATCHv5 11/17] iscsi: set limits in BlockDriverState
Reviewed-by: Eric Blake ebl...@redhat.com Signed-off-by: Peter Lieven p...@kamp.de --- block/iscsi.c | 14 ++ 1 file changed, 14 insertions(+) diff --git a/block/iscsi.c b/block/iscsi.c index 47b9cc9..c0465aa 100644 --- a/block/iscsi.c +++ b/block/iscsi.c @@ -1367,6 +1367,20 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, sizeof(struct scsi_inquiry_block_limits)); scsi_free_scsi_task(task); task = NULL; + +if (iscsilun-bl.max_unmap 0x) { +bs-bl.max_discard = sector_lun2qemu(iscsilun-bl.max_unmap, + iscsilun); +} +bs-bl.discard_alignment = sector_lun2qemu(iscsilun-bl.opt_unmap_gran, + iscsilun); + +if (iscsilun-bl.max_ws_len 0x) { +bs-bl.max_write_zeroes = sector_lun2qemu(iscsilun-bl.max_ws_len, + iscsilun); +} +bs-bl.write_zeroes_alignment = sector_lun2qemu(iscsilun-bl.opt_unmap_gran, +iscsilun); } #if defined(LIBISCSI_FEATURE_NOP_COUNTER) -- 1.7.9.5
Re: [Qemu-devel] [PATCH 54/66] add a header file for atomic operations
On 4 July 2013 16:13, Paolo Bonzini pbonz...@redhat.com wrote: +#ifndef atomic_xchg +#ifdef __ATOMIC_SEQ_CST +#define atomic_xchg(ptr, i)({ \ +typeof(*ptr) _new = (i), _old; \ +__atomic_exchange(ptr, _new, _old, __ATOMIC_SEQ_CST); \ +_old; \ +}) +#elif defined __clang__ +#define atomic_xchg(ptr, i)__sync_exchange(ptr, i) +#else +/* __sync_lock_test_and_set() is documented to be an acquire barrier only. */ +#define atomic_xchg(ptr, i)(smp_mb(), __sync_lock_test_and_set(ptr, i)) +#endif +#endif Hi. I'm afraid this doesn't compile on MacOSX/clang: CCutil/qemu-thread-posix.o util/qemu-thread-posix.c:351:13: error: too many arguments to function call, expected 3, have 4 if (atomic_xchg(ev-value, EV_SET) == EV_BUSY) { ^~~ /Users/pm215/src/qemu/include/qemu/atomic.h:174:42: note: expanded from macro 'atomic_xchg' __atomic_exchange(ptr, _new, _old, __ATOMIC_SEQ_CST); \ ~^ built-in:16:26: note: expanded from macro '__ATOMIC_SEQ_CST' #define __ATOMIC_SEQ_CST 5 ^ 1 error generated. make: *** [util/qemu-thread-posix.o] Error 1 I tried the '#elif defined__clang__' block instead and that doesn't work either: CCutil/qemu-thread-posix.o util/qemu-thread-posix.c:351:13: warning: implicit declaration of function '__sync_exchange' is invalid in C99 [-Wimplicit-function-declaration] if (atomic_xchg(ev-value, EV_SET) == EV_BUSY) { ^ /Users/pm215/src/qemu/include/qemu/atomic.h:179:32: note: expanded from macro 'atomic_xchg' #define atomic_xchg(ptr, i)__sync_exchange(ptr, i) ^ 1 warning generated. LINK qemu-nbd Undefined symbols for architecture x86_64: ___sync_exchange, referenced from: _qemu_event_set in libqemuutil.a(qemu-thread-posix.o) ld: symbol(s) not found for architecture x86_64 clang: error: linker command failed with exit code 1 (use -v to see invocation) make: *** [qemu-nbd] Error 1 It looks like we need to select the '#else' case for MacOSX... any suggestions about how best to do that? thanks -- PMM
[Qemu-devel] [PATCH] configure: Add config.status to recreate the current configuration
The latest configure invocation was saved in config-host.mak and could be extracted from that file to recreate the configuration. Now it is saved in a new file config.status which can be directly executed to recreate the configuration. The file name and the comments were copied from GNU autoconf. Makefile now uses config.status, but also includes transitional code for the old mechanism. Signed-off-by: Stefan Weil s...@weilnetz.de --- This patch still does not escape special characters in the command line. I expect that most users don't use such characters, so they have no problem. It's easy to fix that for hosts with bash: simply replace %s by %q. If required, this can be done in a later patch. Regards, Stefan Weil Makefile |9 - configure | 15 +-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index b15003f..073f18b 100644 --- a/Makefile +++ b/Makefile @@ -28,7 +28,14 @@ CONFIG_ALL=y include $(SRC_PATH)/rules.mak config-host.mak: $(SRC_PATH)/configure @echo $@ is out-of-date, running configure - @sed -n /.*Configured with/s/[^:]*: //p $@ | sh + @# TODO: The next lines include code which supports a smooth + @# transition from old configurations without config.status. + @# This code can be removed after QEMU 1.7. + @if test -x config.status; then \ + ./config.status; \ +else \ + sed -n /.*Configured with/s/[^:]*: //p $@ | sh; \ + fi else config-host.mak: ifneq ($(filter-out %clean,$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail)) diff --git a/configure b/configure index 57ee62a..125d0c5 100755 --- a/configure +++ b/configure @@ -27,6 +27,19 @@ printf '%s' $0 $@ config.log echo config.log echo # config.log +# Save the configure command line for later reuse. +cat EOD config.status +#!/bin/sh +# Generated by configure. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. 
+EOD +printf exec config.status +printf '%s' $0 $@ config.status +echo config.status +chmod +x config.status + error_exit() { echo echo ERROR: $1 @@ -3765,8 +3778,6 @@ config_host_mak=config-host.mak echo # Automatically generated by configure - do not modify config-all-disas.mak echo # Automatically generated by configure - do not modify $config_host_mak -printf # Configured with: $config_host_mak -printf '%s' $0 $@ $config_host_mak echo $config_host_mak echo all: $config_host_mak -- 1.7.10.4
[Qemu-devel] [PATCH] qcow2: Restore total_sectors value in save_vmstate
Since df2a6f29a5, bdrv_co_do_writev increases the total_sectors value of a growable block device on writes after the current end. This leads to the virtual disk apparently growing in qcow2_save_vmstate, which in turn affects the disk size captured by the internal snapshot taken directly afterwards through e.g. the HMP savevm command. Such a grown snapshot cannot be loaded after reopening the qcow2 image, since its disk size differs from the actual virtual disk size (writing a VM state does not actually increase the virtual disk size). Fix this by restoring total_sectors at the end of qcow2_save_vmstate. Signed-off-by: Max Reitz mre...@redhat.com --- block/qcow2.c | 5 + 1 file changed, 5 insertions(+) diff --git a/block/qcow2.c b/block/qcow2.c index c1abaff..5c05bb5 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1939,6 +1939,7 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { BDRVQcowState *s = bs-opaque; +int64_t total_sectors = bs-total_sectors; int growable = bs-growable; int ret; @@ -1946,6 +1947,10 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, bs-growable = 1; ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); bs-growable = growable; +// bdrv_co_do_writev will have increased the total_sectors value to include +// the VM state - the VM state is however not an actual part of the block +// device, therefore, we need to restore the old value. +bs-total_sectors = total_sectors; return ret; } -- 1.8.3.1
[Qemu-devel] [PATCH] qcow2: Unset zero_beyond_eof in save_vmstate
Saving the VM state is done using bdrv_pwrite. This function may perform a read-modify-write, which in this case results in data being read from beyond the end of the virtual disk. Since we are actually trying to access an area which is not a part of the virtual disk, zero_beyond_eof has to be set to false before performing the partial write, otherwise the VM state may become corrupted. Signed-off-by: Max Reitz mre...@redhat.com --- Follow-up to (depends on): - qcow2: Restore total_sectors value in save_vmstate --- block/qcow2.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/qcow2.c b/block/qcow2.c index 5c05bb5..3e11f25 100644 --- a/block/qcow2.c +++ b/block/qcow2.c @@ -1941,12 +1941,15 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, BDRVQcowState *s = bs-opaque; int64_t total_sectors = bs-total_sectors; int growable = bs-growable; +bool zero_beyond_eof = bs-zero_beyond_eof; int ret; BLKDBG_EVENT(bs-file, BLKDBG_VMSTATE_SAVE); bs-growable = 1; +bs-zero_beyond_eof = false; ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); bs-growable = growable; +bs-zero_beyond_eof = zero_beyond_eof; // bdrv_co_do_writev will have increased the total_sectors value to include // the VM state - the VM state is however not an actual part of the block // device, therefore, we need to restore the old value. -- 1.8.3.1
[Qemu-devel] [RFC PATCH v1: 02/12] rdma: remove reference to github.com
From: Michael R. Hines mrhi...@us.ibm.com Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- docs/rdma.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/rdma.txt b/docs/rdma.txt index 2aca63b..6d116e2 100644 --- a/docs/rdma.txt +++ b/docs/rdma.txt @@ -2,7 +2,6 @@ RDMA Live Migration Specification, Version # 1 == Wiki: http://wiki.qemu-project.org/Features/RDMALiveMigration -Github: g...@github.com:hinesmr/qemu.git, 'rdma' branch Copyright (C) 2013 Michael R. Hines mrhi...@us.ibm.com -- 1.8.1.2
[Qemu-devel] [RFC PATCH v1: 00/12] fault tolerance through micro-checkpointing
From: Michael R. Hines mrhi...@us.ibm.com This patch implements RDMA-aware fault tolerance for the VM using Micro-Checkpointing (to be presented at the KVM Forum). The breakout of the patches is not ideal and is really meant to kick things off for review, which will likely extend well past 1.7 and into the 1.8 version of QEMU, assuming about 5-6 months of reviews. Please begin with patch #01 as it provides a good narrative of what is different about this and previous attempts at fault tolerance, including a breakdown of the current empirical performance challenges. Michael R. Hines (12): mc: add documentation for micro-checkpointing rdma: remove reference to github.com migration: introduce parallelization of migration_bitmap mc: introduce a checkpointing status check into the VCPU states migration: support custom page loading rdma: accelerated memcpy() support mc: introduce state machine error handling and migration_bitmap prep mc: modified QMP statistics and migration_thread handoff mc: core logic mc: configure and makefile support mc: register MC qemu-file functions and expose MC tunable capability mc: activate and use MC core logic if requested Makefile.objs |1 + arch_init.c | 276 +- configure | 45 + cpus.c|9 +- docs/mc.txt | 261 ++ docs/rdma.txt |1 - hmp-commands.hx | 14 + hmp.c | 23 + hmp.h |1 + include/migration/migration.h | 69 +- include/migration/qemu-file.h | 55 +- include/qemu-common.h | 12 + migration-checkpoint.c| 1589 migration-rdma.c | 2008 ++--- migration.c | 148 ++- qapi-schema.json | 92 +- qmp-commands.hx | 23 + savevm.c | 84 +- vl.c | 42 + 19 files changed, 4123 insertions(+), 630 deletions(-) create mode 100644 docs/mc.txt create mode 100644 migration-checkpoint.c -- 1.8.1.2
[Qemu-devel] [RFC PATCH v1: 04/12] mc: introduce a checkpointing status check into the VCPU states
From: Michael R. Hines mrhi...@us.ibm.com During micro-checkpointing, the VCPUs get repeatedly paused and resumed. We need to not freak out when the VM begins micro-checkpointing. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- arch_init.c | 2 +- cpus.c| 9 - include/migration/migration.h | 2 ++ qapi-schema.json | 4 +++- vl.c | 6 ++ 5 files changed, 20 insertions(+), 3 deletions(-) diff --git a/arch_init.c b/arch_init.c index 4a71311..b139512 100644 --- a/arch_init.c +++ b/arch_init.c @@ -199,7 +199,7 @@ typedef struct AccountingInfo { static AccountingInfo acct_info; -static void acct_clear(void) +void acct_clear(void) { memset(acct_info, 0, sizeof(acct_info)); } diff --git a/cpus.c b/cpus.c index 398229e..d090c2c 100644 --- a/cpus.c +++ b/cpus.c @@ -530,7 +530,14 @@ static int do_vm_stop(RunState state) pause_all_vcpus(); runstate_set(state); vm_state_notify(0, state); -monitor_protocol_event(QEVENT_STOP, NULL); +/* + * If MC is enabled, libvirt gets confused + * because it thinks the VM is stopped when + * its just being micro-checkpointed. 
+ */ +if(state != RUN_STATE_CHECKPOINT_VM) { +monitor_protocol_event(QEVENT_STOP, NULL); +} } bdrv_drain_all(); diff --git a/include/migration/migration.h b/include/migration/migration.h index 3ffc433..3ad06c5 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -119,6 +119,8 @@ uint64_t xbzrle_mig_bytes_transferred(void); uint64_t xbzrle_mig_pages_transferred(void); uint64_t xbzrle_mig_pages_overflow(void); uint64_t xbzrle_mig_pages_cache_miss(void); +void acct_clear(void); + void *migration_bitmap_worker(void *opaque); void migration_bitmap_worker_start(MigrationState *s); void migration_bitmap_worker_stop(MigrationState *s); diff --git a/qapi-schema.json b/qapi-schema.json index aac0894..8e72bcf 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -169,6 +169,8 @@ # # @save-vm: guest is paused to save the VM state # +# @checkpoint-vm: guest is paused to checkpoint the VM state +# # @shutdown: guest is shut down (and -no-shutdown is in use) # # @suspended: guest is suspended (ACPI S3) @@ -181,7 +183,7 @@ 'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused', 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm', 'running', 'save-vm', 'shutdown', 'suspended', 'watchdog', -'guest-panicked' ] } +'guest-panicked', 'checkpoint-vm' ] } ## # @SnapshotInfo diff --git a/vl.c b/vl.c index e2ba2e8..74d52ab 100644 --- a/vl.c +++ b/vl.c @@ -611,14 +611,18 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING }, { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE }, +{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_CHECKPOINT_VM }, { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING }, +{ RUN_STATE_CHECKPOINT_VM, RUN_STATE_RUNNING }, + { RUN_STATE_RUNNING, RUN_STATE_DEBUG }, { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR }, { RUN_STATE_RUNNING, RUN_STATE_IO_ERROR }, { RUN_STATE_RUNNING, RUN_STATE_PAUSED }, { RUN_STATE_RUNNING, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_RUNNING, 
RUN_STATE_CHECKPOINT_VM }, { RUN_STATE_RUNNING, RUN_STATE_RESTORE_VM }, { RUN_STATE_RUNNING, RUN_STATE_SAVE_VM }, { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN }, @@ -634,9 +638,11 @@ static const RunStateTransition runstate_transitions_def[] = { { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED }, { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING }, { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_SUSPENDED, RUN_STATE_CHECKPOINT_VM }, { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING }, { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE }, +{ RUN_STATE_WATCHDOG, RUN_STATE_CHECKPOINT_VM }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_PAUSED }, { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE }, -- 1.8.1.2
[Qemu-devel] [RFC PATCH v1: 03/12] migration: introduce parallelization of migration_bitmap
From: Michael R. Hines mrhi...@us.ibm.com This patch allows the preparation of the migration_bitmap to be parallelized. For very large VMs, this can take on the order of 10s of milliseconds, which translates as downtime. We count the number of cores first, and then handout chunks of the logdirty bitmap to a thread per core. Each thread scans for dirty bits in parallel. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- arch_init.c | 228 +++--- include/migration/migration.h | 10 ++ include/qemu-common.h | 12 +++ qapi-schema.json | 73 +- vl.c | 33 ++ 5 files changed, 340 insertions(+), 16 deletions(-) diff --git a/arch_init.c b/arch_init.c index 7545d96..4a71311 100644 --- a/arch_init.c +++ b/arch_init.c @@ -189,6 +189,8 @@ typedef struct AccountingInfo { uint64_t skipped_pages; uint64_t norm_pages; uint64_t iterations; +uint64_t log_dirty_time; +uint64_t migration_bitmap_time; uint64_t xbzrle_bytes; uint64_t xbzrle_pages; uint64_t xbzrle_cache_miss; @@ -232,6 +234,16 @@ uint64_t norm_mig_pages_transferred(void) return acct_info.norm_pages; } +uint64_t norm_mig_log_dirty_time(void) +{ +return acct_info.log_dirty_time; +} + +uint64_t norm_mig_bitmap_time(void) +{ +return acct_info.migration_bitmap_time; +} + uint64_t xbzrle_mig_bytes_transferred(void) { return acct_info.xbzrle_bytes; @@ -362,15 +374,189 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(MemoryRegion *mr, static inline bool migration_bitmap_set_dirty(MemoryRegion *mr, ram_addr_t offset) { -bool ret; -int nr = (mr-ram_addr + offset) TARGET_PAGE_BITS; +return test_and_set_bit((mr-ram_addr + offset) TARGET_PAGE_BITS, +migration_bitmap); +} + +typedef struct BitmapWalkerParams { +QemuMutex ready_mutex; +QemuMutex done_mutex; +QemuCond cond; +QemuThread walker; +MigrationState *s; +int core_id; +int keep_running; +ram_addr_t start; +ram_addr_t stop; +void *block; +uint64_t dirty_pages; +} BitmapWalkerParams; -ret = test_and_set_bit(nr, migration_bitmap); +static int nb_bitmap_workers = 0; -if (!ret) { 
-migration_dirty_pages++; +BitmapWalkerParams *bitmap_walkers = NULL; + +/* + * Bitmap workers: This is a temporary performance-driven + * workaround for the slowness (10s of milliseconds) incurred + * during calls to migration_bitmap_sync(). + * + * Ideally, migration_bitmap_sync() should be able to use the + * GET_LOG_DIRTY bitmap from KVM directly, but it does not right + * now because the bitmap is not retrieved as a single memory + * allocation which requires a couple of transformations into + * a 'unified' bitmap before the migration code can make good use + * of it. + * + * Bitmap workers perform this transformation in parallel + * in a multi-threaded fashion until a patch is ready to process + * the bitmaps from GET_LOG_DIRTY directly. + */ +static uint64_t migration_bitmap_sync_range(RAMBlock *block, +ram_addr_t start, ram_addr_t stop) +{ +ram_addr_t addr; +uint64_t dirty_pages = 0; + + +for (addr = start; addr stop; addr += TARGET_PAGE_SIZE) { +if (memory_region_test_and_clear_dirty(block-mr, + addr, TARGET_PAGE_SIZE, + DIRTY_MEMORY_MIGRATION)) { +if (!migration_bitmap_set_dirty(block-mr, addr)) { +dirty_pages++; +} +} +} + +return dirty_pages; +} + +/* + * The worker sleeps until it gets some work to transform a + * chunk of bitmap from KVM to the migration_bitmap. + */ +void *migration_bitmap_worker(void *opaque) +{ +BitmapWalkerParams * bwp = opaque; + +do { +qemu_mutex_lock(bwp-ready_mutex); +qemu_mutex_lock(bwp-done_mutex); +qemu_mutex_unlock(bwp-ready_mutex); +qemu_cond_signal(bwp-cond); + +if(!bwp-keep_running) { +break; +} + +bwp-dirty_pages = migration_bitmap_sync_range(bwp-block, bwp-start, bwp-stop); + +qemu_cond_wait(bwp-cond, bwp-done_mutex); +qemu_mutex_unlock(bwp-done_mutex); +} while(bwp-keep_running); + +return NULL; +} + +void migration_bitmap_worker_start(MigrationState *s) +{ +int core; + +/* + * CPUs N - 1 are reserved for N - 1 worker threads + * processing the pc.ram bytemap = migration_bitmap. 
+ * The migration thread goes on the last CPU, + * which process the remaining, smaller RAMblocks. + */ +nb_bitmap_workers = getNumCores() - 1; + +bitmap_walkers = g_malloc0(sizeof(struct BitmapWalkerParams) * +nb_bitmap_workers); + +memset(bitmap_walkers, 0, sizeof(BitmapWalkerParams)
[Qemu-devel] [RFC PATCH v1: 01/12] mc: add documentation for micro-checkpointing
From: Michael R. Hines mrhi...@us.ibm.com Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- docs/mc.txt | 261 1 file changed, 261 insertions(+) create mode 100644 docs/mc.txt diff --git a/docs/mc.txt b/docs/mc.txt new file mode 100644 index 000..90888f7 --- /dev/null +++ b/docs/mc.txt @@ -0,0 +1,261 @@ +Micro Checkpointing Specification +== +Wiki: http://wiki.qemu.org/Features/MicroCheckpointing +Github: g...@github.com:hinesmr/qemu.git, 'mc' branch + +Copyright (C) 2014 Michael R. Hines mrhi...@us.ibm.com + +Contents: += +* Introduction +* The Micro-Checkpointing Process +* RDMA Integration +* Failure Recovery +* Before running +* Running +* Performance +* TODO + +INTRODUCTION: += + +Micro-Checkpointing (MC) is one method for providing Fault Tolerance to a +running virtual machine (VM) with neither runtime assistance from the guest +kernel nor from the guest application software. Furthermore, Fault Tolerance +is one method of providing high availability to a VM such that, from the +perspective of the outside world (clients, devices, and neighboring VMs that +may be paired with it), the VM and its applications have not lost any runtime +state in the event of either a failure of the hypervisor/hardware to allow the +VM to make forward progress or a complete loss of power. This mechanism for +providing fault tolerance does *not* provide any protection whatsoever against +software-level faults in the guest kernel or applications. In fact, due to +the potentially extended lifetime of the VM because of this type of high +availability, such software-level bugs may in fact manifest themselves +*more often* than they ordinarily would, in which case you would need to +employ other forms of availability to guard against such software-level faults. + +This implementation is also fully compatible with RDMA. (See docs/rdma.txt +for more details). 
+ +THE MICRO-CHECKPOINTING PROCESS: + + +Micro-Checkpointing works against the existing live migration path in QEMU, +and can effectively be understood as a live migration that never ends. +As such, iteration rounds happen at the granularity of 10s of milliseconds +and perform the following steps: + +1. After N milliseconds, stop the VM. +2. Generate a MC by invoking the live migration software path + to identify and copy dirty memory into a local staging area inside QEMU. +3. Resume the VM immediately so that it can make forward progress. +4. Transmit the checkpoint to the destination. +5. Repeat + +Upon failure, load the contents of the last MC at the destination back +into memory and run the VM normally. + +Additionally, a MC must include a consistent view of device I/O, +particularly the network, a problem commonly referred to as output commit. +This means that the outside world cannot be allowed to experience duplicate +state that was committed by the virtual machine after failure. This is +possible because a checkpoint may diverge by N milliseconds of time and +commit state while the current checkpoint is being transmitted to the +destination. + +To guard against this problem, first, we must buffer the TX output of the +network (not the input) between MCs until the current MC is safely received +by the destination. For example, all outbound network packets must be held +at the source until the MC is transmitted. After transmission is complete, +those packets can be released. Similarly, in the case of disk I/O, we must +ensure that either the contents of the local disk are safely mirrored to a +remote disk before completing a MC or that the output to a shared disk, +such as iSCSI, is also buffered between checkpoints and then later released +in the same way. + +This implementation *currently* only supports buffering for the network. +This requires that the VM's root disk or any non-ephemeral disks also be +made network-accessible directly from within the VM. 
Until the aforementioned +buffering or mirroring support is available (ideally through drive-mirror), +the only consistent way to provide full fault tolerance of the VM's +non-ephemeral disks is to construct a VM whose root disk is made to boot +directly from iSCSI or NFS or similar such that all disk I/O is translated +into network I/O. + +RDMA INTEGRATION: += + +RDMA is instrumental in enabling better MC performance, which is the reason +why it was introduced into QEMU first. + +1. Checkpoint generation (RDMA-based memcpy): +2. Checkpoint transmission (for performance and less CPU impact) + +Checkpoint generation (step 2 in the previous section) must be done while +the VM is paused. In the worst case, the size of the checkpoint can be +equal in size to the amount of memory in total use by the VM. In order +to resume VM execution as fast as possible, the checkpoint is copied +consistently locally into a staging area
[Qemu-devel] [RFC PATCH v1: 10/12] mc: configure and makefile support
From: Michael R. Hines mrhi...@us.ibm.com Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- Makefile.objs | 1 + configure | 45 + 2 files changed, 46 insertions(+) diff --git a/Makefile.objs b/Makefile.objs index 2b6c1fe..15356d6 100644 --- a/Makefile.objs +++ b/Makefile.objs @@ -52,6 +52,7 @@ common-obj-$(CONFIG_LINUX) += fsdev/ common-obj-y += migration.o migration-tcp.o common-obj-$(CONFIG_RDMA) += migration-rdma.o +common-obj-$(CONFIG_MC) += migration-checkpoint.o common-obj-y += qemu-char.o #aio.o common-obj-y += block-migration.o common-obj-y += page_cache.o xbzrle.o diff --git a/configure b/configure index 57ee62a..64c0d5e 100755 --- a/configure +++ b/configure @@ -182,6 +182,7 @@ kvm=no rdma= gprof=no debug_tcg=no +mc= debug=no strip_opt=yes tcg_interpreter=no @@ -969,6 +970,10 @@ for opt do ;; --enable-libssh2) libssh2=yes ;; + --disable-mc) mc=no + ;; + --enable-mc) mc=yes + ;; *) echo ERROR: unknown option $opt; show_help=yes ;; esac @@ -1200,6 +1205,8 @@ echo --gcov=GCOV use specified gcov [$gcov_tool] echo --enable-tpm enable TPM support echo --disable-libssh2disable ssh block device support echo --enable-libssh2 enable ssh block device support +echo --disable-mc disable Micro-Checkpointing support +echo --enable-mc enable Micro-Checkpointing support echo echo NOTE: The object files are built at the place where configure is launched exit 1 @@ -1861,6 +1868,35 @@ EOF fi fi +## +# Micro-Checkpointing requires netlink +if test $mc != no ; then + cat $TMPC EOF +#include libnl3/netlink/route/qdisc/plug.h +#include libnl3/netlink/route/class.h +#include libnl3/netlink/cli/utils.h +#include libnl3/netlink/cli/tc.h +#include libnl3/netlink/cli/qdisc.h +#include libnl3/netlink/cli/link.h +int main(void) { return 0; } +EOF + mc_libs=-lnl-3 -lnl-cli-3 -lnl-route-3 + mc_cflags=-I/usr/include/libnl3 + if compile_prog $mc_cflags $mc_libs ; then +mc=yes +libs_softmmu=$libs_softmmu $mc_libs +QEMU_CFLAGS=$QEMU_CFLAGS $mc_cflags + else +if test $mc = yes ; then 
+error_exit \ + NetLink v3 libs/headers not present. \ + Please install the libnl3-*-dev(el) packages from your distro. +fi +mc=no + fi +fi + + ## # VNC TLS/WS detection if test $vnc = yes -a \( $vnc_tls != no -o $vnc_ws != no \) ; then @@ -3723,6 +3759,7 @@ echo KVM support $kvm echo RDMA support $rdma echo TCG interpreter $tcg_interpreter echo fdt support $fdt +echo Micro checkpointing $mc echo preadv support$preadv echo fdatasync $fdatasync echo madvise $madvise @@ -4206,6 +4243,10 @@ if test $rdma = yes ; then echo CONFIG_RDMA=y $config_host_mak fi +if test $mc = yes ; then + echo CONFIG_MC=y $config_host_mak +fi + if test $tcg_interpreter = yes; then QEMU_INCLUDES=-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES elif test $ARCH = sparc64 ; then @@ -4633,6 +4674,10 @@ echo QEMU_CFLAGS+=$cflags $config_target_mak done # for target in $targets +if test $mc = yes ; then +echo CONFIG_MC=y $config_host_mak +fi + if [ $pixman = internal ]; then echo config-host.h: subdir-pixman $config_host_mak fi -- 1.8.1.2
[Qemu-devel] [RFC PATCH v1: 07/12] mc: introduce state machine error handling and migration_bitmap prep
From: Michael R. Hines mrhi...@us.ibm.com Since MC will repeatedly call the pre-existing live migration call path over and over again (forever), the migration_bitmap initialization only needs to happen once and the destruction of the bitmap needs to be avoided in successive checkpoints. Also, there is some additional state machine error handling to prepare for before introducing the MC core logic. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- arch_init.c | 29 - include/migration/migration.h | 19 +++ include/migration/qemu-file.h | 1 + migration.c | 33 +++-- 4 files changed, 63 insertions(+), 19 deletions(-) diff --git a/arch_init.c b/arch_init.c index 9cf7d18..d47b38b 100644 --- a/arch_init.c +++ b/arch_init.c @@ -795,13 +795,13 @@ static void ram_migration_cancel(void *opaque) migration_end(); } -static void reset_ram_globals(void) +static void reset_ram_globals(bool reset_bulk_stage) { last_seen_block = NULL; last_sent_block = NULL; last_offset = 0; last_version = ram_list.version; -ram_bulk_stage = true; +ram_bulk_stage = reset_bulk_stage; } #define MAX_WAIT 50 /* ms, half buffered_file limit */ @@ -811,6 +811,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque) RAMBlock *block; int64_t ram_pages = last_ram_offset() TARGET_PAGE_BITS; +/* + * RAM stays open during micro-checkpointing for the next transaction. 
+ */ +if (migration_is_mc(migrate_get_current())) { +qemu_mutex_lock_ramlist(); +reset_ram_globals(false); +goto skip_setup; +} + migration_bitmap = bitmap_new(ram_pages); bitmap_set(migration_bitmap, 0, ram_pages); migration_dirty_pages = ram_pages; @@ -833,12 +842,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque) qemu_mutex_lock_iothread(); qemu_mutex_lock_ramlist(); bytes_transferred = 0; -reset_ram_globals(); +reset_ram_globals(true); memory_global_dirty_log_start(); migration_bitmap_sync(); qemu_mutex_unlock_iothread(); +skip_setup: + qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); QTAILQ_FOREACH(block, ram_list.blocks, next) { @@ -867,7 +878,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque) qemu_mutex_lock_ramlist(); if (ram_list.version != last_version) { -reset_ram_globals(); +reset_ram_globals(true); } ram_control_before_iterate(f, RAM_CONTROL_ROUND); @@ -948,7 +959,15 @@ static int ram_save_complete(QEMUFile *f, void *opaque) } ram_control_after_iterate(f, RAM_CONTROL_FINISH); -migration_end(); + +/* + * Only cleanup at the end of normal migrations + * or if the MC destination failed and we got an error. + * Otherwise, we are (or will be soon) in MIG_STATE_MC. 
+ */ +if(!migrate_use_mc() || migration_has_failed(migrate_get_current())) { +migration_end(); +} qemu_mutex_unlock_ramlist(); qemu_put_be64(f, RAM_SAVE_FLAG_EOS); diff --git a/include/migration/migration.h b/include/migration/migration.h index 0e7f121..fcf7684 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -95,6 +95,8 @@ int migrate_fd_close(MigrationState *s); void add_migration_state_change_notifier(Notifier *notify); void remove_migration_state_change_notifier(Notifier *notify); bool migration_in_setup(MigrationState *); +bool migration_is_active(MigrationState *); +bool migration_is_mc(MigrationState *s); bool migration_has_finished(MigrationState *); bool migration_has_failed(MigrationState *); MigrationState *migrate_get_current(void); @@ -126,6 +128,15 @@ void migration_bitmap_worker_start(MigrationState *s); void migration_bitmap_worker_stop(MigrationState *s); void migrate_set_state(MigrationState *s, int old_state, int new_state); +enum { +MIG_STATE_ERROR = -1, +MIG_STATE_NONE, +MIG_STATE_SETUP, +MIG_STATE_CANCELLED, +MIG_STATE_ACTIVE, +MIG_STATE_MC, +MIG_STATE_COMPLETED, +}; void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); /** @@ -194,4 +205,12 @@ int ram_control_copy_page(QEMUFile *f, ram_addr_t block_offset_source, ram_addr_t offset_source, long size); + +int migrate_use_mc(void); +int migrate_use_mc_rdma_copy(void); + +#define MC_VERSION 1 + +void qemu_rdma_info_save(QEMUFile *f, void *opaque); +int qemu_rdma_info_load(QEMUFile *f, void *opaque, int version_id); #endif diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index d67e97a..b547de9 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -71,6 +71,7 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); #define RAM_CONTROL_ROUND1 #define RAM_CONTROL_HOOK 2 #define
[Qemu-devel] [RFC PATCH v1: 08/12] mc: modified QMP statistics and migration_thread handoff
From: Michael R. Hines mrhi...@us.ibm.com In addition to better handling of new QMP statistics associated with the migration_bitmap and MC performance, we need to transfer control from the migration thread to the MC thread more cleanly, which means dynamically allocating the threads and doing the handoff after the initial live migration has completed. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- hmp.c | 17 include/migration/migration.h | 14 ++- migration.c | 94 +++ qapi-schema.json | 2 + savevm.c | 5 +-- 5 files changed, 93 insertions(+), 39 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..43896e9 100644 --- a/hmp.c +++ b/hmp.c @@ -202,6 +202,23 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict) info-disk-total 10); } +if (info-has_mc) { +monitor_printf(mon, checkpoints: % PRIu64 \n, + info-mc-checkpoints); +monitor_printf(mon, xmit_time: % PRIu64 ms\n, + info-mc-xmit_time); +monitor_printf(mon, log_dirty_time: % PRIu64 ms\n, + info-mc-log_dirty_time); +monitor_printf(mon, migration_bitmap_time: % PRIu64 ms\n, + info-mc-migration_bitmap_time); +monitor_printf(mon, ram_copy_time: % PRIu64 ms\n, + info-mc-ram_copy_time); +monitor_printf(mon, copy_mbps: %0.2f mbps\n, + info-mc-copy_mbps); +monitor_printf(mon, throughput: %0.2f mbps\n, + info-mc-mbps); +} + if (info-has_xbzrle_cache) { monitor_printf(mon, cache size: % PRIu64 bytes\n, info-xbzrle_cache-cache_size); diff --git a/include/migration/migration.h b/include/migration/migration.h index fcf7684..a1ab06c 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -35,13 +35,14 @@ struct MigrationState int64_t bandwidth_limit; size_t bytes_xfer; size_t xfer_limit; -QemuThread thread; +QemuThread *thread; QEMUBH *cleanup_bh; QEMUFile *file; int state; MigrationParams params; double mbps; +double copy_mbps; int64_t total_time; int64_t downtime; int64_t expected_downtime; @@ -54,6 +55,7 @@ struct MigrationState bool enabled_capabilities[MIGRATION_CAPABILITY_MAX]; int64_t 
xbzrle_cache_size; int64_t setup_time; +int64_t checkpoints; }; void process_incoming_migration(QEMUFile *f); @@ -137,6 +139,12 @@ enum { MIG_STATE_MC, MIG_STATE_COMPLETED, }; + +int mc_enable_buffering(void); +int mc_start_buffer(void); +void mc_init_checkpointer(MigrationState *s); +void mc_process_incoming_checkpoints_if_requested(QEMUFile *f); + void ram_handle_compressed(void *host, uint8_t ch, uint64_t size); /** @@ -207,10 +215,14 @@ int ram_control_copy_page(QEMUFile *f, long size); int migrate_use_mc(void); +int migrate_use_mc_net(void); int migrate_use_mc_rdma_copy(void); #define MC_VERSION 1 +int mc_info_load(QEMUFile *f, void *opaque, int version_id); +void mc_info_save(QEMUFile *f, void *opaque); + void qemu_rdma_info_save(QEMUFile *f, void *opaque); int qemu_rdma_info_load(QEMUFile *f, void *opaque, int version_id); #endif diff --git a/migration.c b/migration.c index 62dded3..8e0827e 100644 --- a/migration.c +++ b/migration.c @@ -172,6 +172,31 @@ static void get_xbzrle_cache_stats(MigrationInfo *info) } } +static void get_ram_stats(MigrationState *s, MigrationInfo *info) +{ +info-has_total_time = true; +info-total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +- s-total_time; + +info-has_ram = true; +info-ram = g_malloc0(sizeof(*info-ram)); +info-ram-transferred = ram_bytes_transferred(); +info-ram-total = ram_bytes_total(); +info-ram-duplicate = dup_mig_pages_transferred(); +info-ram-skipped = skipped_mig_pages_transferred(); +info-ram-normal = norm_mig_pages_transferred(); +info-ram-normal_bytes = norm_mig_bytes_transferred(); +info-ram-mbps = s-mbps; + +if (blk_mig_active()) { +info-has_disk = true; +info-disk = g_malloc0(sizeof(*info-disk)); +info-disk-transferred = blk_mig_bytes_transferred(); +info-disk-remaining = blk_mig_bytes_remaining(); +info-disk-total = blk_mig_bytes_total(); +} +} + MigrationInfo *qmp_query_migrate(Error **errp) { MigrationInfo *info = g_malloc0(sizeof(*info)); @@ -197,26 +222,8 @@ MigrationInfo *qmp_query_migrate(Error 
**errp) info-has_setup_time = true; info-setup_time = s-setup_time; -info-has_ram = true; -info-ram = g_malloc0(sizeof(*info-ram)); -info-ram-transferred = ram_bytes_transferred(); -info-ram-remaining = ram_bytes_remaining(); -info-ram-total
[Qemu-devel] [RFC PATCH v1: 09/12] mc: core logic
From: Michael R. Hines mrhi...@us.ibm.com This implements the core logic, all described in docs/mc.txt Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- migration-checkpoint.c | 1589 1 file changed, 1589 insertions(+) create mode 100644 migration-checkpoint.c diff --git a/migration-checkpoint.c b/migration-checkpoint.c new file mode 100644 index 000..14b03e8 --- /dev/null +++ b/migration-checkpoint.c @@ -0,0 +1,1589 @@ +/* + * Copyright (C) 2014 Michael R. Hines mrhi...@us.ibm.com + * + * Micro-Checkpointing (MC) support + * (a.k.a. Fault Tolerance or Continuous Replication) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; under version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see http://www.gnu.org/licenses/. + */ +#include libnl3/netlink/route/qdisc/plug.h +#include libnl3/netlink/route/class.h +#include libnl3/netlink/cli/utils.h +#include libnl3/netlink/cli/tc.h +#include libnl3/netlink/cli/qdisc.h +#include libnl3/netlink/cli/link.h +#include qemu-common.h +#include hw/virtio/virtio.h +#include hw/virtio/virtio-net.h +#include qemu/sockets.h +#include migration/migration.h +#include migration/qemu-file.h +#include qmp-commands.h +#include net/tap-linux.h +#include sys/ioctl.h + +#define DEBUG_MC +//#define DEBUG_MC_VERBOSE +//#define DEBUG_MC_REALLY_VERBOSE + +#ifdef DEBUG_MC +#define DPRINTF(fmt, ...) \ +do { printf(mc: fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) \ +do { } while (0) +#endif + +#ifdef DEBUG_MC_VERBOSE +#define DDPRINTF(fmt, ...) 
\ +do { printf(mc: fmt, ## __VA_ARGS__); } while (0) +#else +#define DDPRINTF(fmt, ...) \ +do { } while (0) +#endif + +#ifdef DEBUG_MC_REALLY_VERBOSE +#define DDDPRINTF(fmt, ...) \ +do { printf(mc: fmt, ## __VA_ARGS__); } while (0) +#else +#define DDDPRINTF(fmt, ...) \ +do { } while (0) +#endif + +#define MBPS(bytes, time) time ? double) bytes * 8) \ +/ ((double) time / 1000.0)) / 1000.0 / 1000.0) : -1.0 + +/* + * Micro checkpoints (MC)s are typically only a few MB when idle. + * However, they can easily be very large during heavy workloads. + * In the *extreme* worst-case, QEMU might need double the amount of main memory + * than that of what was originally allocated to the virtual machine. + * + * To support this variability during transient periods, a MC + * consists of a linked list of slabs, each of identical size. A better name + * would be welcome, as the name was only chosen because it resembles linux + * memory allocation. Because MCs occur several times per second + * (a frequency of 10s of milliseconds), slabs allow MCs to grow and shrink + * without constantly re-allocating all memory in place during each checkpoint. + * + * During steady-state, the 'head' slab is permanently allocated and never goes + * away, so when the VM is idle, there is no memory allocation at all. + * This design supports the use of RDMA. Since RDMA requires memory pinning, we + * must be able to hold on to a slab for a reasonable amount of time to get any + * real use out of it. + * + * Regardless, the current strategy taken is: + * + * 1. If the checkpoint size increases, + *then grow the number of slabs to support it. + * 2. If the next checkpoint size is smaller than the last one, + then that's a strike. + * 3. After N strikes, cut the size of the slab cache in half + *(to a minimum of 1 slab as described before). + * + * As of this writing, a typical average size of + * an Idle-VM checkpoint is under 5MB. 
+ */ + +#define MC_SLAB_BUFFER_SIZE (5UL * 1024UL * 1024UL) /* empirical */ +#define MC_DEV_NAME_MAX_SIZE256 + +#define MC_DEFAULT_CHECKPOINT_FREQ_MS 100 /* too slow, but best for now */ +#define CALC_MAX_STRIKES() \ +do { max_strikes = (max_strikes_delay_secs * 1000) / freq_ms; } \ +while (0) + +/* + * How many seconds-worth of checkpoints to wait before re-evaluating the size + * of the slab cache? + * + * #strikes_until_shrink_cache = Function(#checkpoints/sec) + * + * Increasing the number of seconds, increases the number of strikes needed to + * be reached until it is time to cut the cache in half. + * + * Below value is open for debate - we just want it to be small enough to ensure + * that a large, idle cache doesn't stay too large for too long. + */ +#define MC_DEFAULT_SLAB_MAX_CHECK_DELAY_SECS 10 + +/* + * MC serializes the actual RAM page contents in such a way that
[Qemu-devel] [RFC PATCH v1: 12/12] mc: activate and use MC core logic if requested
From: Michael R. Hines mrhi...@us.ibm.com Building on the previous patches, this finally actually activates protection of the VM by kicking off an MC thread after the initial live migration completes. The live migration thread will get destroyed and the MC thread will run and never die. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- migration.c | 21 - 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/migration.c b/migration.c index 8e0827e..15ad264 100644 --- a/migration.c +++ b/migration.c @@ -94,6 +94,9 @@ static void process_incoming_migration_co(void *opaque) int ret; ret = qemu_loadvm_state(f); +if (ret = 0) { +mc_process_incoming_checkpoints_if_requested(f); +} qemu_fclose(f); if (ret 0) { fprintf(stderr, load of migration failed\n); @@ -670,11 +673,27 @@ static void *migration_thread(void *opaque) s-downtime = end_time - start_time; runstate_set(RUN_STATE_POSTMIGRATE); } else { +if(migrate_use_mc()) { +qemu_fflush(s-file); +if (migrate_use_mc_net()) { +if (mc_enable_buffering() 0 || +mc_start_buffer() 0) { +migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR); +} +} +} + if (old_vm_running) { vm_start(); } } -qemu_bh_schedule(s-cleanup_bh); + +if (migrate_use_mc() s-state != MIG_STATE_ERROR) { +mc_init_checkpointer(s); +} else { +qemu_bh_schedule(s-cleanup_bh); +} + qemu_mutex_unlock_iothread(); return NULL; -- 1.8.1.2
[Qemu-devel] [RFC PATCH v1: 11/12] mc: register MC qemu-file functions and expose MC tunable capability
From: Michael R. Hines mrhi...@us.ibm.com The capability allows management software to throttle the MC frequency during VM application transience. The qemu-file savevm() functions inform the destination that the incoming traffic is MC-specific traffic and not vanilla live-migration traffic. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- hmp-commands.hx | 14 ++ hmp.c| 6 ++ hmp.h| 1 + qapi-schema.json | 13 + qmp-commands.hx | 23 +++ vl.c | 3 +++ 6 files changed, 60 insertions(+) diff --git a/hmp-commands.hx b/hmp-commands.hx index caae5ad..7db0597 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -960,6 +960,20 @@ Set maximum tolerated downtime (in seconds) for migration. ETEXI { +.name = migrate-set-mc-delay, +.args_type = value:i, +.params = value, +.help = set maximum delay (in milliseconds) between micro-checkpoints, +.mhandler.cmd = hmp_migrate_set_mc_delay, +}, + +STEXI +@item migrate-set-mc-delay @var{millisecond} +@findex migrate-set-mc-delay +Set maximum delay (in milliseconds) between micro-checkpoints.
+ETEXI + +{ .name = migrate_set_capability, .args_type = capability:s,state:b, .params = capability state, diff --git a/hmp.c b/hmp.c index 43896e9..8e89ac7 100644 --- a/hmp.c +++ b/hmp.c @@ -1026,6 +1026,12 @@ void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict) qmp_migrate_set_downtime(value, NULL); } +void hmp_migrate_set_mc_delay(Monitor *mon, const QDict *qdict) +{ +int64_t value = qdict_get_int(qdict, value); +qmp_migrate_set_mc_delay(value, NULL); +} + void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict) { int64_t value = qdict_get_int(qdict, value); diff --git a/hmp.h b/hmp.h index 54cf71f..b6548a3 100644 --- a/hmp.h +++ b/hmp.h @@ -60,6 +60,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict); void hmp_drive_backup(Monitor *mon, const QDict *qdict); void hmp_migrate_cancel(Monitor *mon, const QDict *qdict); void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict); +void hmp_migrate_set_mc_delay(Monitor *mon, const QDict *qdict); void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict); void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict); void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict); diff --git a/qapi-schema.json b/qapi-schema.json index e0a430c..2ed8098 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -2135,6 +2135,19 @@ { 'command': 'migrate_set_downtime', 'data': {'value': 'number'} } ## +# @migrate-set-mc-delay +# +# Set delay (in milliseconds) between micro checkpoints. +# +# @value: maximum delay in milliseconds +# +# Returns: nothing on success +# +# Since: 1.6 +## +{ 'command': 'migrate-set-mc-delay', 'data': {'value': 'int'} } + +## # @migrate_set_speed # # Set maximum speed for migration. 
diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..6d7ef2f 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -754,6 +754,29 @@ Example: EQMP { +.name = migrate-set-mc-delay, +.args_type = value:i, +.mhandler.cmd_new = qmp_marshal_input_migrate_set_mc_delay, +}, + +SQMP +migrate-set-mc-delay + + +Set maximum delay (in milliseconds) between micro-checkpoints. + +Arguments: + +- value: maximum delay (json-int) + +Example: + +- { execute: migrate-set-mc-delay, arguments: { value: 100 } } +- { return: {} } + +EQMP + +{ .name = client_migrate_info, .args_type = protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?, .params = protocol hostname port tls-port cert-subject, diff --git a/vl.c b/vl.c index 74d52ab..fa23d66 100644 --- a/vl.c +++ b/vl.c @@ -29,6 +29,7 @@ #include sys/time.h #include zlib.h #include qemu/bitmap.h +#include migration/qemu-file.h /* Needed early for CONFIG_BSD etc. */ #include config-host.h @@ -4192,6 +4193,8 @@ int main(int argc, char **argv, char **envp) default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS); register_savevm_live(NULL, ram, 0, 4, savevm_ram_handlers, NULL); +register_savevm(NULL, mc, -1, MC_VERSION, mc_info_save, +mc_info_load, NULL); if (nb_numa_nodes 0) { int i; -- 1.8.1.2
[Qemu-devel] [RFC PATCH v1: 05/12] migration: support custom page loading
From: Michael R. Hines mrhi...@us.ibm.com Just as RDMA has custom routines for saving memory, this provides us with custom routines for loading memory. Micro-checkpointing needs this support in order to be able to handle loading of the latest checkpoint into memory as they are received from the network. Signed-off-by: Michael R. Hines mrhi...@us.ibm.com --- arch_init.c | 17 - include/migration/migration.h | 12 ++-- include/migration/qemu-file.h | 16 ++-- savevm.c | 27 --- 4 files changed, 60 insertions(+), 12 deletions(-) diff --git a/arch_init.c b/arch_init.c index b139512..9cf7d18 100644 --- a/arch_init.c +++ b/arch_init.c @@ -684,7 +684,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage) /* In doubt sent page as normal */ bytes_sent = -1; ret = ram_control_save_page(f, block-offset, - offset, TARGET_PAGE_SIZE, bytes_sent); + block-host, offset, TARGET_PAGE_SIZE, bytes_sent); if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { if (ret != RAM_SAVE_CONTROL_DELAYED) { @@ -712,9 +712,11 @@ static int ram_save_block(QEMUFile *f, bool last_stage) /* XBZRLE overflow or normal page */ if (bytes_sent == -1) { bytes_sent = save_block_hdr(f, block, offset, cont, RAM_SAVE_FLAG_PAGE); -qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); -bytes_sent += TARGET_PAGE_SIZE; -acct_info.norm_pages++; +if (ret != RAM_SAVE_CONTROL_DELAYED) { +qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE); +bytes_sent += TARGET_PAGE_SIZE; +acct_info.norm_pages++; +} } /* if page is unmodified, continue to the next */ @@ -1133,13 +1135,18 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id) ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); } else if (flags RAM_SAVE_FLAG_PAGE) { void *host; +int r; host = host_from_stream_offset(f, addr, flags); if (!host) { return -EINVAL; } -qemu_get_buffer(f, host, TARGET_PAGE_SIZE); +r = ram_control_load_page(f, host, TARGET_PAGE_SIZE); + +if (r == RAM_LOAD_CONTROL_NOT_SUPP) { +qemu_get_buffer(f, host, TARGET_PAGE_SIZE); +} } else if (flags 
RAM_SAVE_FLAG_XBZRLE) { void *host = host_from_stream_offset(f, addr, flags); if (!host) { diff --git a/include/migration/migration.h b/include/migration/migration.h index 3ad06c5..ac1b438 100644 --- a/include/migration/migration.h +++ b/include/migration/migration.h @@ -169,9 +169,17 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags); #define RAM_SAVE_CONTROL_NOT_SUPP -1000 #define RAM_SAVE_CONTROL_DELAYED -2000 +#define RAM_LOAD_CONTROL_NOT_SUPP -3000 +#define RAM_LOAD_CONTROL_DELAYED -4000 -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size, +#define RDMA_CONTROL_VERSION_CURRENT 1 + +int ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, + uint8_t *host_addr, + ram_addr_t offset, long size, int *bytes_sent); +int ram_control_load_page(QEMUFile *f, + void *host_addr, + long size); #endif diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h index 0f757fb..d396b40 100644 --- a/include/migration/qemu-file.h +++ b/include/migration/qemu-file.h @@ -76,12 +76,22 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, uint64_t flags); * This function allows override of where the RAM page * is saved (such as RDMA, for example.) */ -typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque, +typedef int (QEMURamSaveFunc)(QEMUFile *f, void *opaque, ram_addr_t block_offset, + uint8_t *host_addr, ram_addr_t offset, - size_t size, + long size, int *bytes_sent); +/* + * This function allows override of where the RAM page + * is saved (such as RDMA, for example.) + */ +typedef int (QEMURamLoadFunc)(QEMUFile *f, + void *opaque, + void *host_addr, + long size); + typedef struct QEMUFileOps { QEMUFilePutBufferFunc *put_buffer; QEMUFileGetBufferFunc *get_buffer; @@ -92,12 +102,14 @@ typedef struct QEMUFileOps { QEMURamHookFunc *after_ram_iterate; QEMURamHookFunc
Re: [Qemu-devel] [PATCH V14 00/11] Add support for binding guest numa nodes to host numa nodes
Hi folks, Any more comments? Thanks, Wanlong Gao As you know, QEMU can't direct its memory allocation now, this may cause guest cross node access performance regression. And, the worse thing is that if PCI-passthrough is used, direct-attached-device uses DMA transfer between device and qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() => kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with direct-attached-device, all guest page's page count will be +1 and any page migration will not work. AutoNUMA won't either. So, we should set the guest nodes memory allocation policy before the pages are really mapped. According to this patch set, we are able to set guest nodes memory policy like the following: -numa node,nodeid=0,cpus=0, \ -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ -numa node,nodeid=1,cpus=1 \ -numa mem,size=1024M,policy=interleave,host-nodes=1 This supports policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N like format. And add a QMP command query-numa to show numa info through this API. And convert the info numa monitor command to use this QMP command query-numa. This version removes set-mem-policy qmp and hmp commands temporarily as Marcelo and Paolo suggested.
V1-V2: change to use QemuOpts in numa options (Paolo) handle Error in mpol parser (Paolo) change qmp command format to mem-policy=membind,mem-hostnode=0-1 like (Paolo) V2-V3: also handle Error in cpus parser (5/10) split out common parser from cpus and hostnode parser (Bandan 6/10) V3-V4: rebase to request for comments V4-V5: use OptVisitor and split -numa option (Paolo) - s/set-mpol/set-mem-policy (Andreas) - s/mem-policy/policy - s/mem-hostnode/host-nodes fix hmp command process after error (Luiz) add qmp command query-numa and convert info numa to it (Luiz) V5-V6: remove tabs in json file (Laszlo, Paolo) add back -numa node,mem=xxx as legacy (Paolo) change cpus and host-nodes to array (Laszlo, Eric) change nodeid to uint16 add NumaMemPolicy enum type (Eric) rebased on Laszlo's OptsVisitor: support / flatten integer ranges for repeating options patch set, thanks for Laszlo's help V6-V7: change UInt16 to uint16 (Laszlo) fix a typo in adding qmp command set-mem-policy V7-V8: rebase to current master with Laszlo's V2 of OptsVisitor patch set fix an adding white space line error V8-V9: rebase to current master check if total numa memory size is equal to ram_size (Paolo) add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) replace the use of numa_num_configured_nodes() (Andrew) avoid abusing the fact i==nodeid (Andrew) V9-V10: rebase to current master remove libnuma (Andrew) MAX_NODES=64 - MAX_NODES=128 since libnuma selected 128 (Andrew) use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) remove a useless clear_bit() operation (Andrew) V10-V11: rebase to current master fix maxnode argument of mbind(2) V11-V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) V12-V13: rebase to current master thanks for Luiz's review (Luiz) doc hmp command set-mem-policy (Luiz) rename: NUMAInfo - NUMANode (Luiz) V13-V14: remove set-mem-policy qmp and hmp commands 
(Marcelo, Paolo) *I hope this can catch up the train of 1.7.* Thanks, Wanlong Gao Wanlong Gao (11): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add -numa mem, options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp.c | 57 +++ hmp.h | 1 + hw/i386/pc.c| 4 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 ++- monitor.c | 21 +-- numa.c | 395 qapi-schema.json| 112 ++ qemu-options.hx | 6 +- qmp-commands.hx | 48 ++ vl.c| 160 +++- 13 files changed, 654 insertions(+), 185 deletions(-) create mode 100644 numa.c
Re: [Qemu-devel] [PATCH 02/13] block: do not include monitor.h in block.c
于 2013/10/18 17:36, Paolo Bonzini 写道: Il 18/10/2013 03:11, Wenchao Xia ha scritto: block_int.h already included it. Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com Reviewed-by: Eric Blake ebl...@redhat.com --- block.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/block.c b/block.c index 2c15e5d..e92a556 100644 --- a/block.c +++ b/block.c @@ -24,7 +24,6 @@ #include config-host.h #include qemu-common.h #include trace.h -#include monitor/monitor.h #include block/block_int.h #include block/blockjob.h #include qemu/module.h Does this cause problems? block.c uses monitor_protocol_event, so it's good to include the file directly instead of relying on other header files. Paolo OK, will drop this patch.
Re: [Qemu-devel] [PATCH 03/13] qapi: move MonitorEvent define
于 2013/10/18 20:38, Eric Blake 写道: On 10/18/2013 03:36 AM, Paolo Bonzini wrote: Il 18/10/2013 03:11, Wenchao Xia ha scritto: Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com --- include/monitor/monitor.h | 38 +- include/qapi/qmp/qevent.h | 66 + include/qapi/qmp/types.h |1 + 3 files changed, 68 insertions(+), 37 deletions(-) create mode 100644 include/qapi/qmp/qevent.h Please move it qemu-schema.json instead. qapi-schema.json, but yes, I agree that declaring it as a qapi enum and letting the code generator create the constants, rather than moving the hand-maintained constants into a new header, is smarter. I will try use qapi-schema.json in next version.
Re: [Qemu-devel] [PATCH 05/13] error: define struct Error in only one place
于 2013/10/18 19:22, Markus Armbruster 写道: Paolo Bonzini pbonz...@redhat.com writes: Il 18/10/2013 03:11, Wenchao Xia ha scritto: Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com --- include/qapi/error.h |5 - qobject/qerror.c |7 --- util/error.c |6 -- 3 files changed, 4 insertions(+), 14 deletions(-) diff --git a/include/qapi/error.h b/include/qapi/error.h index 7d4c696..8688aaf 100644 --- a/include/qapi/error.h +++ b/include/qapi/error.h @@ -20,7 +20,10 @@ * A class representing internal errors within QEMU. An error has a ErrorClass * code and a human message. */ -typedef struct Error Error; +typedef struct Error { +char *msg; +ErrorClass err_class; +} Error; Please add a comment that it should be treated as an opaque type. Or keep it opaque here, and complete the type in an internal header. But see below. Paolo /** * Set an indirect pointer to an error given a ErrorClass value and a diff --git a/qobject/qerror.c b/qobject/qerror.c index 3aee1cf..5b487f3 100644 --- a/qobject/qerror.c +++ b/qobject/qerror.c @@ -97,13 +97,6 @@ void qerror_report(ErrorClass eclass, const char *fmt, ...) } } -/* Evil... */ -struct Error -{ -char *msg; -ErrorClass err_class; -}; - void qerror_report_err(Error *err) { QError *qerr; qerr = qerror_new(); loc_save(qerr-loc); qerr-err_msg = g_strdup(err-msg); qerr-err_class = err-err_class; if (monitor_cur_is_qmp()) { monitor_set_error(cur_mon, qerr); } else { qerror_print(qerr); QDECREF(qerr); } } This is the only use of the evil duplicate. I suspect it could be cleaned up like this: qerr-err_msg = g_strdup(error_get_pretty(err)); qerr-err_class = error_get_class(err); If that's true, the duplicate goes away, and we can keep the type opaque. seems a smart idea, will use it. 
diff --git a/util/error.c b/util/error.c index ec0faa6..da0d221 100644 --- a/util/error.c +++ b/util/error.c @@ -17,12 +17,6 @@ #include qapi-types.h #include qapi/qmp/qerror.h -struct Error -{ -char *msg; -ErrorClass err_class; -}; - void error_set(Error **errp, ErrorClass err_class, const char *fmt, ...) { Error *err;
Re: [Qemu-devel] [PATCH 08/13] error: don't set sep when print progname
于 2013/10/18 19:40, Markus Armbruster 写道: Paolo Bonzini pbonz...@redhat.com writes: Il 18/10/2013 03:11, Wenchao Xia ha scritto: The behavior to set sep brings trouble to modification later, the logic is not changed by add tailing space in fprintf(). Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com --- util/qemu-error.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/util/qemu-error.c b/util/qemu-error.c index 0ccd3e9..d1e858a 100644 --- a/util/qemu-error.c +++ b/util/qemu-error.c @@ -161,8 +161,7 @@ static void error_print_loc(void) const char *const *argp; if (!cur_mon progname) { -fprintf(stderr, %s:, progname); -sep = ; +fprintf(stderr, %s: , progname); } switch (cur_loc-kind) { case LOC_CMDLINE: @@ -181,7 +180,7 @@ static void error_print_loc(void) error_printf( ); break; default: -error_printf(%s, sep); +break; } } This changes behavior for LOC_FILE. Before: $ cat xyz.cfg [device abc] driver = def $ qemu-system-x86_64 -readconfig xyz.cfg qemu-system-x86_64:xyz.cfg:2: parse error After: $ qemu-system-x86_64 -readconfig xyz.cfg qemu-system-x86_64: xyz.cfg:2: parse error Could even be an improvement, but you need to note it in the commit message. No, it is not an improvement. The old format matches exactly how other report errors with location, e.g. jade. Please leave it that way, I'll check whether there is way to leave the logic as it was.
Re: [Qemu-devel] [PATCH 09/13] error: print progname with error_vprintf()
于 2013/10/18 17:44, Paolo Bonzini 写道: Il 18/10/2013 03:11, Wenchao Xia ha scritto: This remove additional code path about where to print the error, error_vprintf() is only the controller now, making future change easier. The logic is not changed since when cur_mon = NULL, error_vprintf() will still print to stderr. Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com --- util/qemu-error.c | 11 ++- 1 files changed, 10 insertions(+), 1 deletions(-) diff --git a/util/qemu-error.c b/util/qemu-error.c index d1e858a..c29fcbd 100644 --- a/util/qemu-error.c +++ b/util/qemu-error.c @@ -151,6 +151,15 @@ const char *error_get_progname(void) return progname; } +static void error_print_progname(const char *fmt, ...) +{ +va_list ap; + +va_start(ap, fmt); +error_vprintf(fmt, ap); +va_end(ap); +} + /* * Print current location to current monitor if we have one, else to stderr. */ @@ -161,7 +170,7 @@ static void error_print_loc(void) const char *const *argp; if (!cur_mon progname) { -fprintf(stderr, %s: , progname); +error_print_progname(%s: , progname); } switch (cur_loc-kind) { case LOC_CMDLINE: I agree that using fprintf looks odd, but why not use error_printf directly? Paolo I used custom function since I have a following modification in my private branch. Since it is not send, I will use error_printf(), which is more straight.
Re: [Qemu-devel] [PATCH 10/13] qerror: deref once in qerror_report()
于 2013/10/18 17:46, Paolo Bonzini 写道: Il 18/10/2013 03:11, Wenchao Xia ha scritto: Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com --- qobject/qerror.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/qobject/qerror.c b/qobject/qerror.c index 5b487f3..685167a 100644 --- a/qobject/qerror.c +++ b/qobject/qerror.c @@ -77,7 +77,6 @@ static void qerror_print(QError *qerror) loc_push_restore(qerror-loc); error_report(%s, qstring_get_str(qstring)); loc_pop(qerror-loc); -QDECREF(qstring); } void qerror_report(ErrorClass eclass, const char *fmt, ...) Why isn't this a memory leak? Paolo My bad, I mistook QDECREF(qstring) for QDECREF(qerror); will drop this patch, sorry to disturb.
Re: [Qemu-devel] [PATCH 12/13] monitor: hide *cur_mon in monitor_get_fd()
于 2013/10/18 17:51, Paolo Bonzini 写道: Il 18/10/2013 03:11, Wenchao Xia ha scritto: All existing caller are using *cur_mon as its parameter, and *cur_mon is an internal variable which used inside monitor.c. This patch reduce the exposing of details in monitor.c, by introduce a new function monitor_get_fd_cur() and make old one static. Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com --- dump.c|2 +- include/monitor/monitor.h |2 +- migration-fd.c|2 +- monitor.c |7 ++- qmp.c |2 +- stubs/get-fd.c|2 +- util/qemu-sockets.c |4 ++-- 7 files changed, 13 insertions(+), 8 deletions(-) diff --git a/dump.c b/dump.c index 846155c..8f5b6b0 100644 --- a/dump.c +++ b/dump.c @@ -860,7 +860,7 @@ void qmp_dump_guest_memory(bool paging, const char *file, bool has_begin, #if !defined(WIN32) if (strstart(file, fd:, p)) { -fd = monitor_get_fd(cur_mon, p, errp); +fd = monitor_get_fd_cur(p, errp); if (fd == -1) { return; } diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h index 97fcee3..637f7f3 100644 --- a/include/monitor/monitor.h +++ b/include/monitor/monitor.h @@ -35,7 +35,7 @@ int monitor_read_block_device_key(Monitor *mon, const char *device, BlockDriverCompletionFunc *completion_cb, void *opaque); -int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp); +int monitor_get_fd_cur(const char *fdname, Error **errp); int monitor_handle_fd_param(Monitor *mon, const char *fdname); void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap) diff --git a/migration-fd.c b/migration-fd.c index d2e523a..022bc50 100644 --- a/migration-fd.c +++ b/migration-fd.c @@ -33,7 +33,7 @@ void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) { -int fd = monitor_get_fd(cur_mon, fdname, errp); +int fd = monitor_get_fd_cur(fdname, errp); if (fd == -1) { return; } diff --git a/monitor.c b/monitor.c index 9377834..80a9dfd 100644 --- a/monitor.c +++ b/monitor.c @@ -2290,7 +2290,7 @@ static void do_loadvm(Monitor *mon, const QDict *qdict) } } -int 
monitor_get_fd(Monitor *mon, const char *fdname, Error **errp) +static int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp) { mon_fd_t *monfd; @@ -2315,6 +2315,11 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp) return -1; } +int monitor_get_fd_cur(const char *fdname, Error **errp) +{ +return monitor_get_fd(cur_mon, fdname, errp); +} + static void monitor_fdset_cleanup(MonFdset *mon_fdset) { MonFdsetFd *mon_fdset_fd; diff --git a/qmp.c b/qmp.c index 4c149b3..a02804b 100644 --- a/qmp.c +++ b/qmp.c @@ -493,7 +493,7 @@ void qmp_add_client(const char *protocol, const char *fdname, CharDriverState *s; int fd; -fd = monitor_get_fd(cur_mon, fdname, errp); +fd = monitor_get_fd_cur(fdname, errp); if (fd 0) { return; } diff --git a/stubs/get-fd.c b/stubs/get-fd.c index 9f2c65c..7d9ec3b 100644 --- a/stubs/get-fd.c +++ b/stubs/get-fd.c @@ -1,7 +1,7 @@ #include qemu-common.h #include monitor/monitor.h -int monitor_get_fd(Monitor *mon, const char *name, Error **errp) +int monitor_get_fd_cur(const char *name, Error **errp) { error_setg(errp, only QEMU supports file descriptor passing); return -1; diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c index 6b97dc1..9cd85dd 100644 --- a/util/qemu-sockets.c +++ b/util/qemu-sockets.c @@ -902,7 +902,7 @@ int socket_connect(SocketAddress *addr, Error **errp, break; case SOCKET_ADDRESS_KIND_FD: -fd = monitor_get_fd(cur_mon, addr-fd-str, errp); +fd = monitor_get_fd_cur(addr-fd-str, errp); if (fd = 0 callback) { qemu_set_nonblock(fd); callback(fd, opaque); @@ -934,7 +934,7 @@ int socket_listen(SocketAddress *addr, Error **errp) break; case SOCKET_ADDRESS_KIND_FD: -fd = monitor_get_fd(cur_mon, addr-fd-str, errp); +fd = monitor_get_fd_cur(addr-fd-str, errp); break; default: Doesn't seem like an improvement. It would be if you could then make cur_mon static. Paolo OK, I will check all code using cur_mon and make it static.
[Qemu-devel] can we create complete image or start a vm from a snapshot point
Hi all, can we create a complete image or start a VM from a snapshot point? Thanks.