date:20131020

[Qemu-devel] [PATCH 00/16] slirp: Adding IPv6 support to Qemu -net user mode

2013-10-20 Thread Samuel Thibault

We have developed IPv6 support for Qemu -net user mode.

These patches add ICMPv6 and NDP, and make UDP and TCP compatible with
IPv6. We have done some refactoring to make the current code compatible
with IPv6.

Some patches, like 2 and 13, are easier to review using
interdiff -w /dev/null patchfile
which hides the indentation-only changes.

 [PATCH 01/16] slirp: goto bad in udp_input if sosendto fails
 [PATCH 02/16] slirp: Generalizing and neutralizing code before adding
 [PATCH 03/16] qemu/timer.h : Adding function to second scale
 [PATCH 04/16] slirp: Adding IPv6, ICMPv6 Echo and NDP autoconfiguration
 [PATCH 05/16] slirp: Adding ICMPv6 error sending
 [PATCH 06/16] slirp: Make Socket structure IPv6 compatible
 [PATCH 07/16] slirp: Factorizing address translation
 [PATCH 08/16] slirp: Factorizing and cleaning solookup()
 [PATCH 09/16] slirp: Make udp_attach IPv6 compatible
 [PATCH 10/16] slirp: Adding IPv6 UDP support
 [PATCH 11/16] slirp: Adding family argument to tcp_fconnect()
 [PATCH 12/16] slirp: Factorizing tcpiphdr structure with an union
 [PATCH 13/16] slirp: Generalizing and neutralizing various TCP functions before adding IPv6 stuff
 [PATCH 14/16] slirp: Handle IPv6 in TCP functions
 [PATCH 15/16] slirp: Adding IPv6 address for DNS relay
 [PATCH 16/16] qapi-schema, qemu-options  slirp: Adding Qemu options

[Qemu-devel] [PATCH 03/16] qemu/timer.h : Adding function to second scale

2013-10-20 Thread Samuel Thibault

This patch adds SCALE_S, timer_new_s(), and qemu_clock_get_s() to
qemu/timer.h to manage second-scale timers.

Signed-off-by: Guillaume Subiron maet...@subiron.org
Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org
---
 include/qemu/timer.h | 32 
 1 file changed, 32 insertions(+)

diff --git a/include/qemu/timer.h b/include/qemu/timer.h
index b58903b..f71553d 100644
--- a/include/qemu/timer.h
+++ b/include/qemu/timer.h
@@ -7,6 +7,7 @@
 
 /* timers */
 
+#define SCALE_S  10
 #define SCALE_MS 100
 #define SCALE_US 1000
 #define SCALE_NS 1
@@ -81,6 +82,20 @@ extern QEMUTimerListGroup main_loop_tlg;
 int64_t qemu_clock_get_ns(QEMUClockType type);
 
 /**
+ * qemu_clock_get_s;
+ * @type: the clock type
+ *
+ * Get the second value of a clock with
+ * type @type
+ *
+ * Returns: the clock value in seconds
+ */
+static inline int64_t qemu_clock_get_s(QEMUClockType type)
+{
+return qemu_clock_get_ns(type) / SCALE_S;
+}
+
+/**
  * qemu_clock_get_ms;
  * @type: the clock type
  *
@@ -508,6 +523,23 @@ static inline QEMUTimer *timer_new_ms(QEMUClockType type, 
QEMUTimerCB *cb,
 }
 
 /**
+ * timer_new_s:
+ * @clock: the clock to associate with the timer
+ * @callback: the callback to call when the timer expires
+ * @opaque: the opaque pointer to pass to the callback
+ *
+ * Create a new timer with second scale on the default timer list
+ * associated with the clock.
+ *
+ * Returns: a pointer to the newly created timer
+ */
+static inline QEMUTimer *timer_new_s(QEMUClockType type, QEMUTimerCB *cb,
+ void *opaque)
+{
+return timer_new(type, SCALE_S, cb, opaque);
+}
+
+/**
  * timer_free:
  * @ts: the timer
  *
-- 
1.8.4.rc3

[Qemu-devel] [PATCH 15/16] slirp: Adding IPv6 address for DNS relay

2013-10-20 Thread Samuel Thibault

This patch adds an IPv6 address to the DNS relay. in6_equal_dns() is
implemented using this new Slirp attribute.
sotranslate_in/out() are also updated to handle the IPv6 case, so that the
guest is able to reach the host using one of the Slirp addresses.

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/ip6.h|  5 -
 slirp/slirp.c  |  2 ++
 slirp/slirp.h  |  1 +
 slirp/socket.c | 26 --
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/slirp/ip6.h b/slirp/ip6.h
index 16124ec..b88456d 100644
--- a/slirp/ip6.h
+++ b/slirp/ip6.h
@@ -74,7 +74,10 @@ static inline int in6_equal_mach(struct in6_addr a, struct 
in6_addr b,
   || (in6_equal_net(a, (struct in6_addr)LINKLOCAL_ADDR, 64)\
in6_equal_mach(a, slirp-vhost_addr6, 64)))
 
-#define in6_equal_dns(a) 0
+#define in6_equal_dns(a)\
+((in6_equal_net(a, slirp-vprefix_addr6, slirp-vprefix_len)\
+ || in6_equal_net(a, (struct in6_addr)LINKLOCAL_ADDR, 64))\
+  in6_equal_mach(a, slirp-vnameserver_addr6, slirp-vprefix_len))
 
 #define in6_equal_host(a)\
 (in6_equal_router(a) || in6_equal_dns(a))
diff --git a/slirp/slirp.c b/slirp/slirp.c
index 0f6f006..695e8a6 100644
--- a/slirp/slirp.c
+++ b/slirp/slirp.c
@@ -236,6 +236,8 @@ Slirp *slirp_init(int restricted, struct in_addr vnetwork,
 slirp-bootp_filename = g_strdup(bootfile);
 slirp-vdhcp_startaddr = vdhcp_start;
 slirp-vnameserver_addr = vnameserver;
+/* :TODO:maethor:130311: Use a parameter passed to the function */
+inet_pton(AF_INET6, fc00::2, slirp-vnameserver_addr6);
 
 if (vdnssearch) {
 translate_dnssearch(slirp, vdnssearch);
diff --git a/slirp/slirp.h b/slirp/slirp.h
index b6e805e..0688ea7 100644
--- a/slirp/slirp.h
+++ b/slirp/slirp.h
@@ -236,6 +236,7 @@ struct Slirp {
 struct in6_addr vhost_addr6;
 struct in_addr vdhcp_startaddr;
 struct in_addr vnameserver_addr;
+struct in6_addr vnameserver_addr6;
 
 struct in_addr client_ipaddr;
 char client_hostname[33];
diff --git a/slirp/socket.c b/slirp/socket.c
index 567f9bc..a9b3957 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -741,12 +741,12 @@ sofwdrain(struct socket *so)
 
 /*
  * Translate addr in host addr when it is a virtual address
- * :TODO:maethor:130314: Manage IPv6
  */
 void sotranslate_out(struct socket *so, struct sockaddr_storage *addr)
 {
 Slirp *slirp = so-slirp;
 struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
 
 switch (addr-ss_family) {
 case AF_INET:
@@ -767,16 +767,29 @@ void sotranslate_out(struct socket *so, struct 
sockaddr_storage *addr)
 ntohs(sin-sin_port), inet_ntoa(sin-sin_addr)));
 break;
 
+case AF_INET6:
+if (in6_equal_net(so-so_faddr6, slirp-vprefix_addr6,
+slirp-vprefix_len)) {
+if (in6_equal(so-so_faddr6, slirp-vnameserver_addr6)) {
+/*if (get_dns_addr(addr)  0) {*/ /* TODO */
+sin6-sin6_addr = in6addr_loopback;
+/*}*/
+} else {
+sin6-sin6_addr = in6addr_loopback;
+}
+}
+break;
+
 default:
 break;
 }
 }
 
-/* :TODO:maethor:130314: IPv6 */
 void sotranslate_in(struct socket *so, struct sockaddr_storage *addr)
 {
 Slirp *slirp = so-slirp;
 struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
 
 switch (addr-ss_family) {
 case AF_INET:
@@ -793,6 +806,15 @@ void sotranslate_in(struct socket *so, struct 
sockaddr_storage *addr)
 }
 break;
 
+case AF_INET6:
+if (in6_equal_net(so-so_faddr6, slirp-vprefix_addr6,
+slirp-vprefix_len)) {
+if (in6_equal(sin6-sin6_addr, in6addr_loopback)
+|| !in6_equal(so-so_faddr6, slirp-vhost_addr6)) {
+sin6-sin6_addr = so-so_faddr6;
+}
+}
+
 default:
 break;
 }
-- 
1.8.4.rc3

[Qemu-devel] [PATCH 11/16] slirp: Adding family argument to tcp_fconnect()

2013-10-20 Thread Samuel Thibault

This patch simply adds a sa_family_t argument to tcp_fconnect() to remove
the hardcoded AF_INET in the call to qemu_socket().

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/slirp.h | 2 +-
 slirp/tcp_input.c | 3 ++-
 slirp/tcp_subr.c  | 5 +++--
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/slirp/slirp.h b/slirp/slirp.h
index 0d1e14d..aa44055 100644
--- a/slirp/slirp.h
+++ b/slirp/slirp.h
@@ -367,7 +367,7 @@ void tcp_respond(struct tcpcb *, register struct tcpiphdr 
*, register struct mbu
 struct tcpcb * tcp_newtcpcb(struct socket *);
 struct tcpcb * tcp_close(register struct tcpcb *);
 void tcp_sockclosed(struct tcpcb *);
-int tcp_fconnect(struct socket *);
+int tcp_fconnect(struct socket *, sa_family_t af);
 void tcp_connect(struct socket *);
 int tcp_attach(struct socket *);
 uint8_t tcp_tos(struct socket *);
diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c
index f7a8d49..25929bd 100644
--- a/slirp/tcp_input.c
+++ b/slirp/tcp_input.c
@@ -581,7 +581,8 @@ findso:
goto cont_input;
  }
 
- if((tcp_fconnect(so) == -1)  (errno != EINPROGRESS)  (errno != 
EWOULDBLOCK)) {
+ if ((tcp_fconnect(so, so-so_ffamily) == -1)
+ (errno != EINPROGRESS)  (errno != EWOULDBLOCK)) {
u_char code=ICMP_UNREACH_NET;
DEBUG_MISC((dfd,  tcp fconnect errno = %d-%s\n,
errno,strerror(errno)));
diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c
index 4791c0c..3558115 100644
--- a/slirp/tcp_subr.c
+++ b/slirp/tcp_subr.c
@@ -324,14 +324,15 @@ tcp_sockclosed(struct tcpcb *tp)
  * nonblocking.  Connect returns after the SYN is sent, and does
  * not wait for ACK+SYN.
  */
-int tcp_fconnect(struct socket *so)
+int tcp_fconnect(struct socket *so, sa_family_t af)
 {
   int ret=0;
 
   DEBUG_CALL(tcp_fconnect);
   DEBUG_ARG(so = %lx, (long )so);
 
-  if( (ret = so-s = qemu_socket(AF_INET,SOCK_STREAM,0)) = 0) {
+  ret = so-s = qemu_socket(af, SOCK_STREAM, 0);
+  if (ret = 0) {
 int opt, s=so-s;
 struct sockaddr_storage addr;
 
-- 
1.8.4.rc3

[Qemu-devel] [PATCH 01/16] slirp: goto bad in udp_input if sosendto fails

2013-10-20 Thread Samuel Thibault

Before this patch, if sosendto failed, udp_input continued executing as if
the packet had been sent. This could cause a memory leak.
This patch adds a goto bad to stop the execution of this function.

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/udp.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/slirp/udp.c b/slirp/udp.c
index 8cc6cb6..fd2446a 100644
--- a/slirp/udp.c
+++ b/slirp/udp.c
@@ -218,6 +218,7 @@ udp_input(register struct mbuf *m, int iphlen)
  *ip=save_ip;
  DEBUG_MISC((dfd,udp tx errno = %d-%s\n,errno,strerror(errno)));
  icmp_error(m, ICMP_UNREACH,ICMP_UNREACH_NET, 0,strerror(errno));
+ goto bad;
}
 
m_free(so-so_m);   /* used for ICMP if error on sorecvfrom */
-- 
1.8.4.rc3

[Qemu-devel] [PATCH 08/16] slirp: Factorizing and cleaning solookup()

2013-10-20 Thread Samuel Thibault

This patch makes solookup() compatible with all address families. Also,
this function was previously only usable with TCP. Now that the socket list
is passed as an argument, it is usable with UDP too. Finally, some
optimization code is factored into the function (the function looks at the
last returned result before walking the complete socket list).

This also adds a sockaddr_equal() function to compare two
sockaddr_storage structures.

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/socket.c| 30 --
 slirp/socket.h| 30 +-
 slirp/tcp_input.c | 27 +++
 slirp/udp.c   | 25 ++---
 4 files changed, 62 insertions(+), 50 deletions(-)

diff --git a/slirp/socket.c b/slirp/socket.c
index 375281c..f333fcf 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -15,24 +15,26 @@
 static void sofcantrcvmore(struct socket *so);
 static void sofcantsendmore(struct socket *so);
 
-struct socket *
-solookup(struct socket *head, struct in_addr laddr, u_int lport,
- struct in_addr faddr, u_int fport)
+struct socket *solookup(struct socket **last, struct socket *head,
+struct sockaddr_storage *lhost, struct sockaddr_storage *fhost)
 {
-   struct socket *so;
+struct socket *so = *last;
 
-   for (so = head-so_next; so != head; so = so-so_next) {
-   if (so-so_lport == lport 
-   so-so_laddr.s_addr == laddr.s_addr 
-   so-so_faddr.s_addr == faddr.s_addr 
-   so-so_fport == fport)
-  break;
-   }
+/* Optimisation */
+if (sockaddr_equal((so-lhost.ss), lhost)
+ (!fhost || sockaddr_equal((so-fhost.ss), fhost))) {
+return so;
+}
 
-   if (so == head)
-  return (struct socket *)NULL;
-   return so;
+for (so = head-so_next; so != head; so = so-so_next) {
+if (sockaddr_equal((so-lhost.ss), lhost)
+ (!fhost || sockaddr_equal((so-fhost.ss), fhost))) {
+*last = so;
+return so;
+}
+}
 
+return (struct socket *)NULL;
 }
 
 /*
diff --git a/slirp/socket.h b/slirp/socket.h
index 50059be..ad509b9 100644
--- a/slirp/socket.h
+++ b/slirp/socket.h
@@ -93,7 +93,35 @@ struct socket {
 #define SS_HOSTFWD 0x1000  /* Socket describes host-guest 
forwarding */
 #define SS_INCOMING0x2000  /* Connection was initiated by a host 
on the internet */
 
-struct socket * solookup(struct socket *, struct in_addr, u_int, struct 
in_addr, u_int);
+static inline int sockaddr_equal(struct sockaddr_storage *a,
+struct sockaddr_storage *b)
+{
+if (a-ss_family != b-ss_family) {
+return 0;
+} else {
+switch (a-ss_family) {
+case AF_INET:
+{
+struct sockaddr_in *a4 = (struct sockaddr_in *) a;
+struct sockaddr_in *b4 = (struct sockaddr_in *) b;
+return (a4-sin_addr.s_addr == b4-sin_addr.s_addr
+ a4-sin_port == b4-sin_port);
+}
+case AF_INET6:
+{
+struct sockaddr_in6 *a6 = (struct sockaddr_in6 *) a;
+struct sockaddr_in6 *b6 = (struct sockaddr_in6 *) b;
+return (in6_equal(a6-sin6_addr, b6-sin6_addr)
+ a6-sin6_port == b6-sin6_port);
+}
+default:
+return 0;
+}
+}
+}
+
+struct socket *solookup(struct socket **, struct socket *,
+struct sockaddr_storage *, struct sockaddr_storage *);
 struct socket * socreate(Slirp *);
 void sofree(struct socket *);
 int soread(struct socket *);
diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c
index 70ef376..f7a8d49 100644
--- a/slirp/tcp_input.c
+++ b/slirp/tcp_input.c
@@ -227,6 +227,7 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
int iss = 0;
u_long tiwin;
int ret;
+   struct sockaddr_storage lhost, fhost;
 struct ex_list *ex_ptr;
 Slirp *slirp;
 
@@ -320,16 +321,14 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
 * Locate pcb for segment.
 */
 findso:
-   so = slirp-tcp_last_so;
-   if (so-so_fport != ti-ti_dport ||
-   so-so_lport != ti-ti_sport ||
-   so-so_laddr.s_addr != ti-ti_src.s_addr ||
-   so-so_faddr.s_addr != ti-ti_dst.s_addr) {
-   so = solookup(slirp-tcb, ti-ti_src, ti-ti_sport,
-  ti-ti_dst, ti-ti_dport);
-   if (so)
-   slirp-tcp_last_so = so;
-   }
+   lhost.ss_family = AF_INET;
+   ((struct sockaddr_in *)lhost)-sin_addr = ti-ti_src;
+   ((struct sockaddr_in *)lhost)-sin_port = ti-ti_sport;
+   fhost.ss_family = AF_INET;
+   ((struct sockaddr_in *)fhost)-sin_addr = ti-ti_dst;
+   ((struct sockaddr_in *)fhost)-sin_port = ti-ti_dport;
+
+   so = solookup(slirp-tcp_last_so, slirp-tcb, lhost, fhost);
 
/*
 * If the state is

[Qemu-devel] [PATCH 05/16] slirp: Adding ICMPv6 error sending

2013-10-20 Thread Samuel Thibault

Disambiguation: icmp_error is renamed to icmp_send_error, since it
doesn't manage errors, but only sends ICMP Error messages.

This patch adds icmp6_send_error to send ICMPv6 Error messages. This
function is simpler than the v4 version.
It also adds calls in various functions to send ICMP errors when a
received packet is too big, or when its hop limit is 0.

Signed-off-by: Yann Bordenave m...@meowstars.org
---
 slirp/ip6_icmp.c  | 60 +++
 slirp/ip6_icmp.h  | 10 ++
 slirp/ip6_input.c | 16 ---
 slirp/ip_icmp.c   | 12 +--
 slirp/ip_icmp.h   |  4 ++--
 slirp/ip_input.c  |  8 
 slirp/socket.c|  4 ++--
 slirp/tcp_input.c |  2 +-
 slirp/udp.c   |  3 ++-
 9 files changed, 96 insertions(+), 23 deletions(-)

diff --git a/slirp/ip6_icmp.c b/slirp/ip6_icmp.c
index 32de0ba..706e430 100644
--- a/slirp/ip6_icmp.c
+++ b/slirp/ip6_icmp.c
@@ -65,6 +65,66 @@ static void icmp6_send_echoreply(struct mbuf *m, Slirp 
*slirp, struct ip6 *ip,
 ip6_output(NULL, t, 0);
 }
 
+void icmp6_send_error(struct mbuf *m, uint8_t type, uint8_t code)
+{
+Slirp *slirp = m-slirp;
+struct mbuf *t = m_get(slirp);
+struct ip6 *ip = mtod(m, struct ip6 *);
+
+char addrstr[INET6_ADDRSTRLEN];
+DEBUG_CALL(icmp_send_error);
+DEBUG_ARGS((dfd,  type = %d, code = %d\n, type, code));
+
+/* IPv6 packet */
+struct ip6 *rip = mtod(t, struct ip6 *);
+rip-ip_src = (struct in6_addr)LINKLOCAL_ADDR;
+if (in6_multicast(ip-ip_src) || in6_unspecified(ip-ip_src)) {
+/* :TODO:maethor:130317: icmp error? */
+return;
+}
+rip-ip_dst = ip-ip_src;
+inet_ntop(AF_INET6, rip-ip_dst, addrstr, INET6_ADDRSTRLEN);
+DEBUG_ARG(target = %s, addrstr);
+
+rip-ip_nh = IPPROTO_ICMPV6;
+const int error_data_len = min(m-m_len,
+IF_MTU - (sizeof(struct ip6) + ICMP6_ERROR_MINLEN));
+rip-ip_pl = htons(ICMP6_ERROR_MINLEN + error_data_len);
+t-m_len = sizeof(struct ip6) + ntohs(rip-ip_pl);
+
+/* ICMPv6 packet */
+t-m_data += sizeof(struct ip6);
+struct icmp6 *ricmp = mtod(t, struct icmp6 *);
+ricmp-icmp6_type = type;
+ricmp-icmp6_code = code;
+ricmp-icmp6_cksum = 0;
+
+switch (type) {
+case ICMP6_UNREACH:
+case ICMP6_TIMXCEED:
+ricmp-icmp6_err.unused = 0;
+break;
+case ICMP6_TOOBIG:
+ricmp-icmp6_err.mtu = htonl(IF_MTU);
+break;
+case ICMP6_PARAMPROB:
+/* :TODO:Meow:130316: Handle this case */
+break;
+default:
+assert(0);
+break;
+}
+t-m_data += ICMP6_ERROR_MINLEN;
+memcpy(t-m_data, m-m_data, error_data_len);
+
+/* Checksum */
+t-m_data -= ICMP6_ERROR_MINLEN;
+t-m_data -= sizeof(struct ip6);
+ricmp-icmp6_cksum = ip6_cksum(t);
+
+ip6_output(NULL, t, 0);
+}
+
 /*
  * Process a NDP message
  */
diff --git a/slirp/ip6_icmp.h b/slirp/ip6_icmp.h
index 2b21c84..7779964 100644
--- a/slirp/ip6_icmp.h
+++ b/slirp/ip6_icmp.h
@@ -22,6 +22,12 @@ struct icmp6_echo { /* Echo Messages */
 uint16_t seq_num;
 };
 
+union icmp6_error_body {
+uint32_t unused;
+uint32_t pointer;
+uint32_t mtu;
+};
+
 /*
  * NDP Messages
  */
@@ -85,6 +91,7 @@ struct icmp6 {
 uint8_t icmp6_code; /* type sub code */
 uint16_ticmp6_cksum;/* ones complement cksum of struct */
 union {
+union icmp6_error_body error_body;
 struct icmp6_echo echo;
 struct ndp_rs ndp_rs;
 struct ndp_ra ndp_ra;
@@ -92,6 +99,7 @@ struct icmp6 {
 struct ndp_na ndp_na;
 struct ndp_redirect ndp_redirect;
 } icmp6_body;
+#define icmp6_err icmp6_body.error_body
 #define icmp6_echo icmp6_body.echo
 #define icmp6_nrs icmp6_body.ndp_rs
 #define icmp6_nra icmp6_body.ndp_ra
@@ -101,6 +109,7 @@ struct icmp6 {
 } QEMU_PACKED;
 
 #define ICMP6_MINLEN4
+#define ICMP6_ERROR_MINLEN  8
 #define ICMP6_ECHO_MINLEN   8
 #define ICMP6_NDP_RS_MINLEN 8
 #define ICMP6_NDP_RA_MINLEN 16
@@ -241,6 +250,7 @@ void icmp6_input(struct mbuf *);
 void icmp6_error(struct mbuf *msrc, u_char type, u_char code, int minsize,
 const char *message);
 */
+void icmp6_send_error(struct mbuf *m, uint8_t type, uint8_t code);
 void ndp_send_ra(Slirp *slirp);
 void ndp_send_ns(Slirp *slirp, struct in6_addr addr);
 
diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c
index 9663c42..af098a5 100644
--- a/slirp/ip6_input.c
+++ b/slirp/ip6_input.c
@@ -33,7 +33,7 @@ void ip6_input(struct mbuf *m)
 DEBUG_ARG(m_len = %d, m-m_len);
 
 if (m-m_len  sizeof(struct ip6)) {
-return;
+goto bad;
 }
 
 ip6 = mtod(m, struct ip6 *);
@@ -42,10 +42,14 @@ void ip6_input(struct mbuf *m)
 goto bad;
 }
 
+if (ntohs(ip6-ip_pl)  IF_MTU) {
+icmp6_send_error(m, ICMP6_TOOBIG, 0);
+goto bad;
+}
+
 /* check ip_ttl for a correct ICMP reply */
 if (ip6-ip_hl == 0) {
-/* :TODO:maethor:130307: icmp6_error

[Qemu-devel] [PATCH 09/16] slirp: Make udp_attach IPv6 compatible

2013-10-20 Thread Samuel Thibault

A sa_family_t is now passed as an argument to udp_attach instead of using a
hardcoded AF_INET in the call to qemu_socket().

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/ip_icmp.c | 2 +-
 slirp/udp.c | 7 ---
 slirp/udp.h | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c
index 8787aae..c896574 100644
--- a/slirp/ip_icmp.c
+++ b/slirp/ip_icmp.c
@@ -162,7 +162,7 @@ icmp_input(struct mbuf *m, int hlen)
   if (icmp_send(so, m, hlen) == 0) {
 return;
   }
-  if(udp_attach(so) == -1) {
+  if (udp_attach(so, AF_INET) == -1) {
DEBUG_MISC((dfd,icmp_input udp_attach errno = %d-%s\n,
errno,strerror(errno)));
sofree(so);
diff --git a/slirp/udp.c b/slirp/udp.c
index 7e0f1b2..f53ee11 100644
--- a/slirp/udp.c
+++ b/slirp/udp.c
@@ -167,7 +167,7 @@ udp_input(register struct mbuf *m, int iphlen)
  if (!so) {
  goto bad;
  }
- if(udp_attach(so) == -1) {
+ if (udp_attach(so, AF_INET) == -1) {
DEBUG_MISC((dfd, udp_attach errno = %d-%s\n,
errno,strerror(errno)));
sofree(so);
@@ -276,9 +276,10 @@ int udp_output(struct socket *so, struct mbuf *m,
 }
 
 int
-udp_attach(struct socket *so)
+udp_attach(struct socket *so, sa_family_t af)
 {
-  if((so-s = qemu_socket(AF_INET,SOCK_DGRAM,0)) != -1) {
+  so-s = qemu_socket(af, SOCK_DGRAM, 0);
+  if (so-s != -1) {
 so-so_expire = curtime + SO_EXPIRE;
 insque(so, so-slirp-udb);
   }
diff --git a/slirp/udp.h b/slirp/udp.h
index a04b8ce..15e73c1 100644
--- a/slirp/udp.h
+++ b/slirp/udp.h
@@ -76,7 +76,7 @@ struct mbuf;
 void udp_init(Slirp *);
 void udp_cleanup(Slirp *);
 void udp_input(register struct mbuf *, int);
-int udp_attach(struct socket *);
+int udp_attach(struct socket *, sa_family_t af);
 void udp_detach(struct socket *);
 struct socket * udp_listen(Slirp *, uint32_t, u_int, uint32_t, u_int,
int);
-- 
1.8.4.rc3

[Qemu-devel] [PATCH 10/16] slirp: Adding IPv6 UDP support

2013-10-20 Thread Samuel Thibault

This patch adds udp6_input() and udp6_output().
It also adds the IPv6 case in sorecvfrom().
Finally, udp6_input() is now called by ip6_input().

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/Makefile.objs |   2 +-
 slirp/ip6_input.c   |   3 +-
 slirp/socket.c  |   7 ++-
 slirp/udp.h |   5 ++
 slirp/udp6.c| 149 
 5 files changed, 162 insertions(+), 4 deletions(-)
 create mode 100644 slirp/udp6.c

diff --git a/slirp/Makefile.objs b/slirp/Makefile.objs
index 2dfe8e0..faa32b6 100644
--- a/slirp/Makefile.objs
+++ b/slirp/Makefile.objs
@@ -1,3 +1,3 @@
 common-obj-y = cksum.o if.o ip_icmp.o ip6_icmp.o ip6_input.o ip6_output.o 
ip_input.o ip_output.o dnssearch.o
 common-obj-y += slirp.o mbuf.o misc.o sbuf.o socket.o tcp_input.o tcp_output.o
-common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o 
ndp_table.o
+common-obj-y += tcp_subr.o tcp_timer.o udp.o udp6.o bootp.o tftp.o arp_table.o 
ndp_table.o
diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c
index af098a5..3290af8 100644
--- a/slirp/ip6_input.c
+++ b/slirp/ip6_input.c
@@ -62,8 +62,7 @@ void ip6_input(struct mbuf *m)
 icmp6_send_error(m, ICMP6_UNREACH, ICMP6_UNREACH_NO_ROUTE);
 break;
 case IPPROTO_UDP:
-/* :TODO:maethor:130312: UDP */
-icmp6_send_error(m, ICMP6_UNREACH, ICMP6_UNREACH_NO_ROUTE);
+udp6_input(m);
 break;
 case IPPROTO_ICMPV6:
 icmp6_input(m);
diff --git a/slirp/socket.c b/slirp/socket.c
index f333fcf..31bbb7e 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -540,8 +540,13 @@ sorecvfrom(struct socket *so)
   (struct sockaddr_in *) daddr,
   so-so_iptos);
break;
-   default:
+   case AF_INET6:
+   udp6_output(so, m, (struct sockaddr_in6 *) saddr,
+   (struct sockaddr_in6 *) daddr);
break;
+   default:
+  assert(0);
+  break;
}
  } /* rx error */
} /* if ping packet */
diff --git a/slirp/udp.h b/slirp/udp.h
index 15e73c1..8a4d9f5 100644
--- a/slirp/udp.h
+++ b/slirp/udp.h
@@ -83,4 +83,9 @@ struct socket * udp_listen(Slirp *, uint32_t, u_int, 
uint32_t, u_int,
 int udp_output(struct socket *so, struct mbuf *m,
 struct sockaddr_in *saddr, struct sockaddr_in *daddr,
 int iptos);
+
+void udp6_input(register struct mbuf *);
+int udp6_output(struct socket *so, struct mbuf *m,
+struct sockaddr_in6 *saddr, struct sockaddr_in6 *daddr);
+
 #endif
diff --git a/slirp/udp6.c b/slirp/udp6.c
new file mode 100644
index 000..3940959
--- /dev/null
+++ b/slirp/udp6.c
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2013
+ * Guillaume Subiron
+ *
+ * Please read the file COPYRIGHT for the
+ * terms and conditions of the copyright.
+ */
+
+#include slirp.h
+#include udp.h
+
+void udp6_input(struct mbuf *m)
+{
+Slirp *slirp = m-slirp;
+struct ip6 *ip, save_ip;
+struct udphdr *uh;
+int hlen = sizeof(struct ip6);
+int len;
+struct socket *so;
+struct sockaddr_storage lhost;
+
+DEBUG_CALL(udp6_input);
+DEBUG_ARG(m = %lx, (long)m);
+
+if (slirp-restricted) {
+goto bad;
+}
+
+ip = mtod(m, struct ip6 *);
+m-m_len -= hlen;
+m-m_data += hlen;
+uh = mtod(m, struct udphdr *);
+m-m_len += hlen;
+m-m_data -= hlen;
+
+if (ip6_cksum(m)) {
+goto bad;
+}
+
+len = ntohs((uint16_t)uh-uh_ulen);
+
+/*
+ * Make mbuf data length reflect UDP length.
+ * If not enough data to reflect UDP length, drop.
+ */
+if (ntohs(ip-ip_pl) != len) {
+if (len  ntohs(ip-ip_pl)) {
+goto bad;
+}
+m_adj(m, len - ntohs(ip-ip_pl));
+ip-ip_pl = htons(len);
+}
+
+/* TODO handle DHCP/BOOTP */
+/* TODO handle TFTP */
+
+/* Locate pcb for datagram. */
+lhost.ss_family = AF_INET6;
+((struct sockaddr_in6 *)lhost)-sin6_addr = ip-ip_src;
+((struct sockaddr_in6 *)lhost)-sin6_port = uh-uh_sport;
+
+so = solookup(slirp-udp_last_so, slirp-udb, lhost, NULL);
+
+if (so == NULL) {
+/* If there's no socket for this packet, create one. */
+so = socreate(slirp);
+if (!so) {
+goto bad;
+}
+if (udp_attach(so, AF_INET6) == -1) {
+DEBUG_MISC((dfd,  udp6_attach errno = %d-%s\n,
+errno, strerror(errno)));
+sofree(so);
+goto bad;
+}
+
+/* Setup fields */
+so-so_lfamily = AF_INET6;
+so-so_laddr6 = ip-ip_src;
+so-so_lport6 = uh-uh_sport;
+}
+
+so-so_ffamily = AF_INET6;
+so-so_faddr6 = ip-ip_dst; /* XXX */
+so-so_fport6 = uh-uh_dport; /* XXX */
+
+hlen += sizeof(struct udphdr);
+m-m_len -= hlen;
+m-m_data += hlen;
+
+/*
+ * Now we sendto() the packet.
+ */
+

[Qemu-devel] [PATCH 13/16] slirp: Generalizing and neutralizing various TCP functions before adding IPv6 stuff

2013-10-20 Thread Samuel Thibault

Basically, this patch adds switch statements to various TCP functions to
prepare them for the IPv6 case.

To have something to switch on in tcp_input() and tcp_respond(), a new
argument is used to give them the sa_family of the addresses they are
working on.

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/ip_input.c   |   2 +-
 slirp/slirp.c  |   6 ++-
 slirp/slirp.h  |   5 +-
 slirp/tcp_input.c  | 142 +
 slirp/tcp_output.c |  43 +---
 slirp/tcp_subr.c   |  94 +--
 slirp/tcp_timer.c  |   3 +-
 7 files changed, 181 insertions(+), 114 deletions(-)

diff --git a/slirp/ip_input.c b/slirp/ip_input.c
index 1925cdc..9aa8909 100644
--- a/slirp/ip_input.c
+++ b/slirp/ip_input.c
@@ -199,7 +199,7 @@ ip_input(struct mbuf *m)
 */
switch (ip-ip_p) {
 case IPPROTO_TCP:
-   tcp_input(m, hlen, (struct socket *)NULL);
+   tcp_input(m, hlen, (struct socket *)NULL, AF_INET);
break;
 case IPPROTO_UDP:
udp_input(m, hlen);
diff --git a/slirp/slirp.c b/slirp/slirp.c
index d0b8c79..0f6f006 100644
--- a/slirp/slirp.c
+++ b/slirp/slirp.c
@@ -576,7 +576,8 @@ void slirp_pollfds_poll(GArray *pollfds, int select_error)
 /*
  * Continue tcp_input
  */
-tcp_input((struct mbuf *)NULL, sizeof(struct ip), so);
+tcp_input((struct mbuf *)NULL, sizeof(struct ip), so,
+so-so_ffamily);
 /* continue; */
 } else {
 ret = sowrite(so);
@@ -625,7 +626,8 @@ void slirp_pollfds_poll(GArray *pollfds, int select_error)
 }
 
 }
-tcp_input((struct mbuf *)NULL, sizeof(struct ip), so);
+tcp_input((struct mbuf *)NULL, sizeof(struct ip), so,
+so-so_ffamily);
 } /* SS_ISFCONNECTING */
 #endif
 }
diff --git a/slirp/slirp.h b/slirp/slirp.h
index aa44055..b6e805e 100644
--- a/slirp/slirp.h
+++ b/slirp/slirp.h
@@ -352,7 +352,7 @@ void ip6_input(struct mbuf *);
 int ip6_output(struct socket *, struct mbuf *, int fast);
 
 /* tcp_input.c */
-void tcp_input(register struct mbuf *, int, struct socket *);
+void tcp_input(register struct mbuf *, int, struct socket *, sa_family_t af);
 int tcp_mss(register struct tcpcb *, u_int);
 
 /* tcp_output.c */
@@ -363,7 +363,8 @@ void tcp_setpersist(register struct tcpcb *);
 void tcp_init(Slirp *);
 void tcp_cleanup(Slirp *);
 void tcp_template(struct tcpcb *);
-void tcp_respond(struct tcpcb *, register struct tcpiphdr *, register struct 
mbuf *, tcp_seq, tcp_seq, int);
+void tcp_respond(struct tcpcb *, register struct tcpiphdr *,
+register struct mbuf *, tcp_seq, tcp_seq, int, sa_family_t);
 struct tcpcb * tcp_newtcpcb(struct socket *);
 struct tcpcb * tcp_close(register struct tcpcb *);
 void tcp_sockclosed(struct tcpcb *);
diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c
index dde89b6..3409557 100644
--- a/slirp/tcp_input.c
+++ b/slirp/tcp_input.c
@@ -213,7 +213,7 @@ present:
  * protocol specification dated September, 1981 very closely.
  */
 void
-tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
+tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af)
 {
struct ip save_ip, *ip;
register struct tcpiphdr *ti;
@@ -254,46 +254,53 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
}
slirp = m-slirp;
 
-   if (iphlen  sizeof(struct ip )) {
- ip_stripoptions(m, (struct mbuf *)0);
- iphlen=sizeof(struct ip );
-   }
-   /* XXX Check if too short */
+   switch (af) {
+   case AF_INET:
+   if (iphlen  sizeof(struct ip)) {
+   ip_stripoptions(m, (struct mbuf *)0);
+   iphlen = sizeof(struct ip);
+   }
+   /* XXX Check if too short */
 
 
-   /*
-* Save a copy of the IP header in case we want restore it
-* for sending an ICMP error message in response.
-*/
-   ip=mtod(m, struct ip *);
-   save_ip = *ip;
-   save_ip.ip_len+= iphlen;
+   /*
+* Save a copy of the IP header in case we want restore it
+* for sending an ICMP error message in response.
+*/
+   ip = mtod(m, struct ip *);
+   save_ip = *ip;
+   save_ip.ip_len += iphlen;
 
-   /*
-* Get IP and TCP header together in first mbuf.
-* Note: IP leaves IP header in first mbuf.
-*/
-   m-m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip)
-+ sizeof(struct tcphdr));
-   m-m_len += sizeof(struct tcpiphdr) - (sizeof(struct ip)
-   + sizeof(struct

[Qemu-devel] [PATCH 12/16] slirp: Factorizing tcpiphdr structure with an union

2013-10-20 Thread Samuel Thibault

This patch refactors the tcpiphdr structure to put the IPv4 fields in
a union, so that IPv6 fields can be added in a later patch.
Macros are used to keep the existing code backward compatible.

This patch also fixes the SLIRP_MSIZE and margin computation in various
functions, and makes them compatible with the new tcpiphdr structure,
whose size will be bigger than sizeof(struct tcphdr) + sizeof(struct ip).

Signed-off-by: Guillaume Subiron maet...@subiron.org
Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org
---
 slirp/if.h |  4 ++--
 slirp/mbuf.c   |  3 ++-
 slirp/slirp.c  | 15 ---
 slirp/socket.c | 13 -
 slirp/tcp_input.c  | 31 ---
 slirp/tcp_output.c | 18 +-
 slirp/tcp_subr.c   | 31 ++-
 slirp/tcpip.h  | 31 +++
 8 files changed, 102 insertions(+), 44 deletions(-)

diff --git a/slirp/if.h b/slirp/if.h
index 3327023..c7a5c57 100644
--- a/slirp/if.h
+++ b/slirp/if.h
@@ -17,7 +17,7 @@
 #define IF_MRU 1500
 #defineIF_COMP IF_AUTOCOMP /* Flags for compression */
 
-/* 2 for alignment, 14 for ethernet, 40 for TCP/IP */
-#define IF_MAXLINKHDR (2 + 14 + 40)
+/* 2 for alignment, 14 for ethernet */
+#define IF_MAXLINKHDR (2 + ETH_HLEN)
 
 #endif
diff --git a/slirp/mbuf.c b/slirp/mbuf.c
index 92c429e..87ee550 100644
--- a/slirp/mbuf.c
+++ b/slirp/mbuf.c
@@ -23,7 +23,8 @@
  * Find a nice value for msize
  * XXX if_maxlinkhdr already in mtu
  */
-#define SLIRP_MSIZE (IF_MTU + IF_MAXLINKHDR + offsetof(struct mbuf, m_dat) + 6)
+#define SLIRP_MSIZE\
+(offsetof(struct mbuf, m_dat) + IF_MAXLINKHDR + TCPIPHDR_DELTA + IF_MTU)
 
 void
 m_init(Slirp *slirp)
diff --git a/slirp/slirp.c b/slirp/slirp.c
index 2caddcd..d0b8c79 100644
--- a/slirp/slirp.c
+++ b/slirp/slirp.c
@@ -756,15 +756,16 @@ void slirp_input(Slirp *slirp, const uint8_t *pkt, int 
pkt_len)
 m = m_get(slirp);
 if (!m)
 return;
-/* Note: we add to align the IP header */
-if (M_FREEROOM(m)  pkt_len + 2) {
-m_inc(m, pkt_len + 2);
+/* Note: we add 2 to align the IP header on 4 bytes,
+ * and add the margin for the tcpiphdr overhead  */
+if (M_FREEROOM(m)  pkt_len + TCPIPHDR_DELTA + 2) {
+m_inc(m, pkt_len + TCPIPHDR_DELTA + 2);
 }
-m-m_len = pkt_len + 2;
-memcpy(m-m_data + 2, pkt, pkt_len);
+m-m_len = pkt_len + TCPIPHDR_DELTA + 2;
+memcpy(m-m_data + TCPIPHDR_DELTA + 2, pkt, pkt_len);
 
-m-m_data += 2 + ETH_HLEN;
-m-m_len -= 2 + ETH_HLEN;
+m-m_data += TCPIPHDR_DELTA + 2 + ETH_HLEN;
+m-m_len -= TCPIPHDR_DELTA + 2 + ETH_HLEN;
 
 if (proto == ETH_P_IP) {
 ip_input(m);
diff --git a/slirp/socket.c b/slirp/socket.c
index 31bbb7e..567f9bc 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -482,7 +482,18 @@ sorecvfrom(struct socket *so)
  if (!m) {
  return;
  }
- m-m_data += IF_MAXLINKHDR;
+ switch (so-so_ffamily) {
+ case AF_INET:
+ m-m_data += IF_MAXLINKHDR + sizeof(struct udpiphdr);
+ break;
+ case AF_INET6:
+ m-m_data += IF_MAXLINKHDR + sizeof(struct ip6)
++ sizeof(struct udphdr);
+ break;
+ default:
+ assert(0);
+ break;
+ }
 
  /*
   * XXX Shouldn't FIONREAD packets destined for port 53,
diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c
index 25929bd..dde89b6 100644
--- a/slirp/tcp_input.c
+++ b/slirp/tcp_input.c
@@ -254,11 +254,6 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
}
slirp = m-slirp;
 
-   /*
-* Get IP and TCP header together in first mbuf.
-* Note: IP leaves IP header in first mbuf.
-*/
-   ti = mtod(m, struct tcpiphdr *);
if (iphlen  sizeof(struct ip )) {
  ip_stripoptions(m, (struct mbuf *)0);
  iphlen=sizeof(struct ip );
@@ -275,14 +270,28 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
save_ip.ip_len+= iphlen;
 
/*
+* Get IP and TCP header together in first mbuf.
+* Note: IP leaves IP header in first mbuf.
+*/
+   m-m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip)
++ sizeof(struct tcphdr));
+   m-m_len += sizeof(struct tcpiphdr) - (sizeof(struct ip)
+   + sizeof(struct tcphdr));
+   ti = mtod(m, struct tcpiphdr *);
+
+   /*
 * Checksum extended TCP header and data.
 */
-   tlen = ((struct ip *)ti)-ip_len;
-tcpiphdr2qlink(ti)-next = tcpiphdr2qlink(ti)-prev = NULL;
-memset(ti-ti_i.ih_mbuf, 0 , sizeof(struct mbuf_ptr));
-   ti-ti_x1 = 0;
+   tlen = ip-ip_len;
+   tcpiphdr2qlink(ti)-next =

[Qemu-devel] [PATCH 04/16] slirp: Adding IPv6, ICMPv6 Echo and NDP autoconfiguration

2013-10-20 Thread Samuel Thibault

This patch adds the functions needed to handle IPv6 packets. ICMPv6 and
NDP headers are implemented.

Slirp is now able to send an NDP Router or Neighbor Advertisement when it
receives a Router or Neighbor Solicitation. Using a 64-bit IPv6
prefix, the guest is now able to perform stateless address autoconfiguration
(SLAAC) and compute its IPv6 address.

This patch adds an ndp_table, mainly inspired by arp_table, to keep an
NDP cache and manage network address resolution.
Slirp regularly sends NDP Neighbor Advertisement, as recommended by the
RFC, to make the guest refresh its route.

It also adds ip6_cksum() to compute ICMPv6 checksums using the IPv6
pseudo-header.

Signed-off-by: Guillaume Subiron maet...@subiron.org
Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org
---
 slirp/Makefile.objs |   4 +-
 slirp/cksum.c   |  23 
 slirp/ip6.h | 139 +
 slirp/ip6_icmp.c| 350 
 slirp/ip6_icmp.h| 247 
 slirp/ip6_input.c   |  75 +++
 slirp/ip6_output.c  |  41 ++
 slirp/ndp_table.c   |  87 +
 slirp/slirp.c   |  47 +--
 slirp/slirp.h   |  33 +
 10 files changed, 1036 insertions(+), 10 deletions(-)
 create mode 100644 slirp/ip6.h
 create mode 100644 slirp/ip6_icmp.c
 create mode 100644 slirp/ip6_icmp.h
 create mode 100644 slirp/ip6_input.c
 create mode 100644 slirp/ip6_output.c
 create mode 100644 slirp/ndp_table.c

diff --git a/slirp/Makefile.objs b/slirp/Makefile.objs
index 2daa9dc..2dfe8e0 100644
--- a/slirp/Makefile.objs
+++ b/slirp/Makefile.objs
@@ -1,3 +1,3 @@
-common-obj-y = cksum.o if.o ip_icmp.o ip_input.o ip_output.o dnssearch.o
+common-obj-y = cksum.o if.o ip_icmp.o ip6_icmp.o ip6_input.o ip6_output.o 
ip_input.o ip_output.o dnssearch.o
 common-obj-y += slirp.o mbuf.o misc.o sbuf.o socket.o tcp_input.o tcp_output.o
-common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o
+common-obj-y += tcp_subr.o tcp_timer.o udp.o bootp.o tftp.o arp_table.o 
ndp_table.o
diff --git a/slirp/cksum.c b/slirp/cksum.c
index 6328660..f0a1398 100644
--- a/slirp/cksum.c
+++ b/slirp/cksum.c
@@ -137,3 +137,26 @@ cont:
REDUCE;
return (~sum  0x);
 }
+
+int ip6_cksum(struct mbuf *m)
+{
+struct ip6 save_ip, *ip = mtod(m, struct ip6 *);
+struct ip6_pseudohdr *ih = mtod(m, struct ip6_pseudohdr *);
+int sum;
+
+save_ip = *ip;
+
+ih-ih_src = save_ip.ip_src;
+ih-ih_dst = save_ip.ip_dst;
+ih-ih_pl = htonl((uint32_t)ntohs(save_ip.ip_pl));
+ih-ih_zero_hi = 0;
+ih-ih_zero_lo = 0;
+ih-ih_nh = save_ip.ip_nh;
+
+sum = cksum(m, ((int)sizeof(struct ip6_pseudohdr))
++ ntohl(ih-ih_pl));
+
+*ip = save_ip;
+
+return sum;
+}
diff --git a/slirp/ip6.h b/slirp/ip6.h
new file mode 100644
index 000..16124ec
--- /dev/null
+++ b/slirp/ip6.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2013
+ * Guillaume Subiron, Yann Bordenave, Serigne Modou Wagne.
+ *
+ * Please read the file COPYRIGHT for the
+ * terms and conditions of the copyright.
+ */
+
+#ifndef _IP6_H_
+#define _IP6_H_
+
+#define in6_multicast(a) IN6_IS_ADDR_MULTICAST((a))
+#define in6_linklocal(a) IN6_IS_ADDR_LINKLOCAL((a))
+#define in6_unspecified(a) IN6_IS_ADDR_UNSPECIFIED((a))
+
+#define ALLNODES_MULTICAST  { .s6_addr = \
+{ 0xff, 0x02, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x01 } }
+
+#define SOLICITED_NODE_PREFIX { .s6_addr = \
+{ 0xff, 0x02, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x01,\
+0xff, 0x00, 0x00, 0x00 } }
+
+#define LINKLOCAL_ADDR  { .s6_addr = \
+{ 0xfe, 0x80, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x00,\
+0x00, 0x00, 0x00, 0x01 } }
+
+static inline int in6_equal(struct in6_addr a, struct in6_addr b)
+{
+return memcmp(a, b, sizeof(a)) == 0;
+}
+
+static inline int in6_equal_net(struct in6_addr a, struct in6_addr b,
+int prefix_len)
+{
+if (memcmp(a, b, prefix_len / 8) != 0) {
+return 0;
+}
+
+if (prefix_len % 8 == 0) {
+return 1;
+}
+
+return (a.s6_addr[prefix_len / 8]  (8 - (prefix_len % 8)))
+== (b.s6_addr[prefix_len / 8]  (8 - (prefix_len % 8)));
+}
+
+static inline int in6_equal_mach(struct in6_addr a, struct in6_addr b,
+int prefix_len)
+{
+if (memcmp((a.s6_addr[(prefix_len + 7) / 8]),
+(b.s6_addr[(prefix_len + 7) / 8]),
+16 - (prefix_len + 7) / 8) != 0) {
+return 0;
+}
+
+if (prefix_len % 8 == 0) {
+return 1;
+}
+
+return (a.s6_addr[prefix_len / 8]  ((1U  (8 - (prefix_len % 8))) -

[Qemu-devel] [PATCH 06/16] slirp: Make Socket structure IPv6 compatible

2013-10-20 Thread Samuel Thibault

This patch replaces the foreign and local address/port pairs in the Socket
structure with two sockaddr_storage fields, which can be cast to sockaddr_in or
sockaddr_in6.
Direct access to the address and port is still possible thanks to some
#define macros, so backward compatibility with the existing code is preserved.

The ss_family field of sockaddr_storage is set after each socket
creation.

The whole structure is also saved/restored when a Qemu session is
saved/restored.

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/ip_icmp.c   |  2 ++
 slirp/slirp.c | 48 
 slirp/socket.c| 14 +++---
 slirp/socket.h| 25 +
 slirp/tcp_input.c |  2 ++
 slirp/tcp_subr.c  |  2 ++
 slirp/udp.c   |  4 
 7 files changed, 82 insertions(+), 15 deletions(-)

diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c
index 1808976..768ea4a 100644
--- a/slirp/ip_icmp.c
+++ b/slirp/ip_icmp.c
@@ -170,8 +170,10 @@ icmp_input(struct mbuf *m, int hlen)
goto end_error;
   }
   so-so_m = m;
+  so-so_ffamily = AF_INET;
   so-so_faddr = ip-ip_dst;
   so-so_fport = htons(7);
+  so-so_lfamily = AF_INET;
   so-so_laddr = ip-ip_src;
   so-so_lport = htons(9);
   so-so_iptos = ip-ip_tos;
diff --git a/slirp/slirp.c b/slirp/slirp.c
index 1533d31..2caddcd 100644
--- a/slirp/slirp.c
+++ b/slirp/slirp.c
@@ -1056,10 +1056,26 @@ static void slirp_sbuf_save(QEMUFile *f, struct sbuf 
*sbuf)
 static void slirp_socket_save(QEMUFile *f, struct socket *so)
 {
 qemu_put_be32(f, so-so_urgc);
-qemu_put_be32(f, so-so_faddr.s_addr);
-qemu_put_be32(f, so-so_laddr.s_addr);
-qemu_put_be16(f, so-so_fport);
-qemu_put_be16(f, so-so_lport);
+qemu_put_be16(f, so-so_ffamily);
+switch (so-so_ffamily) {
+case AF_INET:
+qemu_put_be32(f, so-so_faddr.s_addr);
+qemu_put_be16(f, so-so_fport);
+break;
+default:
+fprintf(stderr,
+so_ffamily unknown, unable to save so_faddr and so_fport\n);
+}
+qemu_put_be16(f, so-so_lfamily);
+switch (so-so_lfamily) {
+case AF_INET:
+qemu_put_be32(f, so-so_laddr.s_addr);
+qemu_put_be16(f, so-so_lport);
+break;
+default:
+fprintf(stderr,
+so_ffamily unknown, unable to save so_laddr and so_lport\n);
+}
 qemu_put_byte(f, so-so_iptos);
 qemu_put_byte(f, so-so_emu);
 qemu_put_byte(f, so-so_type);
@@ -1179,10 +1195,26 @@ static int slirp_socket_load(QEMUFile *f, struct socket 
*so)
 return -ENOMEM;
 
 so-so_urgc = qemu_get_be32(f);
-so-so_faddr.s_addr = qemu_get_be32(f);
-so-so_laddr.s_addr = qemu_get_be32(f);
-so-so_fport = qemu_get_be16(f);
-so-so_lport = qemu_get_be16(f);
+so-so_ffamily = qemu_get_be16(f);
+switch (so-so_ffamily) {
+case AF_INET:
+so-so_faddr.s_addr = qemu_get_be32(f);
+so-so_fport = qemu_get_be16(f);
+break;
+default:
+fprintf(stderr,
+so_ffamily unknown, unable to restore so_faddr and 
so_lport\n);
+}
+so-so_lfamily = qemu_get_be16(f);
+switch (so-so_lfamily) {
+case AF_INET:
+so-so_laddr.s_addr = qemu_get_be32(f);
+so-so_lport = qemu_get_be16(f);
+break;
+default:
+fprintf(stderr,
+so_ffamily unknown, unable to restore so_laddr and 
so_lport\n);
+}
 so-so_iptos = qemu_get_byte(f);
 so-so_emu = qemu_get_byte(f);
 so-so_type = qemu_get_byte(f);
diff --git a/slirp/socket.c b/slirp/socket.c
index e87c70e..2f166fb 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -437,8 +437,8 @@ sowrite(struct socket *so)
 void
 sorecvfrom(struct socket *so)
 {
-   struct sockaddr_in addr;
-   socklen_t addrlen = sizeof(struct sockaddr_in);
+   struct sockaddr_storage addr;
+   socklen_t addrlen = sizeof(struct sockaddr_storage);
 
DEBUG_CALL(sorecvfrom);
DEBUG_ARG(so = %lx, (long)so);
@@ -527,7 +527,13 @@ sorecvfrom(struct socket *so)
 * If this packet was destined for CTL_ADDR,
 * make it look like that's where it came from, done by udp_output
 */
-   udp_output(so, m, addr);
+   switch (so-so_ffamily) {
+   case AF_INET:
+   udp_output(so, m, (struct sockaddr_in *) addr);
+   break;
+   default:
+   break;
+   }
  } /* rx error */
} /* if ping packet */
 }
@@ -619,6 +625,7 @@ tcp_listen(Slirp *slirp, uint32_t haddr, u_int hport, 
uint32_t laddr,
 
so-so_state = SS_PERSISTENT_MASK;
so-so_state |= (SS_FACCEPTCONN | flags);
+   so-so_lfamily = AF_INET;
so-so_lport = lport; /* Kept in network format */
so-so_laddr.s_addr = laddr; /* Ditto */
 
@@ -645,6 +652,7 @@ tcp_listen(Slirp *slirp, uint32_t haddr, u_int hport, 
uint32_t laddr,
qemu_setsockopt(s, SOL_SOCKET, SO_OOBINLINE, opt, sizeof(int));

[Qemu-devel] [PATCH 14/16] slirp: Handle IPv6 in TCP functions

2013-10-20 Thread Samuel Thibault

This patch adds the IPv6 case to the TCP functions refactored by the previous
patches.
It also adds the IPv6 pseudo-header to the tcpiphdr structure.
Finally, tcp_input() is now called by ip6_input().

Signed-off-by: Guillaume Subiron maet...@subiron.org
Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org
---
 slirp/ip6_input.c  |  4 ++--
 slirp/tcp.h|  2 ++
 slirp/tcp_input.c  | 58 +-
 slirp/tcp_output.c | 16 +++
 slirp/tcp_subr.c   | 36 +
 slirp/tcpip.h  |  9 +
 6 files changed, 105 insertions(+), 20 deletions(-)

diff --git a/slirp/ip6_input.c b/slirp/ip6_input.c
index 3290af8..b03b795 100644
--- a/slirp/ip6_input.c
+++ b/slirp/ip6_input.c
@@ -58,8 +58,8 @@ void ip6_input(struct mbuf *m)
  */
 switch (ip6-ip_nh) {
 case IPPROTO_TCP:
-/* :TODO:maethor:130307: TCP */
-icmp6_send_error(m, ICMP6_UNREACH, ICMP6_UNREACH_NO_ROUTE);
+NTOHS(ip6-ip_pl);
+tcp_input(m, sizeof(struct ip6), (struct socket *)NULL, AF_INET6);
 break;
 case IPPROTO_UDP:
 udp6_input(m);
diff --git a/slirp/tcp.h b/slirp/tcp.h
index 2e2b403..61befcd 100644
--- a/slirp/tcp.h
+++ b/slirp/tcp.h
@@ -106,6 +106,8 @@ struct tcphdr {
  */
 #undef TCP_MSS
 #defineTCP_MSS 1460
+#undef TCP6_MSS
+#define TCP6_MSS 1440
 
 #undef TCP_MAXWIN
 #defineTCP_MAXWIN  65535   /* largest value for (unscaled) window 
*/
diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c
index 3409557..e5056f8 100644
--- a/slirp/tcp_input.c
+++ b/slirp/tcp_input.c
@@ -215,7 +215,8 @@ present:
 void
 tcp_input(struct mbuf *m, int iphlen, struct socket *inso, sa_family_t af)
 {
-   struct ip save_ip, *ip;
+   struct ip save_ip, *ip;
+   struct ip6 save_ip6, *ip6;
register struct tcpiphdr *ti;
caddr_t optp = NULL;
int optlen = 0;
@@ -254,6 +255,11 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso, 
sa_family_t af)
}
slirp = m-slirp;
 
+   ip = mtod(m, struct ip *);
+   ip6 = mtod(m, struct ip6 *);
+   save_ip = *ip;
+   save_ip6 = *ip6;
+
switch (af) {
case AF_INET:
if (iphlen  sizeof(struct ip)) {
@@ -262,13 +268,6 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso, 
sa_family_t af)
}
/* XXX Check if too short */
 
-
-   /*
-* Save a copy of the IP header in case we want restore it
-* for sending an ICMP error message in response.
-*/
-   ip = mtod(m, struct ip *);
-   save_ip = *ip;
save_ip.ip_len += iphlen;
 
/*
@@ -293,16 +292,35 @@ tcp_input(struct mbuf *m, int iphlen, struct socket 
*inso, sa_family_t af)
ti-ti_dst = save_ip.ip_dst;
ti-ti_pr = save_ip.ip_p;
ti-ti_len = htons((uint16_t)tlen);
-   len = ((sizeof(struct tcpiphdr) - sizeof(struct tcphdr)) + tlen);
-   if (cksum(m, len)) {
-   goto drop;
-   }
+   break;
+
+   case AF_INET6:
+   m-m_data -= sizeof(struct tcpiphdr) - (sizeof(struct ip6)
++ sizeof(struct tcphdr));
+   m-m_len  += sizeof(struct tcpiphdr) - (sizeof(struct ip6)
++ sizeof(struct tcphdr));
+   ti = mtod(m, struct tcpiphdr *);
+
+   tlen = ip6-ip_pl;
+   tcpiphdr2qlink(ti)-next = tcpiphdr2qlink(ti)-prev = NULL;
+   memset(ti-ih_mbuf, 0 , sizeof(struct mbuf_ptr));
+   memset(ti-ti, 0, sizeof(ti-ti));
+   ti-ti_x0 = 0;
+   ti-ti_src6 = save_ip6.ip_src;
+   ti-ti_dst6 = save_ip6.ip_dst;
+   ti-ti_nh6 = save_ip6.ip_nh;
+   ti-ti_len = htons((uint16_t)tlen);
break;
 
default:
goto drop;
}
 
+   len = ((sizeof(struct tcpiphdr) - sizeof(struct tcphdr)) + tlen);
+   if (cksum(m, len)) {
+   goto drop;
+   }
+
/*
 * Check that TCP offset makes sense,
 * pull out TCP options and adjust length.  XXX
@@ -346,6 +364,12 @@ findso:
((struct sockaddr_in *)fhost)-sin_addr = ti-ti_dst;
((struct sockaddr_in *)fhost)-sin_port = ti-ti_dport;
break;
+   case AF_INET6:
+   ((struct sockaddr_in6 *)lhost)-sin6_addr = ti-ti_src6;
+   ((struct sockaddr_in6 *)lhost)-sin6_port = ti-ti_sport;
+   ((struct sockaddr_in6 *)fhost)-sin6_addr = ti-ti_dst6;
+   ((struct sockaddr_in6 *)fhost)-sin6_port = ti-ti_dport;
+   break;
default:
goto drop;
}
@@ -405,7 +429,6 @@ findso:
  so-so_iptos = ((struct ip *)ti)-ip_tos;
  break;
  default:
- goto drop;
  break;
  }
  }
@@ -634,6 +657,9 @@ findso:
  case AF_INET:

[Qemu-devel] [PATCH 02/16] slirp: Generalizing and neutralizing code before adding IPv6 stuff

2013-10-20 Thread Samuel Thibault

Basically, this patch replaces "arp" with "resolution" wherever "arp"
means MAC address resolution in general and not specifically ARP.

Some indentation problems are solved in functions that will be modified
in the next patches (ip_input…).

In if_encap(), a switch is added to prepare for the IPv6 case. Some code
is factored out.

Some #define ETH_* are moved higher up in slirp.h to make them accessible to
other slirp/*.h headers.

Signed-off-by: Guillaume Subiron maet...@subiron.org
Signed-off-by: Samuel Thibault samuel.thiba...@ens-lyon.org
---
 slirp/if.c|   2 +-
 slirp/mbuf.c  |   2 +-
 slirp/mbuf.h  |   2 +-
 slirp/slirp.c | 107 ++
 slirp/slirp.h |  12 +++
 5 files changed, 71 insertions(+), 54 deletions(-)

diff --git a/slirp/if.c b/slirp/if.c
index 87ca8a5..c138ff4 100644
--- a/slirp/if.c
+++ b/slirp/if.c
@@ -193,7 +193,7 @@ void if_start(Slirp *slirp)
 
 /* Try to send packet unless it already expired */
 if (ifm-expiration_date = now  !if_encap(slirp, ifm)) {
-/* Packet is delayed due to pending ARP resolution */
+/* Packet is delayed due to pending ARP or NDP resolution */
 continue;
 }
 
diff --git a/slirp/mbuf.c b/slirp/mbuf.c
index 4fefb04..92c429e 100644
--- a/slirp/mbuf.c
+++ b/slirp/mbuf.c
@@ -91,7 +91,7 @@ m_get(Slirp *slirp)
m-m_len = 0;
 m-m_nextpkt = NULL;
 m-m_prevpkt = NULL;
-m-arp_requested = false;
+m-resolution_requested = false;
 m-expiration_date = (uint64_t)-1;
 end_error:
DEBUG_ARG(m = %lx, (long )m);
diff --git a/slirp/mbuf.h b/slirp/mbuf.h
index b144f1c..38fedf4 100644
--- a/slirp/mbuf.h
+++ b/slirp/mbuf.h
@@ -79,7 +79,7 @@ struct mbuf {
int m_len;  /* Amount of data in this mbuf */
 
Slirp *slirp;
-   boolarp_requested;
+   boolresolution_requested;
uint64_t expiration_date;
/* start of dynamic buffer area, must be last element */
union {
diff --git a/slirp/slirp.c b/slirp/slirp.c
index bad8dad..bfc4832 100644
--- a/slirp/slirp.c
+++ b/slirp/slirp.c
@@ -778,53 +778,70 @@ int if_encap(Slirp *slirp, struct mbuf *ifm)
 return 1;
 }
 
-if (!arp_table_search(slirp, iph-ip_dst.s_addr, ethaddr)) {
-uint8_t arp_req[ETH_HLEN + sizeof(struct arphdr)];
-struct ethhdr *reh = (struct ethhdr *)arp_req;
-struct arphdr *rah = (struct arphdr *)(arp_req + ETH_HLEN);
-
-if (!ifm-arp_requested) {
-/* If the client addr is not known, send an ARP request */
-memset(reh-h_dest, 0xff, ETH_ALEN);
-memcpy(reh-h_source, special_ethaddr, ETH_ALEN - 4);
-memcpy(reh-h_source[2], slirp-vhost_addr, 4);
-reh-h_proto = htons(ETH_P_ARP);
-rah-ar_hrd = htons(1);
-rah-ar_pro = htons(ETH_P_IP);
-rah-ar_hln = ETH_ALEN;
-rah-ar_pln = 4;
-rah-ar_op = htons(ARPOP_REQUEST);
-
-/* source hw addr */
-memcpy(rah-ar_sha, special_ethaddr, ETH_ALEN - 4);
-memcpy(rah-ar_sha[2], slirp-vhost_addr, 4);
-
-/* source IP */
-rah-ar_sip = slirp-vhost_addr.s_addr;
-
-/* target hw addr (none) */
-memset(rah-ar_tha, 0, ETH_ALEN);
-
-/* target IP */
-rah-ar_tip = iph-ip_dst.s_addr;
-slirp-client_ipaddr = iph-ip_dst;
-slirp_output(slirp-opaque, arp_req, sizeof(arp_req));
-ifm-arp_requested = true;
-
-/* Expire request and drop outgoing packet after 1 second */
-ifm-expiration_date = qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + 
10ULL;
+switch (iph-ip_v) {
+case IPVERSION:
+if (!arp_table_search(slirp, iph-ip_dst.s_addr, ethaddr)) {
+uint8_t arp_req[ETH_HLEN + sizeof(struct arphdr)];
+struct ethhdr *reh = (struct ethhdr *)arp_req;
+struct arphdr *rah = (struct arphdr *)(arp_req + ETH_HLEN);
+
+if (!ifm-resolution_requested) {
+/* If the client addr is not known, send an ARP request */
+memset(reh-h_dest, 0xff, ETH_ALEN);
+memcpy(reh-h_source, special_ethaddr, ETH_ALEN - 4);
+memcpy(reh-h_source[2], slirp-vhost_addr, 4);
+reh-h_proto = htons(ETH_P_ARP);
+rah-ar_hrd = htons(1);
+rah-ar_pro = htons(ETH_P_IP);
+rah-ar_hln = ETH_ALEN;
+rah-ar_pln = 4;
+rah-ar_op = htons(ARPOP_REQUEST);
+
+/* source hw addr */
+memcpy(rah-ar_sha, special_ethaddr, ETH_ALEN - 4);
+memcpy(rah-ar_sha[2], slirp-vhost_addr, 4);
+
+/* source IP */
+rah-ar_sip = slirp-vhost_addr.s_addr;
+
+/* target hw addr (none) */
+memset(rah-ar_tha, 0, ETH_ALEN);
+
+/* target IP */
+

[Qemu-devel] [PATCH 16/16] qapi-schema, qemu-options slirp: Adding Qemu options for IPv6 addresses

2013-10-20 Thread Samuel Thibault

This patch adds parameters to manage some new options of the qemu -net
command.
The Slirp IPv6 address, network prefix, and DNS IPv6 address can be given as
arguments to the qemu command.
The default values are, respectively, fc00::1, fc00::/64, and fc00::2.

Signed-off-by: Yann Bordenave m...@meowstars.org
---
 net/slirp.c  | 56 
 qapi-schema.json | 37 +++--
 qemu-options.hx  |  5 +++--
 slirp/libslirp.h |  8 +---
 slirp/slirp.c| 20 +---
 5 files changed, 88 insertions(+), 38 deletions(-)

diff --git a/net/slirp.c b/net/slirp.c
index 124e953..68f4aa9 100644
--- a/net/slirp.c
+++ b/net/slirp.c
@@ -134,17 +134,23 @@ static NetClientInfo net_slirp_info = {
 static int net_slirp_init(NetClientState *peer, const char *model,
   const char *name, int restricted,
   const char *vnetwork, const char *vhost,
+  const char *vprefix6, const char *vhost6,
   const char *vhostname, const char *tftp_export,
   const char *bootfile, const char *vdhcp_start,
-  const char *vnameserver, const char *smb_export,
-  const char *vsmbserver, const char **dnssearch)
+  const char *vnameserver, const char *vnameserver6,
+  const char *smb_export, const char *vsmbserver,
+  const char **dnssearch)
 {
-/* default settings according to historic slirp */
+/* default settings according to historic slirp and updated for IPv6 */
 struct in_addr net  = { .s_addr = htonl(0x0a000200) }; /* 10.0.2.0 */
 struct in_addr mask = { .s_addr = htonl(0xff00) }; /* 255.255.255.0 */
 struct in_addr host = { .s_addr = htonl(0x0a000202) }; /* 10.0.2.2 */
+struct in6_addr ip6_prefix;
+uint8_t ip6_prefix_len = 64;
+struct in6_addr ip6_host;
 struct in_addr dhcp = { .s_addr = htonl(0x0a00020f) }; /* 10.0.2.15 */
 struct in_addr dns  = { .s_addr = htonl(0x0a000203) }; /* 10.0.2.3 */
+struct in6_addr ip6_dns;
 #ifndef _WIN32
 struct in_addr smbsrv = { .s_addr = 0 };
 #endif
@@ -156,6 +162,11 @@ static int net_slirp_init(NetClientState *peer, const char 
*model,
 char *end;
 struct slirp_config_str *config;
 
+/* IPv6 defaults initialisations */
+inet_pton(AF_INET6, fc00::0, ip6_prefix);
+inet_pton(AF_INET6, fc00::1, ip6_host);
+inet_pton(AF_INET6, fc00::2, ip6_dns);
+
 if (!tftp_export) {
 tftp_export = legacy_tftp_prefix;
 }
@@ -228,6 +239,32 @@ static int net_slirp_init(NetClientState *peer, const char 
*model,
 return -1;
 }
 
+if (vprefix6) {
+if (get_str_sep(buf, sizeof(buf), vprefix6, '/')  0) {
+if (!inet_pton(AF_INET6, vprefix6, ip6_prefix)) {
+return -1;
+}
+} else {
+if (!inet_pton(AF_INET6, buf, ip6_prefix)) {
+return -1;
+}
+shift = strtol(vprefix6, end, 10);
+if (*end != '\0' || (shift  0  shift  129)) {
+ip6_prefix_len = shift;
+} else {
+return -1;
+}
+}
+}
+
+if (vhost6  !inet_pton(AF_INET6, vhost6, ip6_host)) {
+return -1;
+}
+
+if (vnameserver6  !inet_pton(AF_INET6, vnameserver6, ip6_dns)) {
+return -1;
+}
+
 #ifndef _WIN32
 if (vsmbserver  !inet_aton(vsmbserver, smbsrv)) {
 return -1;
@@ -242,8 +279,10 @@ static int net_slirp_init(NetClientState *peer, const char 
*model,
 
 s = DO_UPCAST(SlirpState, nc, nc);
 
-s-slirp = slirp_init(restricted, net, mask, host, vhostname,
-  tftp_export, bootfile, dhcp, dns, dnssearch, s);
+s-slirp = slirp_init(restricted, net, mask, host,
+  ip6_prefix, ip6_prefix_len, ip6_host,
+  vhostname, tftp_export, bootfile, dhcp,
+  dns, ip6_dns, dnssearch, s);
 QTAILQ_INSERT_TAIL(slirp_stacks, s, entry);
 
 for (config = slirp_configs; config; config = config-next) {
@@ -750,9 +789,10 @@ int net_init_slirp(const NetClientOptions *opts, const 
char *name,
 net_init_slirp_configs(user-guestfwd, 0);
 
 ret = net_slirp_init(peer, user, name, user-q_restrict, vnet,
- user-host, user-hostname, user-tftp,
- user-bootfile, user-dhcpstart, user-dns, user-smb,
- user-smbserver, dnssearch);
+ user-host, user-ip6_prefix, user-ip6_host,
+ user-hostname, user-tftp, user-bootfile,
+ user-dhcpstart, user-dns, user-ip6_dns,
+ user-smb, user-smbserver, dnssearch);
 
 while (slirp_configs) {
 config = slirp_configs;
diff --git a/qapi-schema.json

[Qemu-devel] [PATCH 07/16] slirp: Factorizing address translation

2013-10-20 Thread Samuel Thibault

This patch factors some duplicated code out into a new function,
sotranslate_out(). This function performs the address translation when a
packet is transmitted to the host network. If the packet is destined
for the host, the loopback address is used, and if the packet is
destined for the virtual DNS, the real DNS address is used. This code
is just a copy of the existing code, but factored out and ready to handle the
IPv6 case.

Following the same model, the major part of the udp_output() code is moved into a
new sotranslate_in(). This function is used directly in sorecvfrom(),
just as sotranslate_out() is in sosendto().
Since udp_output() is no longer needed, it is removed and udp_output2() is
renamed to udp_output(). This adds consistency with the udp6_output()
function introduced by later patches.

Signed-off-by: Guillaume Subiron maet...@subiron.org
---
 slirp/bootp.c|  2 +-
 slirp/ip_icmp.c  | 19 +++-
 slirp/socket.c   | 93 
 slirp/socket.h   |  3 ++
 slirp/tcp_subr.c | 24 +++
 slirp/tftp.c |  6 ++--
 slirp/udp.c  | 27 +---
 slirp/udp.h  |  3 +-
 8 files changed, 91 insertions(+), 86 deletions(-)

diff --git a/slirp/bootp.c b/slirp/bootp.c
index b7db9fa..03e2e42 100644
--- a/slirp/bootp.c
+++ b/slirp/bootp.c
@@ -319,7 +319,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t 
*bp)
 
 m-m_len = sizeof(struct bootp_t) -
 sizeof(struct ip) - sizeof(struct udphdr);
-udp_output2(NULL, m, saddr, daddr, IPTOS_LOWDELAY);
+udp_output(NULL, m, saddr, daddr, IPTOS_LOWDELAY);
 }
 
 void bootp_input(struct mbuf *m)
diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c
index 768ea4a..8787aae 100644
--- a/slirp/ip_icmp.c
+++ b/slirp/ip_icmp.c
@@ -157,7 +157,7 @@ icmp_input(struct mbuf *m, int hlen)
 goto freeit;
 } else {
   struct socket *so;
-  struct sockaddr_in addr;
+  struct sockaddr_storage addr;
   if ((so = socreate(slirp)) == NULL) goto freeit;
   if (icmp_send(so, m, hlen) == 0) {
 return;
@@ -181,20 +181,9 @@ icmp_input(struct mbuf *m, int hlen)
   so-so_state = SS_ISFCONNECTED;
 
   /* Send the packet */
-  addr.sin_family = AF_INET;
-  if ((so-so_faddr.s_addr  slirp-vnetwork_mask.s_addr) ==
-  slirp-vnetwork_addr.s_addr) {
-   /* It's an alias */
-   if (so-so_faddr.s_addr == slirp-vnameserver_addr.s_addr) {
- if (get_dns_addr(addr.sin_addr)  0)
-   addr.sin_addr = loopback_addr;
-   } else {
- addr.sin_addr = loopback_addr;
-   }
-  } else {
-   addr.sin_addr = so-so_faddr;
-  }
-  addr.sin_port = so-so_fport;
+  addr = so-fhost.ss;
+  sotranslate_out(so, addr);
+
   if(sendto(so-s, icmp_ping_msg, strlen(icmp_ping_msg), 0,
(struct sockaddr *)addr, sizeof(addr)) == -1) {
DEBUG_MISC((dfd,icmp_input udp sendto tx errno = %d-%s\n,
diff --git a/slirp/socket.c b/slirp/socket.c
index 2f166fb..375281c 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -438,6 +438,7 @@ void
 sorecvfrom(struct socket *so)
 {
struct sockaddr_storage addr;
+   struct sockaddr_storage saddr, daddr;
socklen_t addrlen = sizeof(struct sockaddr_storage);
 
DEBUG_CALL(sorecvfrom);
@@ -525,11 +526,17 @@ sorecvfrom(struct socket *so)
 
/*
 * If this packet was destined for CTL_ADDR,
-* make it look like that's where it came from, done by udp_output
+* make it look like that's where it came from
 */
+   saddr = addr;
+   sotranslate_in(so, saddr);
+   daddr = so-lhost.ss;
+
switch (so-so_ffamily) {
case AF_INET:
-   udp_output(so, m, (struct sockaddr_in *) addr);
+   udp_output(so, m, (struct sockaddr_in *) saddr,
+  (struct sockaddr_in *) daddr,
+  so-so_iptos);
break;
default:
break;
@@ -544,33 +551,20 @@ sorecvfrom(struct socket *so)
 int
 sosendto(struct socket *so, struct mbuf *m)
 {
-   Slirp *slirp = so-slirp;
int ret;
-   struct sockaddr_in addr;
+   struct sockaddr_storage addr;
 
DEBUG_CALL(sosendto);
DEBUG_ARG(so = %lx, (long)so);
DEBUG_ARG(m = %lx, (long)m);
 
-addr.sin_family = AF_INET;
-   if ((so-so_faddr.s_addr  slirp-vnetwork_mask.s_addr) ==
-   slirp-vnetwork_addr.s_addr) {
- /* It's an alias */
- if (so-so_faddr.s_addr == slirp-vnameserver_addr.s_addr) {
-   if (get_dns_addr(addr.sin_addr)  0)
- addr.sin_addr = loopback_addr;
- } else {
-   addr.sin_addr = loopback_addr;
- }
-   } else
- addr.sin_addr = so-so_faddr;
-   addr.sin_port = so-so_fport;
-
-   DEBUG_MISC((dfd,  sendto()ing, addr.sin_port=%d, 
addr.sin_addr.s_addr=%.16s\n, ntohs(addr.sin_port), inet_ntoa(addr.sin_addr)));

[Qemu-devel] [PATCHv5 02/17] block: add flags to bdrv_*_write_zeroes

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block-migration.c |2 +-
 block.c   |   20 +++-
 block/backup.c|3 ++-
 block/qcow2-cluster.c |2 +-
 block/qcow2.c |2 +-
 block/qed.c   |3 ++-
 block/raw_bsd.c   |5 +++--
 block/vmdk.c  |3 ++-
 include/block/block.h |4 ++--
 include/block/block_int.h |2 +-
 qemu-io-cmds.c|2 +-
 11 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index daf9ec1..713a8e3 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -780,7 +780,7 @@ static int block_load(QEMUFile *f, void *opaque, int 
version_id)
 }
 
 if (flags  BLK_MIG_FLAG_ZERO_BLOCK) {
-ret = bdrv_write_zeroes(bs, addr, nr_sectors);
+ret = bdrv_write_zeroes(bs, addr, nr_sectors, 0);
 } else {
 buf = g_malloc(BLOCK_SIZE);
 qemu_get_buffer(f, buf, BLOCK_SIZE);
diff --git a/block.c b/block.c
index eb11a07..3259429 100644
--- a/block.c
+++ b/block.c
@@ -79,7 +79,7 @@ static BlockDriverAIOCB 
*bdrv_co_aio_rw_vector(BlockDriverState *bs,
bool is_write);
 static void coroutine_fn bdrv_co_do_rw(void *opaque);
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors);
+int64_t sector_num, int nb_sectors, BdrvRequestFlags flags);
 
 static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
 QTAILQ_HEAD_INITIALIZER(bdrv_states);
@@ -2384,10 +2384,11 @@ int bdrv_writev(BlockDriverState *bs, int64_t 
sector_num, QEMUIOVector *qiov)
 return bdrv_rwv_co(bs, sector_num, qiov, true, 0);
 }
 
-int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
+int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+  int nb_sectors, BdrvRequestFlags flags)
 {
 return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true,
-  BDRV_REQ_ZERO_WRITE);
+  BDRV_REQ_ZERO_WRITE | flags);
 }
 
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
@@ -2569,7 +2570,7 @@ static int coroutine_fn 
bdrv_co_do_copy_on_readv(BlockDriverState *bs,
 if (drv-bdrv_co_write_zeroes 
 buffer_is_zero(bounce_buffer, iov.iov_len)) {
 ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num,
-  cluster_nb_sectors);
+  cluster_nb_sectors, 0);
 } else {
 /* This does not change the data on the disk, it is not necessary
  * to flush even in cache=writethrough mode.
@@ -2703,7 +2704,7 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState 
*bs,
 }
 
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
-int64_t sector_num, int nb_sectors)
+int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
 BlockDriver *drv = bs-drv;
 QEMUIOVector qiov;
@@ -2715,7 +2716,7 @@ static int coroutine_fn 
bdrv_co_do_write_zeroes(BlockDriverState *bs,
 
 /* First try the efficient write zeroes operation */
 if (drv-bdrv_co_write_zeroes) {
-ret = drv-bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
+ret = drv-bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
 if (ret != -ENOTSUP) {
 return ret;
 }
@@ -2770,7 +2771,7 @@ static int coroutine_fn 
bdrv_co_do_writev(BlockDriverState *bs,
 if (ret  0) {
 /* Do nothing, write notifier decided to fail this request */
 } else if (flags  BDRV_REQ_ZERO_WRITE) {
-ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors);
+ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags);
 } else {
 ret = drv-bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 }
@@ -2804,12 +2805,13 @@ int coroutine_fn bdrv_co_writev(BlockDriverState *bs, 
int64_t sector_num,
 }
 
 int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs,
-  int64_t sector_num, int nb_sectors)
+  int64_t sector_num, int nb_sectors,
+  BdrvRequestFlags flags)
 {
 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
 
 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
- BDRV_REQ_ZERO_WRITE);
+ BDRV_REQ_ZERO_WRITE | flags);
 }
 
 /**
diff --git a/block/backup.c b/block/backup.c
index cad14c9..830a179 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -138,7 +138,8 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
 
 if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
 ret = bdrv_co_write_zeroes(job-target,
-   start * BACKUP_SECTORS_PER_CLUSTER, n);
+

[Qemu-devel] [PATCHv5 01/17] block: make BdrvRequestFlags public

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block.c   |5 -
 include/block/block.h |5 +
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/block.c b/block.c
index fd05a80..eb11a07 100644
--- a/block.c
+++ b/block.c
@@ -51,11 +51,6 @@
 
 #define NOT_DONE 0x7fff /* used while emulated sync operation in progress 
*/
 
-typedef enum {
-BDRV_REQ_COPY_ON_READ = 0x1,
-BDRV_REQ_ZERO_WRITE   = 0x2,
-} BdrvRequestFlags;
-
 static void bdrv_dev_change_media_cb(BlockDriverState *bs, bool load);
 static BlockDriverAIOCB *bdrv_aio_readv_em(BlockDriverState *bs,
 int64_t sector_num, QEMUIOVector *qiov, int nb_sectors,
diff --git a/include/block/block.h b/include/block/block.h
index 3560deb..ba2082c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -62,6 +62,11 @@ typedef struct BlockDevOps {
 void (*resize_cb)(void *opaque);
 } BlockDevOps;
 
+typedef enum {
+BDRV_REQ_COPY_ON_READ = 0x1,
+BDRV_REQ_ZERO_WRITE   = 0x2,
+} BdrvRequestFlags;
+
 #define BDRV_O_RDWR0x0002
 #define BDRV_O_SNAPSHOT0x0008 /* open the file read only and save writes 
in a snapshot */
 #define BDRV_O_NOCACHE 0x0020 /* do not use the host page cache */
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 03/17] block: introduce BDRV_REQ_MAY_UNMAP request flag

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block-migration.c |3 ++-
 block.c   |4 
 block/backup.c|2 +-
 include/block/block.h |7 +++
 4 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/block-migration.c b/block-migration.c
index 713a8e3..fc4ef93 100644
--- a/block-migration.c
+++ b/block-migration.c
@@ -780,7 +780,8 @@ static int block_load(QEMUFile *f, void *opaque, int 
version_id)
 }
 
 if (flags  BLK_MIG_FLAG_ZERO_BLOCK) {
-ret = bdrv_write_zeroes(bs, addr, nr_sectors, 0);
+ret = bdrv_write_zeroes(bs, addr, nr_sectors,
+BDRV_REQ_MAY_UNMAP);
 } else {
 buf = g_malloc(BLOCK_SIZE);
 qemu_get_buffer(f, buf, BLOCK_SIZE);
diff --git a/block.c b/block.c
index 3259429..0d97ce6 100644
--- a/block.c
+++ b/block.c
@@ -2810,6 +2810,10 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState 
*bs,
 {
 trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors);
 
+if (!(bs-open_flags  BDRV_O_UNMAP)) {
+flags = ~BDRV_REQ_MAY_UNMAP;
+}
+
 return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL,
  BDRV_REQ_ZERO_WRITE | flags);
 }
diff --git a/block/backup.c b/block/backup.c
index 830a179..0198514 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -139,7 +139,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs,
 if (buffer_is_zero(iov.iov_base, iov.iov_len)) {
 ret = bdrv_co_write_zeroes(job-target,
start * BACKUP_SECTORS_PER_CLUSTER,
-   n, 0);
+   n, BDRV_REQ_MAY_UNMAP);
 } else {
 ret = bdrv_co_writev(job-target,
  start * BACKUP_SECTORS_PER_CLUSTER, n,
diff --git a/include/block/block.h b/include/block/block.h
index 8ba9f0c..1f30a56 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -65,6 +65,13 @@ typedef struct BlockDevOps {
 typedef enum {
 BDRV_REQ_COPY_ON_READ = 0x1,
 BDRV_REQ_ZERO_WRITE   = 0x2,
+/* The BDRV_REQ_MAY_UNMAP flag is used to indicate that the block driver
+ * is allowed to optimize a write zeroes request by unmapping (discarding)
+ * blocks if it is guaranteed that the result will read back as
+ * zeroes. The flag is only passed to the driver if the block device is
+ * opened with BDRV_O_UNMAP.
+ */
+BDRV_REQ_MAY_UNMAP= 0x4,
 } BdrvRequestFlags;
 
 #define BDRV_O_RDWR0x0002
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 00/17] block: logical block provisioning enhancements

2013-10-20 Thread Peter Lieven

this patch adds the ability for targets to stay sparse during
block migration (if the zero_blocks capability is set) and qemu-img convert
even if the target does not have has_zero_init = 1.

the series was especially developed for iSCSI, but it should also work
with other drivers with little or no adjustments. these adjustments
should be limited to providing block provisioning information through
get_block_info and/or honouring BDRV_REQ_MAY_UNMAP on writing zeroes.

v4->v5:
 - new patches 4-6 to move the block provisioning information
   to the BlockDriverInfo.
 - kept 2 wrappers to read the information from the BDI and
   renamed them to make more clear what they do:

 bdrv_has_discard_zeroes -> bdrv_unallocated_blocks_are_zero
 bdrv_has_discard_write_zeroes -> bdrv_can_write_zeroes_with_unmap

 - added additional information about the 2 flags in the
   BDI struct in block.h

v3->v4:
 - changed BlockLimits struct to typedef (Stefan, Eric)
 - renamed bdrv_zeroize to bdrv_make_zero (Stefan)
 - added comment about the -S flag of qemu-img convert in
   qemu-img.texi (Eric)
 - used struct assignment for bs-bl in raw_open (Stefan, Eric)
 - dropped 3 get_block_status fixes that are independent of
   this series and already partly merged.

v2->v3:
 - fix merge conflict in block/qcow2_cluster.c
 - changed return type of bdrv_has_discard_zeroes and
   bdrv_has_discard_write_zeroes to bool.
 - moved alignment and limits info to a BlockLimits struct (Paolo).
 - added magic constants for default maximum in bdrv_co_do_write_zeroes
   and bdrv_co_discard (Eric).
 - bdrv_co_do_write_zeroes: allocating the bounce buffer only once (Eric),
   fixed bounce iov_len in the fall back path.
 - bdrv_zeroize: added inline docu (Eric) and do not mask flags passed
   to bdrv_write_zeroes (Eric).
 - qemu-img: changed the default hint for -S (min_sparse) in the usage
   help to 4k. not changing the default as it is unclear why this default
   was set. size suffixes are already supported (Eric).

v1->v2:
 - moved block max_discard and max_write_zeroes to BlockDriverState
 - added discard_alignment and write_zeroes_alignment to BlockDriverState
 - added bdrv_has_discard_zeroes() and bdrv_has_discard_write_zeroes()
 - added logic to bdrv_co_discard and bdrv_co_do_write_zeroes to honour
   limit and alignment info.
 - added support for -S 0 in qemu-img convert.

Peter Lieven (17):
  block: make BdrvRequestFlags public
  block: add flags to bdrv_*_write_zeroes
  block: introduce BDRV_REQ_MAY_UNMAP request flag
  block: add logical block provisioning info to BlockDriverInfo
  block: add wrappers for logical block provisioning information
  block/iscsi: add .bdrv_get_info
  block: add BlockLimits structure to BlockDriverState
  block: honour BlockLimits in bdrv_co_do_write_zeroes
  block: honour BlockLimits in bdrv_co_discard
  iscsi: simplify iscsi_co_discard
  iscsi: set limits in BlockDriverState
  iscsi: add bdrv_co_write_zeroes
  block: introduce bdrv_make_zero
  block/get_block_status: fix BDRV_BLOCK_ZERO for unallocated blocks
  qemu-img: add support for fully allocated images
  qemu-img: conditionally zero out target on convert
  block/raw: copy BlockLimits on raw_open

 block-migration.c |3 +-
 block.c   |  200 +
 block/backup.c|3 +-
 block/iscsi.c |  145 +++-
 block/qcow2-cluster.c |2 +-
 block/qcow2.c |2 +-
 block/qed.c   |3 +-
 block/raw_bsd.c   |6 +-
 block/vmdk.c  |3 +-
 include/block/block.h |   35 +++-
 include/block/block_int.h |   19 -
 qemu-img.c|   18 +++-
 qemu-img.texi |5 ++
 qemu-io-cmds.c|2 +-
 14 files changed, 358 insertions(+), 88 deletions(-)

-- 
1.7.9.5

[Qemu-devel] [PATCHv5 06/17] block/iscsi: add .bdrv_get_info

2013-10-20 Thread Peter Lieven

Signed-off-by: Peter Lieven p...@kamp.de
---
 block/iscsi.c |9 +
 1 file changed, 9 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index a2a961e..1dbbcad 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1506,6 +1506,14 @@ out:
 return ret;
 }
 
+static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+IscsiLun *iscsilun = bs-opaque;
+bdi-unallocated_blocks_are_zero = !!iscsilun-lbprz;
+bdi-can_write_zeroes_with_unmap = iscsilun-lbprz  iscsilun-lbp.lbpws;
+return 0;
+}
+
 static QEMUOptionParameter iscsi_create_options[] = {
 {
 .name = BLOCK_OPT_SIZE,
@@ -1527,6 +1535,7 @@ static BlockDriver bdrv_iscsi = {
 .create_options  = iscsi_create_options,
 
 .bdrv_getlength  = iscsi_getlength,
+.bdrv_get_info   = iscsi_get_info,
 .bdrv_truncate   = iscsi_truncate,
 
 #if defined(LIBISCSI_FEATURE_IOVECTOR)
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 08/17] block: honour BlockLimits in bdrv_co_do_write_zeroes

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block.c |   65 +++
 1 file changed, 49 insertions(+), 16 deletions(-)

diff --git a/block.c b/block.c
index 0601b02..0c0b0ac 100644
--- a/block.c
+++ b/block.c
@@ -2703,32 +2703,65 @@ int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState 
*bs,
 BDRV_REQ_COPY_ON_READ);
 }
 
+/* if no limit is specified in the BlockLimits use a default
+ * of 32768 512-byte sectors (16 MiB) per request.
+ */
+#define MAX_WRITE_ZEROES_DEFAULT 32768
+
 static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs,
 int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
 BlockDriver *drv = bs-drv;
 QEMUIOVector qiov;
-struct iovec iov;
-int ret;
+struct iovec iov = {0};
+int ret = 0;
 
-/* TODO Emulate only part of misaligned requests instead of letting block
- * drivers return -ENOTSUP and emulate everything */
+int max_write_zeroes = bs-bl.max_write_zeroes ?
+   bs-bl.max_write_zeroes : MAX_WRITE_ZEROES_DEFAULT;
 
-/* First try the efficient write zeroes operation */
-if (drv-bdrv_co_write_zeroes) {
-ret = drv-bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags);
-if (ret != -ENOTSUP) {
-return ret;
+while (nb_sectors  0  !ret) {
+int num = nb_sectors;
+
+/* align request */
+if (bs-bl.write_zeroes_alignment 
+num = bs-bl.write_zeroes_alignment 
+sector_num % bs-bl.write_zeroes_alignment) {
+if (num  bs-bl.write_zeroes_alignment) {
+num = bs-bl.write_zeroes_alignment;
+}
+num -= sector_num % bs-bl.write_zeroes_alignment;
 }
-}
 
-/* Fall back to bounce buffer if write zeroes is unsupported */
-iov.iov_len  = nb_sectors * BDRV_SECTOR_SIZE;
-iov.iov_base = qemu_blockalign(bs, iov.iov_len);
-memset(iov.iov_base, 0, iov.iov_len);
-qemu_iovec_init_external(qiov, iov, 1);
+/* limit request size */
+if (num  max_write_zeroes) {
+num = max_write_zeroes;
+}
+
+ret = -ENOTSUP;
+/* First try the efficient write zeroes operation */
+if (drv-bdrv_co_write_zeroes) {
+ret = drv-bdrv_co_write_zeroes(bs, sector_num, num, flags);
+}
+
+if (ret == -ENOTSUP) {
+/* Fall back to bounce buffer if write zeroes is unsupported */
+iov.iov_len = num * BDRV_SECTOR_SIZE;
+if (iov.iov_base == NULL) {
+/* allocate bounce buffer only once and ensure that it
+ * is big enough for this and all future requests.
+ */
+size_t bufsize = num = nb_sectors ? num : max_write_zeroes;
+iov.iov_base = qemu_blockalign(bs, bufsize * BDRV_SECTOR_SIZE);
+memset(iov.iov_base, 0, bufsize * BDRV_SECTOR_SIZE);
+}
+qemu_iovec_init_external(qiov, iov, 1);
 
-ret = drv-bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
+ret = drv-bdrv_co_writev(bs, sector_num, num, qiov);
+}
+
+sector_num += num;
+nb_sectors -= num;
+}
 
 qemu_vfree(iov.iov_base);
 return ret;
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 04/17] block: add logical block provisioning info to BlockDriverInfo

2013-10-20 Thread Peter Lieven

Signed-off-by: Peter Lieven p...@kamp.de
---
 include/block/block.h |   16 
 1 file changed, 16 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index 1f30a56..5fbab01 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -18,6 +18,22 @@ typedef struct BlockDriverInfo {
 /* offset at which the VM state can be saved (0 if not possible) */
 int64_t vm_state_offset;
 bool is_dirty;
+/*
+ * True if unallocated blocks read back as zeroes. This is equivalent
+ * to the LBPRZ flag in the SCSI logical block provisioning page.
+ */
+bool unallocated_blocks_are_zero;
+/*
+ * True if the driver can optimize writing zeroes by unmapping
+ * sectors. This is equivalent to the BLKDISCARDZEROES ioctl in Linux
+ * with the difference that in qemu a discard is allowed to silently
+ * fail. Therefore we have to use bdrv_write_zeroes with the
+ * BDRV_REQ_MAY_UNMAP flag for an optimized zero write with unmapping.
+ * After this call the driver has to guarantee that the contents read
+ * back as zero. It is additionally required that the block device is
+ * opened with BDRV_O_UNMAP flag for this to work.
+ */
+bool can_write_zeroes_with_unmap;
 } BlockDriverInfo;
 
 typedef struct BlockFragInfo {
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 07/17] block: add BlockLimits structure to BlockDriverState

2013-10-20 Thread Peter Lieven

this patch adds BlockLimits which introduces discard and write_zeroes
limits and alignment information to the BlockDriverState.

Signed-off-by: Peter Lieven p...@kamp.de
---
 include/block/block_int.h |   17 +
 1 file changed, 17 insertions(+)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 9bbaa29..33be247 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -227,6 +227,20 @@ struct BlockDriver {
 QLIST_ENTRY(BlockDriver) list;
 };
 
+typedef struct BlockLimits {
+/* maximum number of sectors that can be discarded at once */
+int max_discard;
+
+/* optimal alignment for discard requests in sectors */
+int64_t discard_alignment;
+
+/* maximum number of sectors that can be zeroized at once */
+int max_write_zeroes;
+
+/* optimal alignment for write zeroes requests in sectors */
+int64_t write_zeroes_alignment;
+} BlockLimits;
+
 /*
  * Note: the function bdrv_append() copies and swaps contents of
  * BlockDriverStates, so if you add new fields to this struct, please
@@ -280,6 +294,9 @@ struct BlockDriverState {
 uint64_t total_time_ns[BDRV_MAX_IOTYPE];
 uint64_t wr_highest_sector;
 
+/* I/O Limits */
+BlockLimits bl;
+
 /* Whether the disk can expand beyond total_sectors */
 int growable;
 
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 12/17] iscsi: add bdrv_co_write_zeroes

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block/iscsi.c |   59 +
 1 file changed, 59 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index c0465aa..1845fc8 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -56,6 +56,7 @@ typedef struct IscsiLun {
 uint8_t lbprz;
 struct scsi_inquiry_logical_block_provisioning lbp;
 struct scsi_inquiry_block_limits bl;
+unsigned char *zeroblock;
 } IscsiLun;
 
 typedef struct IscsiTask {
@@ -959,6 +960,62 @@ retry:
 return 0;
 }
 
+
+static int
+coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num,
+   int nb_sectors, BdrvRequestFlags flags)
+{
+IscsiLun *iscsilun = bs-opaque;
+struct IscsiTask iTask;
+uint64_t lba;
+uint32_t nb_blocks;
+
+if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
+return -EINVAL;
+}
+
+if (!iscsilun-lbp.lbpws) {
+/* WRITE SAME is not supported by the target */
+return -ENOTSUP;
+}
+
+lba = sector_qemu2lun(sector_num, iscsilun);
+nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+
+if (iscsilun-zeroblock == NULL) {
+iscsilun-zeroblock = g_malloc0(iscsilun-block_size);
+}
+
+iscsi_co_init_iscsitask(iscsilun, iTask);
+retry:
+if (iscsi_writesame16_task(iscsilun-iscsi, iscsilun-lun, lba,
+   iscsilun-zeroblock, iscsilun-block_size,
+   nb_blocks, 0, !!(flags  BDRV_REQ_MAY_UNMAP),
+   0, 0, iscsi_co_generic_cb, iTask) == NULL) {
+return -EIO;
+}
+
+while (!iTask.complete) {
+iscsi_set_events(iscsilun);
+qemu_coroutine_yield();
+}
+
+if (iTask.task != NULL) {
+scsi_free_scsi_task(iTask.task);
+iTask.task = NULL;
+}
+
+if (iTask.do_retry) {
+goto retry;
+}
+
+if (iTask.status != SCSI_STATUS_GOOD) {
+return -EIO;
+}
+
+return 0;
+}
+
 static int parse_chap(struct iscsi_context *iscsi, const char *target)
 {
 QemuOptsList *list;
@@ -1421,6 +1478,7 @@ static void iscsi_close(BlockDriverState *bs)
 }
 qemu_aio_set_fd_handler(iscsi_get_fd(iscsi), NULL, NULL, NULL);
 iscsi_destroy_context(iscsi);
+g_free(iscsilun-zeroblock);
 memset(iscsilun, 0, sizeof(IscsiLun));
 }
 
@@ -1539,6 +1597,7 @@ static BlockDriver bdrv_iscsi = {
 .bdrv_co_get_block_status = iscsi_co_get_block_status,
 #endif
 .bdrv_co_discard  = iscsi_co_discard,
+.bdrv_co_write_zeroes = iscsi_co_write_zeroes,
 
 .bdrv_aio_readv  = iscsi_aio_readv,
 .bdrv_aio_writev = iscsi_aio_writev,
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 05/17] block: add wrappers for logical block provisioning information

2013-10-20 Thread Peter Lieven

This adds 2 wrappers to read the unallocated_blocks_are_zero and
can_write_zeroes_with_unmap info from the BDI. The wrappers are
required to check for the existence of a backing_hd and
if the devices are opened with the correct flags.

Signed-off-by: Peter Lieven p...@kamp.de
---
 block.c   |   30 ++
 include/block/block.h |2 ++
 2 files changed, 32 insertions(+)

diff --git a/block.c b/block.c
index 0d97ce6..0601b02 100644
--- a/block.c
+++ b/block.c
@@ -3094,6 +3094,36 @@ int bdrv_has_zero_init(BlockDriverState *bs)
 return 0;
 }
 
+bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs)
+{
+BlockDriverInfo bdi;
+
+if (bs-backing_hd) {
+return false;
+}
+
+if (bdrv_get_info(bs, bdi) == 0) {
+return bdi.unallocated_blocks_are_zero;
+}
+
+return false;
+}
+
+bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs)
+{
+BlockDriverInfo bdi;
+
+if (bs-backing_hd || !(bs-open_flags  BDRV_O_UNMAP)) {
+return false;
+}
+
+if (bdrv_get_info(bs, bdi) == 0) {
+return bdi.can_write_zeroes_with_unmap;
+}
+
+return false;
+}
+
 typedef struct BdrvCoGetBlockStatusData {
 BlockDriverState *bs;
 BlockDriverState *base;
diff --git a/include/block/block.h b/include/block/block.h
index 5fbab01..de0d6ab 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -344,6 +344,8 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, 
int nb_sectors);
 int bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors);
 int bdrv_has_zero_init_1(BlockDriverState *bs);
 int bdrv_has_zero_init(BlockDriverState *bs);
+bool bdrv_unallocated_blocks_are_zero(BlockDriverState *bs);
+bool bdrv_can_write_zeroes_with_unmap(BlockDriverState *bs);
 int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num,
   int nb_sectors, int *pnum);
 int bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 09/17] block: honour BlockLimits in bdrv_co_discard

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block.c |   37 -
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index 0c0b0ac..b28dd42 100644
--- a/block.c
+++ b/block.c
@@ -4234,6 +4234,11 @@ static void coroutine_fn bdrv_discard_co_entry(void 
*opaque)
 rwco-ret = bdrv_co_discard(rwco-bs, rwco-sector_num, rwco-nb_sectors);
 }
 
+/* if no limit is specified in the BlockLimits use a default
+ * of 32768 512-byte sectors (16 MiB) per request.
+ */
+#define MAX_DISCARD_DEFAULT 32768
+
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
  int nb_sectors)
 {
@@ -4255,7 +4260,37 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, 
int64_t sector_num,
 }
 
 if (bs-drv-bdrv_co_discard) {
-return bs-drv-bdrv_co_discard(bs, sector_num, nb_sectors);
+int max_discard = bs-bl.max_discard ?
+  bs-bl.max_discard : MAX_DISCARD_DEFAULT;
+
+while (nb_sectors  0) {
+int ret;
+int num = nb_sectors;
+
+/* align request */
+if (bs-bl.discard_alignment 
+num = bs-bl.discard_alignment 
+sector_num % bs-bl.discard_alignment) {
+if (num  bs-bl.discard_alignment) {
+num = bs-bl.discard_alignment;
+}
+num -= sector_num % bs-bl.discard_alignment;
+}
+
+/* limit request size */
+if (num  max_discard) {
+num = max_discard;
+}
+
+ret = bs-drv-bdrv_co_discard(bs, sector_num, num);
+if (ret) {
+return ret;
+}
+
+sector_num += num;
+nb_sectors -= num;
+}
+return 0;
 } else if (bs-drv-bdrv_aio_discard) {
 BlockDriverAIOCB *acb;
 CoroutineIOCompletion co = {
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 16/17] qemu-img: conditionally zero out target on convert

2013-10-20 Thread Peter Lieven

If the target has_zero_init = 0, but supports efficiently
writing zeroes by unmapping we call bdrv_make_zero to
avoid fully allocating the target. This currently
is designed especially for iscsi.

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 qemu-img.c |   10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/qemu-img.c b/qemu-img.c
index c6eff15..fe0bdb1 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1353,7 +1353,7 @@ static int img_convert(int argc, char **argv)
 }
 }
 
-flags = BDRV_O_RDWR;
+flags = min_sparse ? (BDRV_O_RDWR | BDRV_O_UNMAP) : BDRV_O_RDWR;
 ret = bdrv_parse_cache_flags(cache, flags);
 if (ret  0) {
 error_report(Invalid cache option: %s, cache);
@@ -1469,6 +1469,14 @@ static int img_convert(int argc, char **argv)
 } else {
 int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0;
 
+if (!has_zero_init  bdrv_can_write_zeroes_with_unmap(out_bs)) {
+ret = bdrv_make_zero(out_bs, BDRV_REQ_MAY_UNMAP);
+if (ret  0) {
+goto out;
+}
+has_zero_init = 1;
+}
+
 sector_num = 0; // total number of sectors converted so far
 nb_sectors = total_sectors - sector_num;
 if (nb_sectors != 0) {
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 17/17] block/raw: copy BlockLimits on raw_open

2013-10-20 Thread Peter Lieven

Signed-off-by: Peter Lieven p...@kamp.de
---
 block/raw_bsd.c |1 +
 1 file changed, 1 insertion(+)

diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index b0dd23f..49ac18c 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -150,6 +150,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, 
int flags,
 Error **errp)
 {
 bs-sg = bs-file-sg;
+bs-bl = bs-file-bl;
 return 0;
 }
 
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 13/17] block: introduce bdrv_make_zero

2013-10-20 Thread Peter Lieven

this patch adds a call to completely zero out a block device.
the operation is sped up by checking the block status and
only writing zeroes to the device if they currently do not
return zeroes. optionally the zero writing can be sped up
by setting the flag BDRV_REQ_MAY_UNMAP to emulate the zero
write by unmapping if the driver supports it.

Signed-off-by: Peter Lieven p...@kamp.de
---
 block.c   |   37 +
 include/block/block.h |1 +
 2 files changed, 38 insertions(+)

diff --git a/block.c b/block.c
index b28dd42..21a992a 100644
--- a/block.c
+++ b/block.c
@@ -2391,6 +2391,43 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t 
sector_num,
   BDRV_REQ_ZERO_WRITE | flags);
 }
 
+/*
+ * Completely zero out a block device with the help of bdrv_write_zeroes.
+ * The operation is sped up by checking the block status and only writing
+ * zeroes to the device if they currently do not return zeroes. Optional
+ * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP).
+ *
+ * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
+ */
+int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags)
+{
+int64_t target_size = bdrv_getlength(bs) / BDRV_SECTOR_SIZE;
+int64_t ret, nb_sectors, sector_num = 0;
+int n;
+
+for (;;) {
+nb_sectors = target_size - sector_num;
+if (nb_sectors = 0) {
+return 0;
+}
+if (nb_sectors  INT_MAX) {
+nb_sectors = INT_MAX;
+}
+ret = bdrv_get_block_status(bs, sector_num, nb_sectors, n);
+if (ret  BDRV_BLOCK_ZERO) {
+sector_num += n;
+continue;
+}
+ret = bdrv_write_zeroes(bs, sector_num, n, flags);
+if (ret  0) {
+error_report(error writing zeroes at sector % PRId64 : %s,
+ sector_num, strerror(-ret));
+return ret;
+}
+sector_num += n;
+}
+}
+
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
void *buf, int count1)
 {
diff --git a/include/block/block.h b/include/block/block.h
index de0d6ab..8244adb 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -216,6 +216,7 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num,
const uint8_t *buf, int nb_sectors);
 int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num,
int nb_sectors, BdrvRequestFlags flags);
+int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags);
 int bdrv_writev(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov);
 int bdrv_pread(BlockDriverState *bs, int64_t offset,
void *buf, int count);
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 14/17] block/get_block_status: fix BDRV_BLOCK_ZERO for unallocated blocks

2013-10-20 Thread Peter Lieven

this patch does 2 things:
a) only do additional call outs if BDRV_BLOCK_ZERO is not already set.
b) use the newly introduced bdrv_unallocated_blocks_are_zero() to return the
   zero state of an unallocated block. the used callout to
   bdrv_has_zero_init() is only valid right after bdrv_create.

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 21a992a..69a2d2b 100644
--- a/block.c
+++ b/block.c
@@ -3263,8 +3263,8 @@ static int64_t coroutine_fn 
bdrv_co_get_block_status(BlockDriverState *bs,
  *pnum, pnum);
 }
 
-if (!(ret  BDRV_BLOCK_DATA)) {
-if (bdrv_has_zero_init(bs)) {
+if (!(ret  BDRV_BLOCK_DATA)  !(ret  BDRV_BLOCK_ZERO)) {
+if (bdrv_unallocated_blocks_are_zero(bs)) {
 ret |= BDRV_BLOCK_ZERO;
 } else if (bs-backing_hd) {
 BlockDriverState *bs2 = bs-backing_hd;
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 15/17] qemu-img: add support for fully allocated images

2013-10-20 Thread Peter Lieven

Signed-off-by: Peter Lieven p...@kamp.de
---
 qemu-img.c|8 +---
 qemu-img.texi |5 +
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 926f0a0..c6eff15 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -100,8 +100,10 @@ static void help(void)
  '-h' with or without a command shows this help and lists the 
supported formats\n
  '-p' show progress of command (only certain commands)\n
  '-q' use Quiet mode - do not print any output (except errors)\n
- '-S' indicates the consecutive number of bytes that must contain 
only zeros\n
-  for qemu-img to create a sparse image during conversion\n
+ '-S' indicates the consecutive number of bytes (defaults to 4k) 
that must\n
+  contain only zeros for qemu-img to create a sparse image 
during\n
+  conversion. if the number of bytes is 0 sparse files are 
disabled and\n
+  images will always be fully allocated\n
  '--output' takes the format in which the output must be done 
(human or json)\n
  '-n' skips the target volume creation (useful if the volume is 
created\n
   prior to running qemu-img)\n
@@ -1465,7 +1467,7 @@ static int img_convert(int argc, char **argv)
 /* signal EOF to align */
 bdrv_write_compressed(out_bs, 0, NULL, 0);
 } else {
-int has_zero_init = bdrv_has_zero_init(out_bs);
+int has_zero_init = min_sparse ? bdrv_has_zero_init(out_bs) : 0;
 
 sector_num = 0; // total number of sectors converted so far
 nb_sectors = total_sectors - sector_num;
diff --git a/qemu-img.texi b/qemu-img.texi
index 768054e..51a1ee5 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -193,6 +193,11 @@ Image conversion is also useful to get smaller image when 
using a
 growable format such as @code{qcow} or @code{cow}: the empty sectors
 are detected and suppressed from the destination image.
 
+@var{sparse_size} indicates the consecutive number of bytes (defaults to 4k)
+that must contain only zeros for qemu-img to create a sparse image during
+conversion. If the number of bytes is 0 sparse files are disabled and
+images will always be fully allocated.
+
 You can use the @var{backing_file} option to force the output image to be
 created as a copy on write image of the specified base image; the
 @var{backing_file} should have the same content as the input's base image,
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 10/17] iscsi: simplify iscsi_co_discard

2013-10-20 Thread Peter Lieven

now that bdrv_co_discard can handle limits we do not need
the request split logic here anymore.

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block/iscsi.c |   67 +
 1 file changed, 25 insertions(+), 42 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 1dbbcad..47b9cc9 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -87,7 +87,6 @@ typedef struct IscsiAIOCB {
 #define NOP_INTERVAL 5000
 #define MAX_NOP_FAILURES 3
 #define ISCSI_CMD_RETRIES 5
-#define ISCSI_MAX_UNMAP 131072
 
 static void
 iscsi_bh_cb(void *p)
@@ -912,8 +911,6 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t 
sector_num,
 IscsiLun *iscsilun = bs-opaque;
 struct IscsiTask iTask;
 struct unmap_list list;
-uint32_t nb_blocks;
-uint32_t max_unmap;
 
 if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) {
 return -EINVAL;
@@ -925,52 +922,38 @@ coroutine_fn iscsi_co_discard(BlockDriverState *bs, 
int64_t sector_num,
 }
 
 list.lba = sector_qemu2lun(sector_num, iscsilun);
-nb_blocks = sector_qemu2lun(nb_sectors, iscsilun);
+list.num = sector_qemu2lun(nb_sectors, iscsilun);
 
-max_unmap = iscsilun-bl.max_unmap;
-if (max_unmap == 0x) {
-max_unmap = ISCSI_MAX_UNMAP;
-}
-
-while (nb_blocks  0) {
-iscsi_co_init_iscsitask(iscsilun, iTask);
-list.num = nb_blocks;
-if (list.num  max_unmap) {
-list.num = max_unmap;
-}
+iscsi_co_init_iscsitask(iscsilun, iTask);
 retry:
-if (iscsi_unmap_task(iscsilun-iscsi, iscsilun-lun, 0, 0, list, 1,
- iscsi_co_generic_cb, iTask) == NULL) {
-return -EIO;
-}
-
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_coroutine_yield();
-}
+if (iscsi_unmap_task(iscsilun-iscsi, iscsilun-lun, 0, 0, list, 1,
+ iscsi_co_generic_cb, iTask) == NULL) {
+return -EIO;
+}
 
-if (iTask.task != NULL) {
-scsi_free_scsi_task(iTask.task);
-iTask.task = NULL;
-}
+while (!iTask.complete) {
+iscsi_set_events(iscsilun);
+qemu_coroutine_yield();
+}
 
-if (iTask.do_retry) {
-goto retry;
-}
+if (iTask.task != NULL) {
+scsi_free_scsi_task(iTask.task);
+iTask.task = NULL;
+}
 
-if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
-/* the target might fail with a check condition if it
-   is not happy with the alignment of the UNMAP request
-   we silently fail in this case */
-return 0;
-}
+if (iTask.do_retry) {
+goto retry;
+}
 
-if (iTask.status != SCSI_STATUS_GOOD) {
-return -EIO;
-}
+if (iTask.status == SCSI_STATUS_CHECK_CONDITION) {
+/* the target might fail with a check condition if it
+   is not happy with the alignment of the UNMAP request
+   we silently fail in this case */
+return 0;
+}
 
-list.lba += list.num;
-nb_blocks -= list.num;
+if (iTask.status != SCSI_STATUS_GOOD) {
+return -EIO;
 }
 
 return 0;
-- 
1.7.9.5

[Qemu-devel] [PATCHv5 11/17] iscsi: set limits in BlockDriverState

2013-10-20 Thread Peter Lieven

Reviewed-by: Eric Blake ebl...@redhat.com
Signed-off-by: Peter Lieven p...@kamp.de
---
 block/iscsi.c |   14 ++
 1 file changed, 14 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index 47b9cc9..c0465aa 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -1367,6 +1367,20 @@ static int iscsi_open(BlockDriverState *bs, QDict 
*options, int flags,
sizeof(struct scsi_inquiry_block_limits));
 scsi_free_scsi_task(task);
 task = NULL;
+
+if (iscsilun-bl.max_unmap  0x) {
+bs-bl.max_discard = sector_lun2qemu(iscsilun-bl.max_unmap,
+ iscsilun);
+}
+bs-bl.discard_alignment = sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
+   iscsilun);
+
+if (iscsilun-bl.max_ws_len  0x) {
+bs-bl.max_write_zeroes = sector_lun2qemu(iscsilun-bl.max_ws_len,
+  iscsilun);
+}
+bs-bl.write_zeroes_alignment = 
sector_lun2qemu(iscsilun-bl.opt_unmap_gran,
+iscsilun);
 }
 
 #if defined(LIBISCSI_FEATURE_NOP_COUNTER)
-- 
1.7.9.5

Re: [Qemu-devel] [PATCH 54/66] add a header file for atomic operations

2013-10-20 Thread Peter Maydell

On 4 July 2013 16:13, Paolo Bonzini pbonz...@redhat.com wrote:
 +#ifndef atomic_xchg
 +#ifdef __ATOMIC_SEQ_CST
 +#define atomic_xchg(ptr, i)({   \
 +typeof(*ptr) _new = (i), _old;  \
 +__atomic_exchange(ptr, _new, _old, __ATOMIC_SEQ_CST); \
 +_old;   \
 +})
 +#elif defined __clang__
 +#define atomic_xchg(ptr, i)__sync_exchange(ptr, i)
 +#else
 +/* __sync_lock_test_and_set() is documented to be an acquire barrier only.  
 */
 +#define atomic_xchg(ptr, i)(smp_mb(), __sync_lock_test_and_set(ptr, i))
 +#endif
 +#endif

Hi. I'm afraid this doesn't compile on MacOSX/clang:

  CCutil/qemu-thread-posix.o
util/qemu-thread-posix.c:351:13: error: too many arguments to function
call, expected 3, have 4
if (atomic_xchg(ev-value, EV_SET) == EV_BUSY) {
^~~
/Users/pm215/src/qemu/include/qemu/atomic.h:174:42: note: expanded
from macro 'atomic_xchg'
__atomic_exchange(ptr, _new, _old, __ATOMIC_SEQ_CST); \
~^
built-in:16:26: note: expanded from macro '__ATOMIC_SEQ_CST'
#define __ATOMIC_SEQ_CST 5
 ^
1 error generated.
make: *** [util/qemu-thread-posix.o] Error 1


I tried the '#elif defined __clang__' block instead and
that doesn't work either:

  CCutil/qemu-thread-posix.o
util/qemu-thread-posix.c:351:13: warning: implicit declaration of
function '__sync_exchange' is invalid in
  C99 [-Wimplicit-function-declaration]
if (atomic_xchg(ev-value, EV_SET) == EV_BUSY) {
^
/Users/pm215/src/qemu/include/qemu/atomic.h:179:32: note: expanded
from macro 'atomic_xchg'
#define atomic_xchg(ptr, i)__sync_exchange(ptr, i)
   ^
1 warning generated.
  LINK  qemu-nbd
Undefined symbols for architecture x86_64:
  ___sync_exchange, referenced from:
  _qemu_event_set in libqemuutil.a(qemu-thread-posix.o)
ld: symbol(s) not found for architecture x86_64
clang: error: linker command failed with exit code 1 (use -v to see invocation)
make: *** [qemu-nbd] Error 1

It looks like we need to select the '#else' case for MacOSX...
any suggestions about how best to do that?

thanks
-- PMM

[Qemu-devel] [PATCH] configure: Add config.status to recreate the current configuration

2013-10-20 Thread Stefan Weil

The latest configure invocation was saved in config-host.mak and could
be extracted from that file to recreate the configuration.

Now it is saved in a new file config.status which can be directly executed
to recreate the configuration. The file name and the comments were copied
from GNU autoconf.

Makefile now uses config.status, but also includes transitional code
for the old mechanism.

Signed-off-by: Stefan Weil s...@weilnetz.de
---

This patch still does not escape special characters in the command line.
I expect that most users don't use such characters, so they have no problem.

It's easy to fix that for hosts with bash: simply replace %s by %q.
If required, this can be done in a later patch.

Regards,
Stefan Weil

 Makefile  |9 -
 configure |   15 +--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index b15003f..073f18b 100644
--- a/Makefile
+++ b/Makefile
@@ -28,7 +28,14 @@ CONFIG_ALL=y
 include $(SRC_PATH)/rules.mak
 config-host.mak: $(SRC_PATH)/configure
@echo $@ is out-of-date, running configure
-   @sed -n /.*Configured with/s/[^:]*: //p $@ | sh
+   @# TODO: The next lines include code which supports a smooth
+   @# transition from old configurations without config.status.
+   @# This code can be removed after QEMU 1.7.
+   @if test -x config.status; then \
+   ./config.status; \
+else \
+   sed -n /.*Configured with/s/[^:]*: //p $@ | sh; \
+   fi
 else
 config-host.mak:
 ifneq ($(filter-out %clean,$(MAKECMDGOALS)),$(if $(MAKECMDGOALS),,fail))
diff --git a/configure b/configure
index 57ee62a..125d0c5 100755
--- a/configure
+++ b/configure
@@ -27,6 +27,19 @@ printf  '%s' $0 $@  config.log
 echo  config.log
 echo #  config.log
 
+# Save the configure command line for later reuse.
+cat EOD config.status
+#!/bin/sh
+# Generated by configure.
+# Run this file to recreate the current configuration.
+# Compiler output produced by configure, useful for debugging
+# configure, is in config.log if it exists.
+EOD
+printf exec config.status
+printf  '%s' $0 $@ config.status
+echo config.status
+chmod +x config.status
+
 error_exit() {
 echo
 echo ERROR: $1
@@ -3765,8 +3778,6 @@ config_host_mak=config-host.mak
 echo # Automatically generated by configure - do not modify 
config-all-disas.mak
 
 echo # Automatically generated by configure - do not modify  
$config_host_mak
-printf # Configured with:  $config_host_mak
-printf  '%s' $0 $@  $config_host_mak
 echo  $config_host_mak
 
 echo all:  $config_host_mak
-- 
1.7.10.4

[Qemu-devel] [PATCH] qcow2: Restore total_sectors value in save_vmstate

2013-10-20 Thread Max Reitz

Since df2a6f29a5, bdrv_co_do_writev increases the total_sectors value of
a growable block device on writes after the current end. This leads to
the virtual disk apparently growing in qcow2_save_vmstate, which in turn
affects the disk size captured by the internal snapshot taken directly
afterwards through e.g. the HMP savevm command. Such a grown snapshot
cannot be loaded after reopening the qcow2 image, since its disk size
differs from the actual virtual disk size (writing a VM state does not
actually increase the virtual disk size).

Fix this by restoring total_sectors at the end of qcow2_save_vmstate.

Signed-off-by: Max Reitz mre...@redhat.com
---
 block/qcow2.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index c1abaff..5c05bb5 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1939,6 +1939,7 @@ static int qcow2_save_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov,
   int64_t pos)
 {
 BDRVQcowState *s = bs-opaque;
+int64_t total_sectors = bs-total_sectors;
 int growable = bs-growable;
 int ret;
 
@@ -1946,6 +1947,10 @@ static int qcow2_save_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov,
 bs-growable = 1;
 ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
 bs-growable = growable;
+// bdrv_co_do_writev will have increased the total_sectors value to include
+// the VM state - the VM state is however not an actual part of the block
+// device, therefore, we need to restore the old value.
+bs-total_sectors = total_sectors;
 
 return ret;
 }
-- 
1.8.3.1

[Qemu-devel] [PATCH] qcow2: Unset zero_beyond_eof in save_vmstate

2013-10-20 Thread Max Reitz

Saving the VM state is done using bdrv_pwrite. This function may perform
a read-modify-write, which in this case results in data being read from
beyond the end of the virtual disk. Since we are actually trying to
access an area which is not a part of the virtual disk, zero_beyond_eof
has to be set to false before performing the partial write, otherwise
the VM state may become corrupted.

Signed-off-by: Max Reitz mre...@redhat.com
---
Follow-up to (depends on):
 - qcow2: Restore total_sectors value in save_vmstate
---
 block/qcow2.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 5c05bb5..3e11f25 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1941,12 +1941,15 @@ static int qcow2_save_vmstate(BlockDriverState *bs, 
QEMUIOVector *qiov,
 BDRVQcowState *s = bs-opaque;
 int64_t total_sectors = bs-total_sectors;
 int growable = bs-growable;
+bool zero_beyond_eof = bs-zero_beyond_eof;
 int ret;
 
 BLKDBG_EVENT(bs-file, BLKDBG_VMSTATE_SAVE);
 bs-growable = 1;
+bs-zero_beyond_eof = false;
 ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
 bs-growable = growable;
+bs-zero_beyond_eof = zero_beyond_eof;
 // bdrv_co_do_writev will have increased the total_sectors value to include
 // the VM state - the VM state is however not an actual part of the block
 // device, therefore, we need to restore the old value.
-- 
1.8.3.1

[Qemu-devel] [RFC PATCH v1: 02/12] rdma: remove reference to github.com

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com


Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 docs/rdma.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/docs/rdma.txt b/docs/rdma.txt
index 2aca63b..6d116e2 100644
--- a/docs/rdma.txt
+++ b/docs/rdma.txt
@@ -2,7 +2,6 @@
 RDMA Live Migration Specification, Version # 1
 ==
 Wiki: http://wiki.qemu-project.org/Features/RDMALiveMigration
-Github: g...@github.com:hinesmr/qemu.git, 'rdma' branch
 
 Copyright (C) 2013 Michael R. Hines mrhi...@us.ibm.com
 
-- 
1.8.1.2

[Qemu-devel] [RFC PATCH v1: 00/12] fault tolerance through micro-checkpointing

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

This patch implements RDMA-aware fault tolerance for the VM
using Micro-Checkpointing (to be presented at the KVM Forum). 

The breakout of the patches is not ideal and is really meant to
kick things off for review, which will likely extend well past 1.7
and into 1.8 version of QEMU, assuming about 5-6 months of reviews. 

Please begin with patch #01 as it provides a good narrative of
what is different about this and previous attempts at fault tolerance,
including a breakdown of the current empirical performance challenges.

Michael R. Hines (12):
  mc: add documentation for micro-checkpointing
  rdma: remove reference to github.com
  migration: introduce parallelization of migration_bitmap
  mc: introduce a checkpointing status check into the VCPU states
  migration: support custom page loading
  rdma: accelerated memcpy() support
  mc: introduce state machine error handling and migration_bitmap prep
  mc: modified QMP statistics and migration_thread handoff
  mc: core logic
  mc: configure and makefile support
  mc: register MC qemu-file functions and expose MC tunable capability
  mc: activate and use MC core logic if requested

 Makefile.objs |1 +
 arch_init.c   |  276 +-
 configure |   45 +
 cpus.c|9 +-
 docs/mc.txt   |  261 ++
 docs/rdma.txt |1 -
 hmp-commands.hx   |   14 +
 hmp.c |   23 +
 hmp.h |1 +
 include/migration/migration.h |   69 +-
 include/migration/qemu-file.h |   55 +-
 include/qemu-common.h |   12 +
 migration-checkpoint.c| 1589 
 migration-rdma.c  | 2008 ++---
 migration.c   |  148 ++-
 qapi-schema.json  |   92 +-
 qmp-commands.hx   |   23 +
 savevm.c  |   84 +-
 vl.c  |   42 +
 19 files changed, 4123 insertions(+), 630 deletions(-)
 create mode 100644 docs/mc.txt
 create mode 100644 migration-checkpoint.c

-- 
1.8.1.2

[Qemu-devel] [RFC PATCH v1: 04/12] mc: introduce a checkpointing status check into the VCPU states

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

During micro-checkpointing, the VCPUs get repeatedly paused and
resumed. These pauses must be treated as a distinct run state (and must not
be reported as a STOP event) once the VM begins micro-checkpointing.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 arch_init.c   | 2 +-
 cpus.c| 9 -
 include/migration/migration.h | 2 ++
 qapi-schema.json  | 4 +++-
 vl.c  | 6 ++
 5 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 4a71311..b139512 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -199,7 +199,7 @@ typedef struct AccountingInfo {
 
 static AccountingInfo acct_info;
 
-static void acct_clear(void)
+void acct_clear(void)
 {
 memset(acct_info, 0, sizeof(acct_info));
 }
diff --git a/cpus.c b/cpus.c
index 398229e..d090c2c 100644
--- a/cpus.c
+++ b/cpus.c
@@ -530,7 +530,14 @@ static int do_vm_stop(RunState state)
 pause_all_vcpus();
 runstate_set(state);
 vm_state_notify(0, state);
-monitor_protocol_event(QEVENT_STOP, NULL);
+/*
+ * If MC is enabled, libvirt gets confused 
+ * because it thinks the VM is stopped when 
+ * it's just being micro-checkpointed.
+ */
+if(state != RUN_STATE_CHECKPOINT_VM) {
+monitor_protocol_event(QEVENT_STOP, NULL);
+}
 }
 
 bdrv_drain_all();
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 3ffc433..3ad06c5 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -119,6 +119,8 @@ uint64_t xbzrle_mig_bytes_transferred(void);
 uint64_t xbzrle_mig_pages_transferred(void);
 uint64_t xbzrle_mig_pages_overflow(void);
 uint64_t xbzrle_mig_pages_cache_miss(void);
+void acct_clear(void);
+
 void *migration_bitmap_worker(void *opaque);
 void migration_bitmap_worker_start(MigrationState *s);
 void migration_bitmap_worker_stop(MigrationState *s);
diff --git a/qapi-schema.json b/qapi-schema.json
index aac0894..8e72bcf 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -169,6 +169,8 @@
 #
 # @save-vm: guest is paused to save the VM state
 #
+# @checkpoint-vm: guest is paused to checkpoint the VM state
+#
 # @shutdown: guest is shut down (and -no-shutdown is in use)
 #
 # @suspended: guest is suspended (ACPI S3)
@@ -181,7 +183,7 @@
   'data': [ 'debug', 'inmigrate', 'internal-error', 'io-error', 'paused',
 'postmigrate', 'prelaunch', 'finish-migrate', 'restore-vm',
 'running', 'save-vm', 'shutdown', 'suspended', 'watchdog',
-'guest-panicked' ] }
+'guest-panicked', 'checkpoint-vm' ] }
 
 ##
 # @SnapshotInfo
diff --git a/vl.c b/vl.c
index e2ba2e8..74d52ab 100644
--- a/vl.c
+++ b/vl.c
@@ -611,14 +611,18 @@ static const RunStateTransition 
runstate_transitions_def[] = {
 
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_RUNNING },
 { RUN_STATE_FINISH_MIGRATE, RUN_STATE_POSTMIGRATE },
+{ RUN_STATE_FINISH_MIGRATE, RUN_STATE_CHECKPOINT_VM },
 
 { RUN_STATE_RESTORE_VM, RUN_STATE_RUNNING },
 
+{ RUN_STATE_CHECKPOINT_VM, RUN_STATE_RUNNING },
+
 { RUN_STATE_RUNNING, RUN_STATE_DEBUG },
 { RUN_STATE_RUNNING, RUN_STATE_INTERNAL_ERROR },
 { RUN_STATE_RUNNING, RUN_STATE_IO_ERROR },
 { RUN_STATE_RUNNING, RUN_STATE_PAUSED },
 { RUN_STATE_RUNNING, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_RUNNING, RUN_STATE_CHECKPOINT_VM },
 { RUN_STATE_RUNNING, RUN_STATE_RESTORE_VM },
 { RUN_STATE_RUNNING, RUN_STATE_SAVE_VM },
 { RUN_STATE_RUNNING, RUN_STATE_SHUTDOWN },
@@ -634,9 +638,11 @@ static const RunStateTransition runstate_transitions_def[] 
= {
 { RUN_STATE_RUNNING, RUN_STATE_SUSPENDED },
 { RUN_STATE_SUSPENDED, RUN_STATE_RUNNING },
 { RUN_STATE_SUSPENDED, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_SUSPENDED, RUN_STATE_CHECKPOINT_VM },
 
 { RUN_STATE_WATCHDOG, RUN_STATE_RUNNING },
 { RUN_STATE_WATCHDOG, RUN_STATE_FINISH_MIGRATE },
+{ RUN_STATE_WATCHDOG, RUN_STATE_CHECKPOINT_VM },
 
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_PAUSED },
 { RUN_STATE_GUEST_PANICKED, RUN_STATE_FINISH_MIGRATE },
-- 
1.8.1.2

[Qemu-devel] [RFC PATCH v1: 03/12] migration: introduce parallelization of migration_bitmap

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

This patch allows the preparation of the migration_bitmap
to be parallelized. For very large VMs, this can take on
the order of 10s of milliseconds, which translates as downtime.

We count the number of cores first, and then hand out chunks of
the logdirty bitmap to a thread per core. Each thread scans for
dirty bits in parallel.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 arch_init.c   | 228 +++---
 include/migration/migration.h |  10 ++
 include/qemu-common.h |  12 +++
 qapi-schema.json  |  73 +-
 vl.c  |  33 ++
 5 files changed, 340 insertions(+), 16 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 7545d96..4a71311 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -189,6 +189,8 @@ typedef struct AccountingInfo {
 uint64_t skipped_pages;
 uint64_t norm_pages;
 uint64_t iterations;
+uint64_t log_dirty_time;
+uint64_t migration_bitmap_time;
 uint64_t xbzrle_bytes;
 uint64_t xbzrle_pages;
 uint64_t xbzrle_cache_miss;
@@ -232,6 +234,16 @@ uint64_t norm_mig_pages_transferred(void)
 return acct_info.norm_pages;
 }
 
+uint64_t norm_mig_log_dirty_time(void)
+{
+return acct_info.log_dirty_time;
+}
+
+uint64_t norm_mig_bitmap_time(void)
+{
+return acct_info.migration_bitmap_time;
+}
+
 uint64_t xbzrle_mig_bytes_transferred(void)
 {
 return acct_info.xbzrle_bytes;
@@ -362,15 +374,189 @@ ram_addr_t 
migration_bitmap_find_and_reset_dirty(MemoryRegion *mr,
 static inline bool migration_bitmap_set_dirty(MemoryRegion *mr,
   ram_addr_t offset)
 {
-bool ret;
-int nr = (mr-ram_addr + offset)  TARGET_PAGE_BITS;
+return test_and_set_bit((mr-ram_addr + offset)  TARGET_PAGE_BITS, 
+migration_bitmap);
+}
+
+typedef struct BitmapWalkerParams {
+QemuMutex ready_mutex;
+QemuMutex done_mutex;
+QemuCond cond;
+QemuThread walker;
+MigrationState *s;
+int core_id;
+int keep_running;
+ram_addr_t start;
+ram_addr_t stop;
+void *block;
+uint64_t dirty_pages;
+} BitmapWalkerParams;
 
-ret = test_and_set_bit(nr, migration_bitmap);
+static int nb_bitmap_workers = 0;
 
-if (!ret) {
-migration_dirty_pages++;
+BitmapWalkerParams *bitmap_walkers = NULL;
+
+/*
+ * Bitmap workers: This is a temporary performance-driven
+ * workaround for the slowness (10s of milliseconds) incurred
+ * during calls to migration_bitmap_sync().
+ *
+ * Ideally, migration_bitmap_sync() should be able to use the
+ * GET_LOG_DIRTY bitmap from KVM directly, but it does not right
+ * now because the bitmap is not retrieved as a single memory
+ * allocation which requires a couple of transformations into
+ * a 'unified' bitmap before the migration code can make good use
+ * of it.
+ *
+ * Bitmap workers perform this transformation in parallel
+ * in a multi-threaded fashion until a patch is ready to process
+ * the bitmaps from GET_LOG_DIRTY directly.
+ */
+static uint64_t migration_bitmap_sync_range(RAMBlock *block, 
+ram_addr_t start, ram_addr_t stop)
+{
+ram_addr_t addr;
+uint64_t dirty_pages = 0;
+
+
+for (addr = start; addr  stop; addr += TARGET_PAGE_SIZE) {
+if (memory_region_test_and_clear_dirty(block-mr,
+   addr, TARGET_PAGE_SIZE,
+   DIRTY_MEMORY_MIGRATION)) {
+if (!migration_bitmap_set_dirty(block-mr, addr)) {
+dirty_pages++;
+}
+}
+}
+
+return dirty_pages;
+}
+
+/*
+ * The worker sleeps until it gets some work to transform a 
+ * chunk of bitmap from KVM to the migration_bitmap.
+ */
+void *migration_bitmap_worker(void *opaque)
+{
+BitmapWalkerParams * bwp = opaque;
+
+do {
+qemu_mutex_lock(bwp-ready_mutex);
+qemu_mutex_lock(bwp-done_mutex);
+qemu_mutex_unlock(bwp-ready_mutex);
+qemu_cond_signal(bwp-cond);
+
+if(!bwp-keep_running) {
+break;
+}
+
+bwp-dirty_pages = migration_bitmap_sync_range(bwp-block, bwp-start, 
bwp-stop);
+
+qemu_cond_wait(bwp-cond, bwp-done_mutex);
+qemu_mutex_unlock(bwp-done_mutex);
+} while(bwp-keep_running);
+
+return NULL;
+}
+
+void migration_bitmap_worker_start(MigrationState *s)
+{
+int core;
+
+/* 
+ * CPUs N - 1 are reserved for N - 1 worker threads 
+ * processing the pc.ram bytemap = migration_bitmap.
+ * The migration thread goes on the last CPU,
+ * which processes the remaining, smaller RAMblocks.
+ */
+nb_bitmap_workers = getNumCores() - 1;
+
+bitmap_walkers = g_malloc0(sizeof(struct BitmapWalkerParams) * 
+nb_bitmap_workers);
+
+memset(bitmap_walkers, 0, sizeof(BitmapWalkerParams)

[Qemu-devel] [RFC PATCH v1: 01/12] mc: add documentation for micro-checkpointing

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com


Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 docs/mc.txt | 261 
 1 file changed, 261 insertions(+)
 create mode 100644 docs/mc.txt

diff --git a/docs/mc.txt b/docs/mc.txt
new file mode 100644
index 000..90888f7
--- /dev/null
+++ b/docs/mc.txt
@@ -0,0 +1,261 @@
+Micro Checkpointing Specification
+==
+Wiki: http://wiki.qemu.org/Features/MicroCheckpointing
+Github: g...@github.com:hinesmr/qemu.git, 'mc' branch
+
+Copyright (C) 2014 Michael R. Hines mrhi...@us.ibm.com
+
+Contents:
+=
+* Introduction
+* The Micro-Checkpointing Process 
+* RDMA Integration
+* Failure Recovery
+* Before running
+* Running
+* Performance
+* TODO
+
+INTRODUCTION:
+=
+
+Micro-Checkpointing (MC) is one method for providing Fault Tolerance to a
+running virtual machine (VM) with neither runtime assistance from the guest
+kernel nor from the guest application software. Furthermore, Fault Tolerance
+is one method of providing high availability to a VM such that, from the
+perspective of the outside world (clients, devices, and neighboring VMs that
+may be paired with it), the VM and its applications have not lost any runtime
+state in the event of either a failure of the hypervisor/hardware to allow the 
+VM to make forward progress or a complete loss of power. This mechanism for
+providing fault tolerance does *not* provide any protection whatsoever against 
+software-level faults in the guest kernel or applications. In fact, due to
+the potentially extended lifetime of the VM because of this type of high
+availability, such software-level bugs may in fact manifest themselves 
+*more often* than they ordinarily would, in which case you would need to
+employ other forms of availability to guard against such software-level faults.
+
+This implementation is also fully compatible with RDMA. (See docs/rdma.txt
+for more details).
+
+THE MICRO-CHECKPOINTING PROCESS:
+
+
+Micro-Checkpointing works against the existing live migration path in QEMU,
+and can effectively be understood as a live migration that never ends.
+As such, iteration rounds happen at the granularity of 10s of milliseconds
+and perform the following steps:
+
+1. After N milliseconds, stop the VM.
+2. Generate a MC by invoking the live migration software path
+   to identify and copy dirty memory into a local staging area inside QEMU.
+3. Resume the VM immediately so that it can make forward progress.
+4. Transmit the checkpoint to the destination.
+5. Repeat 
+
+Upon failure, load the contents of the last MC at the destination back
+into memory and run the VM normally.
+
+Additionally, a MC must include a consistent view of device I/O,
+particularly the network, a problem commonly referred to as output commit. 
+This means that the outside world can not be allowed to experience duplicate
+state that was committed by the virtual machine after failure. This is
+possible because a checkpoint may diverge by N milliseconds of time and
+commit state while the current checkpoint is being transmitted to the
+destination. 
+
+To guard against this problem, first, we must buffer the TX output of the
+network (not the input) between MCs until the current MC is safely received
+by the destination. For example, all outbound network packets must be held
+at the source until the MC is transmitted. After transmission is complete, 
+those packets can be released. Similarly, in the case of disk I/O, we must
+ensure that either the contents of the local disk is safely mirrored to a 
+remote disk before completing a MC or that the output to a shared disk, 
+such as iSCSI, is also buffered between checkpoints and then later released
+in the same way.
+
+This implementation *currently* only supports buffering for the network.
+This requires that the VM's root disk or any non-ephemeral disks also be 
+made network-accessible directly from within the VM. Until the aforementioned
+buffering or mirroring support is available (ideally through drive-mirror),
+the only consistent way to provide full fault tolerance of the VM's
+non-ephemeral disks is to construct a VM whose root disk is made to boot
+directly from iSCSI or NFS or similar such that all disk I/O is translated
+into network I/O. 
+
+RDMA INTEGRATION:
+=
+
+RDMA is instrumental in enabling better MC performance, which is the reason
+why it was introduced into QEMU first.
+
+1. Checkpoint generation (RDMA-based memcpy):
+2. Checkpoint transmission (for performance and less CPU impact)
+
+Checkpoint generation (step 2 in the previous section) must be done while
+the VM is paused. In the worst case, the size of the checkpoint can be 
+equal in size to the amount of memory in total use by the VM. In order
+to resume VM execution as fast as possible, the checkpoint is copied
+consistently locally into a staging area

[Qemu-devel] [RFC PATCH v1: 10/12] mc: configure and makefile support

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com


Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 Makefile.objs |  1 +
 configure | 45 +
 2 files changed, 46 insertions(+)

diff --git a/Makefile.objs b/Makefile.objs
index 2b6c1fe..15356d6 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -52,6 +52,7 @@ common-obj-$(CONFIG_LINUX) += fsdev/
 
 common-obj-y += migration.o migration-tcp.o
 common-obj-$(CONFIG_RDMA) += migration-rdma.o
+common-obj-$(CONFIG_MC) += migration-checkpoint.o
 common-obj-y += qemu-char.o #aio.o
 common-obj-y += block-migration.o
 common-obj-y += page_cache.o xbzrle.o
diff --git a/configure b/configure
index 57ee62a..64c0d5e 100755
--- a/configure
+++ b/configure
@@ -182,6 +182,7 @@ kvm=no
 rdma=
 gprof=no
 debug_tcg=no
+mc=
 debug=no
 strip_opt=yes
 tcg_interpreter=no
@@ -969,6 +970,10 @@ for opt do
   ;;
   --enable-libssh2) libssh2=yes
   ;;
+  --disable-mc) mc=no
+  ;;
+  --enable-mc) mc=yes
+  ;;
   *) echo ERROR: unknown option $opt; show_help=yes
   ;;
   esac
@@ -1200,6 +1205,8 @@ echo   --gcov=GCOV  use specified gcov 
[$gcov_tool]
 echo   --enable-tpm enable TPM support
 echo   --disable-libssh2disable ssh block device support
 echo   --enable-libssh2 enable ssh block device support
+echo   --disable-mc disable Micro-Checkpointing support
+echo   --enable-mc  enable Micro-Checkpointing support
 echo 
 echo NOTE: The object files are built at the place where configure is 
launched
 exit 1
@@ -1861,6 +1868,35 @@ EOF
   fi
 fi
 
+##
+# Micro-Checkpointing requires netlink
+if test $mc != no ; then
+  cat  $TMPC EOF
+#include libnl3/netlink/route/qdisc/plug.h
+#include libnl3/netlink/route/class.h
+#include libnl3/netlink/cli/utils.h
+#include libnl3/netlink/cli/tc.h
+#include libnl3/netlink/cli/qdisc.h
+#include libnl3/netlink/cli/link.h
+int main(void) { return 0; }
+EOF
+  mc_libs=-lnl-3 -lnl-cli-3 -lnl-route-3
+  mc_cflags=-I/usr/include/libnl3
+  if compile_prog $mc_cflags $mc_libs ; then
+mc=yes
+libs_softmmu=$libs_softmmu $mc_libs
+QEMU_CFLAGS=$QEMU_CFLAGS $mc_cflags
+  else
+if test $mc = yes ; then
+error_exit \
+ NetLink v3 libs/headers not present. \
+ Please install the libnl3-*-dev(el) packages from your distro.
+fi
+mc=no
+  fi
+fi
+
+
 ##
 # VNC TLS/WS detection
 if test $vnc = yes -a \( $vnc_tls != no -o $vnc_ws != no \) ; then
@@ -3723,6 +3759,7 @@ echo KVM support   $kvm
 echo RDMA support  $rdma
 echo TCG interpreter   $tcg_interpreter
 echo fdt support   $fdt
+echo Micro checkpointing $mc
 echo preadv support$preadv
 echo fdatasync $fdatasync
 echo madvise   $madvise
@@ -4206,6 +4243,10 @@ if test $rdma = yes ; then
   echo CONFIG_RDMA=y  $config_host_mak
 fi
 
+if test $mc = yes ; then
+  echo CONFIG_MC=y  $config_host_mak
+fi
+
 if test $tcg_interpreter = yes; then
   QEMU_INCLUDES=-I\$(SRC_PATH)/tcg/tci $QEMU_INCLUDES
 elif test $ARCH = sparc64 ; then
@@ -4633,6 +4674,10 @@ echo QEMU_CFLAGS+=$cflags  $config_target_mak
 
 done # for target in $targets
 
+if test $mc = yes ; then
+echo CONFIG_MC=y  $config_host_mak
+fi
+
 if [ $pixman = internal ]; then
   echo config-host.h: subdir-pixman  $config_host_mak
 fi
-- 
1.8.1.2

[Qemu-devel] [RFC PATCH v1: 07/12] mc: introduce state machine error handling and migration_bitmap prep

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

Since MC will repeatedly call the pre-existing live migration
call path over and over again (forever), the migration_bitmap
initialization only needs to happen once and the destruction of
the bitmap needs to be avoided in successive checkpoints.

Also, there is some additional state machine error handling to
prepare for before introducing the MC core logic.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 arch_init.c   | 29 -
 include/migration/migration.h | 19 +++
 include/migration/qemu-file.h |  1 +
 migration.c   | 33 +++--
 4 files changed, 63 insertions(+), 19 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index 9cf7d18..d47b38b 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -795,13 +795,13 @@ static void ram_migration_cancel(void *opaque)
 migration_end();
 }
 
-static void reset_ram_globals(void)
+static void reset_ram_globals(bool reset_bulk_stage)
 {
 last_seen_block = NULL;
 last_sent_block = NULL;
 last_offset = 0;
 last_version = ram_list.version;
-ram_bulk_stage = true;
+ram_bulk_stage = reset_bulk_stage;
 }
 
 #define MAX_WAIT 50 /* ms, half buffered_file limit */
@@ -811,6 +811,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 RAMBlock *block;
 int64_t ram_pages = last_ram_offset()  TARGET_PAGE_BITS;
 
+/*
+ * RAM stays open during micro-checkpointing for the next transaction.
+ */
+if (migration_is_mc(migrate_get_current())) {
+qemu_mutex_lock_ramlist();
+reset_ram_globals(false);
+goto skip_setup;
+}
+
 migration_bitmap = bitmap_new(ram_pages);
 bitmap_set(migration_bitmap, 0, ram_pages);
 migration_dirty_pages = ram_pages;
@@ -833,12 +842,14 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 qemu_mutex_lock_iothread();
 qemu_mutex_lock_ramlist();
 bytes_transferred = 0;
-reset_ram_globals();
+reset_ram_globals(true);
 
 memory_global_dirty_log_start();
 migration_bitmap_sync();
 qemu_mutex_unlock_iothread();
 
+skip_setup:
+
 qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE);
 
 QTAILQ_FOREACH(block, ram_list.blocks, next) {
@@ -867,7 +878,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
 qemu_mutex_lock_ramlist();
 
 if (ram_list.version != last_version) {
-reset_ram_globals();
+reset_ram_globals(true);
 }
 
 ram_control_before_iterate(f, RAM_CONTROL_ROUND);
@@ -948,7 +959,15 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 }
 
 ram_control_after_iterate(f, RAM_CONTROL_FINISH);
-migration_end();
+
+/*
+ * Only cleanup at the end of normal migrations
+ * or if the MC destination failed and we got an error.
+ * Otherwise, we are (or will be soon) in MIG_STATE_MC.
+ */
+if(!migrate_use_mc() || migration_has_failed(migrate_get_current())) {
+migration_end();
+}
 
 qemu_mutex_unlock_ramlist();
 qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 0e7f121..fcf7684 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -95,6 +95,8 @@ int migrate_fd_close(MigrationState *s);
 void add_migration_state_change_notifier(Notifier *notify);
 void remove_migration_state_change_notifier(Notifier *notify);
 bool migration_in_setup(MigrationState *);
+bool migration_is_active(MigrationState *);
+bool migration_is_mc(MigrationState *s);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
 MigrationState *migrate_get_current(void);
@@ -126,6 +128,15 @@ void migration_bitmap_worker_start(MigrationState *s);
 void migration_bitmap_worker_stop(MigrationState *s);
 void migrate_set_state(MigrationState *s, int old_state, int new_state);
 
+enum {
+MIG_STATE_ERROR = -1,
+MIG_STATE_NONE,
+MIG_STATE_SETUP,
+MIG_STATE_CANCELLED,
+MIG_STATE_ACTIVE,
+MIG_STATE_MC,
+MIG_STATE_COMPLETED,
+};
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 
 /**
@@ -194,4 +205,12 @@ int ram_control_copy_page(QEMUFile *f,
  ram_addr_t block_offset_source,
  ram_addr_t offset_source,
  long size);
+
+int migrate_use_mc(void);
+int migrate_use_mc_rdma_copy(void);
+
+#define MC_VERSION 1
+
+void qemu_rdma_info_save(QEMUFile *f, void *opaque);
+int qemu_rdma_info_load(QEMUFile *f, void *opaque, int version_id);
 #endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index d67e97a..b547de9 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -71,6 +71,7 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, 
uint64_t flags);
 #define RAM_CONTROL_ROUND1
 #define RAM_CONTROL_HOOK 2
 #define

[Qemu-devel] [RFC PATCH v1: 08/12] mc: modified QMP statistics and migration_thread handoff

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

In addition to better handling of new QMP statistics associated
with the migration_bitmap and MC performance, we need to transfer
control from the migration thread to the MC thread more cleanly,
which means dynamically allocating the threads and doing
the handoff after the initial live migration has completed.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 hmp.c | 17 
 include/migration/migration.h | 14 ++-
 migration.c   | 94 +++
 qapi-schema.json  |  2 +
 savevm.c  |  5 +--
 5 files changed, 93 insertions(+), 39 deletions(-)

diff --git a/hmp.c b/hmp.c
index 32ee285..43896e9 100644
--- a/hmp.c
+++ b/hmp.c
@@ -202,6 +202,23 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
info-disk-total  10);
 }
 
+if (info-has_mc) {
+monitor_printf(mon, checkpoints: % PRIu64 \n,
+   info-mc-checkpoints);
+monitor_printf(mon, xmit_time: % PRIu64  ms\n,
+   info-mc-xmit_time);
+monitor_printf(mon, log_dirty_time: % PRIu64  ms\n,
+   info-mc-log_dirty_time);
+monitor_printf(mon, migration_bitmap_time: % PRIu64  ms\n,
+   info-mc-migration_bitmap_time);
+monitor_printf(mon, ram_copy_time: % PRIu64  ms\n,
+   info-mc-ram_copy_time);
+monitor_printf(mon, copy_mbps: %0.2f mbps\n,
+   info-mc-copy_mbps);
+monitor_printf(mon, throughput: %0.2f mbps\n,
+   info-mc-mbps);
+}
+
 if (info-has_xbzrle_cache) {
 monitor_printf(mon, cache size: % PRIu64  bytes\n,
info-xbzrle_cache-cache_size);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index fcf7684..a1ab06c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -35,13 +35,14 @@ struct MigrationState
 int64_t bandwidth_limit;
 size_t bytes_xfer;
 size_t xfer_limit;
-QemuThread thread;
+QemuThread *thread;
 QEMUBH *cleanup_bh;
 QEMUFile *file;
 
 int state;
 MigrationParams params;
 double mbps;
+double copy_mbps;
 int64_t total_time;
 int64_t downtime;
 int64_t expected_downtime;
@@ -54,6 +55,7 @@ struct MigrationState
 bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
 int64_t xbzrle_cache_size;
 int64_t setup_time;
+int64_t checkpoints;
 };
 
 void process_incoming_migration(QEMUFile *f);
@@ -137,6 +139,12 @@ enum {
 MIG_STATE_MC,
 MIG_STATE_COMPLETED,
 };
+
+int mc_enable_buffering(void);
+int mc_start_buffer(void);
+void mc_init_checkpointer(MigrationState *s);
+void mc_process_incoming_checkpoints_if_requested(QEMUFile *f);
+
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
 
 /**
@@ -207,10 +215,14 @@ int ram_control_copy_page(QEMUFile *f,
  long size);
 
 int migrate_use_mc(void);
+int migrate_use_mc_net(void);
 int migrate_use_mc_rdma_copy(void);
 
 #define MC_VERSION 1
 
+int mc_info_load(QEMUFile *f, void *opaque, int version_id);
+void mc_info_save(QEMUFile *f, void *opaque);
+
 void qemu_rdma_info_save(QEMUFile *f, void *opaque);
 int qemu_rdma_info_load(QEMUFile *f, void *opaque, int version_id);
 #endif
diff --git a/migration.c b/migration.c
index 62dded3..8e0827e 100644
--- a/migration.c
+++ b/migration.c
@@ -172,6 +172,31 @@ static void get_xbzrle_cache_stats(MigrationInfo *info)
 }
 }
 
+static void get_ram_stats(MigrationState *s, MigrationInfo *info)
+{
+info-has_total_time = true;
+info-total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+- s-total_time;
+
+info-has_ram = true;
+info-ram = g_malloc0(sizeof(*info-ram));
+info-ram-transferred = ram_bytes_transferred();
+info-ram-total = ram_bytes_total();
+info-ram-duplicate = dup_mig_pages_transferred();
+info-ram-skipped = skipped_mig_pages_transferred();
+info-ram-normal = norm_mig_pages_transferred();
+info-ram-normal_bytes = norm_mig_bytes_transferred();
+info-ram-mbps = s-mbps;
+
+if (blk_mig_active()) {
+info-has_disk = true;
+info-disk = g_malloc0(sizeof(*info-disk));
+info-disk-transferred = blk_mig_bytes_transferred();
+info-disk-remaining = blk_mig_bytes_remaining();
+info-disk-total = blk_mig_bytes_total();
+}
+}
+
 MigrationInfo *qmp_query_migrate(Error **errp)
 {
 MigrationInfo *info = g_malloc0(sizeof(*info));
@@ -197,26 +222,8 @@ MigrationInfo *qmp_query_migrate(Error **errp)
 info-has_setup_time = true;
 info-setup_time = s-setup_time;
 
-info-has_ram = true;
-info-ram = g_malloc0(sizeof(*info-ram));
-info-ram-transferred = ram_bytes_transferred();
-info-ram-remaining = ram_bytes_remaining();
-info-ram-total

[Qemu-devel] [RFC PATCH v1: 09/12] mc: core logic

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

This implements the core logic, all described in docs/mc.txt

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 migration-checkpoint.c | 1589 
 1 file changed, 1589 insertions(+)
 create mode 100644 migration-checkpoint.c

diff --git a/migration-checkpoint.c b/migration-checkpoint.c
new file mode 100644
index 000..14b03e8
--- /dev/null
+++ b/migration-checkpoint.c
@@ -0,0 +1,1589 @@
+/*
+ *  Copyright (C) 2014 Michael R. Hines mrhi...@us.ibm.com
+ *
+ *  Micro-Checkpointing (MC) support 
+ *  (a.k.a. Fault Tolerance or Continuous Replication)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; under version 2 of the License.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see http://www.gnu.org/licenses/.
+ */
+#include libnl3/netlink/route/qdisc/plug.h
+#include libnl3/netlink/route/class.h
+#include libnl3/netlink/cli/utils.h
+#include libnl3/netlink/cli/tc.h
+#include libnl3/netlink/cli/qdisc.h
+#include libnl3/netlink/cli/link.h
+#include qemu-common.h
+#include hw/virtio/virtio.h
+#include hw/virtio/virtio-net.h
+#include qemu/sockets.h
+#include migration/migration.h
+#include migration/qemu-file.h
+#include qmp-commands.h
+#include net/tap-linux.h
+#include sys/ioctl.h
+
+#define DEBUG_MC
+//#define DEBUG_MC_VERBOSE
+//#define DEBUG_MC_REALLY_VERBOSE
+
+#ifdef DEBUG_MC
+#define DPRINTF(fmt, ...) \
+do { printf(mc:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+#ifdef DEBUG_MC_VERBOSE
+#define DDPRINTF(fmt, ...) \
+do { printf(mc:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DDPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+#ifdef DEBUG_MC_REALLY_VERBOSE
+#define DDDPRINTF(fmt, ...) \
+do { printf(mc:  fmt, ## __VA_ARGS__); } while (0)
+#else
+#define DDDPRINTF(fmt, ...) \
+do { } while (0)
+#endif
+
+#define MBPS(bytes, time) time ? double) bytes * 8) \
+/ ((double) time / 1000.0)) / 1000.0 / 1000.0) : -1.0
+
+/*
+ * Micro checkpoints (MC)s are typically only a few MB when idle.
+ * However, they can easily be very large during heavy workloads.
+ * In the *extreme* worst-case, QEMU might need double the amount of main memory
+ * than that of what was originally allocated to the virtual machine.
+ *
+ * To support this variability during transient periods, a MC
+ * consists of a linked list of slabs, each of identical size. A better name
+ * would be welcome, as the name was only chosen because it resembles linux
+ * memory allocation. Because MCs occur several times per second 
+ * (a frequency of 10s of milliseconds), slabs allow MCs to grow and shrink 
+ * without constantly re-allocating all memory in place during each checkpoint.
+ *
+ * During steady-state, the 'head' slab is permanently allocated and never goes
+ * away, so when the VM is idle, there is no memory allocation at all.
+ * This design supports the use of RDMA. Since RDMA requires memory pinning, we
+ * must be able to hold on to a slab for a reasonable amount of time to get any
+ * real use out of it.
+ *
+ * Regardless, the current strategy taken is:
+ * 
+ * 1. If the checkpoint size increases,
+ *then grow the number of slabs to support it.
+ * 2. If the next checkpoint size is smaller than the last one,
+ *then that's a strike.
+ * 3. After N strikes, cut the size of the slab cache in half
+ *(to a minimum of 1 slab as described before).
+ *
+ * As of this writing, a typical average size of 
+ * an Idle-VM checkpoint is under 5MB.
+ */
+
+#define MC_SLAB_BUFFER_SIZE (5UL * 1024UL * 1024UL) /* empirical */
+#define MC_DEV_NAME_MAX_SIZE256
+
+#define MC_DEFAULT_CHECKPOINT_FREQ_MS 100 /* too slow, but best for now */
+#define CALC_MAX_STRIKES()   \
+do {  max_strikes = (max_strikes_delay_secs * 1000) / freq_ms; } \
+while (0)
+
+/*
+ * How many seconds-worth of checkpoints to wait before re-evaluating the size
+ * of the slab cache?
+ *
+ * #strikes_until_shrink_cache = Function(#checkpoints/sec)
+ *
+ * Increasing the number of seconds, increases the number of strikes needed to
+ * be reached until it is time to cut the cache in half.
+ *
+ * The value below is open for debate - we just want it to be small enough to ensure
+ * that a large, idle cache doesn't stay too large for too long.
+ */
+#define MC_DEFAULT_SLAB_MAX_CHECK_DELAY_SECS 10
+
+/* 
+ * MC serializes the actual RAM page contents in such a way that

[Qemu-devel] [RFC PATCH v1: 12/12] mc: activate and use MC core logic if requested

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

Building on the previous patches, this finally actually
activates protection of the VM by kicking off an MC thread
after the initial live migration completes. The live migration
thread will get destroyed and the MC thread will run and never die.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 migration.c | 21 -
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/migration.c b/migration.c
index 8e0827e..15ad264 100644
--- a/migration.c
+++ b/migration.c
@@ -94,6 +94,9 @@ static void process_incoming_migration_co(void *opaque)
 int ret;
 
 ret = qemu_loadvm_state(f);
+if (ret = 0) {
+mc_process_incoming_checkpoints_if_requested(f);
+}
 qemu_fclose(f);
 if (ret  0) {
 fprintf(stderr, load of migration failed\n);
@@ -670,11 +673,27 @@ static void *migration_thread(void *opaque)
 s-downtime = end_time - start_time;
 runstate_set(RUN_STATE_POSTMIGRATE);
 } else {
+if(migrate_use_mc()) {
+qemu_fflush(s-file);
+if (migrate_use_mc_net()) {
+if (mc_enable_buffering()  0 ||
+mc_start_buffer()  0) {
+migrate_set_state(s, MIG_STATE_ACTIVE, MIG_STATE_ERROR);
+}
+}
+}
+
 if (old_vm_running) {
 vm_start();
 }
 }
-qemu_bh_schedule(s-cleanup_bh);
+
+if (migrate_use_mc()  s-state != MIG_STATE_ERROR) {
+mc_init_checkpointer(s);
+} else {
+qemu_bh_schedule(s-cleanup_bh);
+}
+
 qemu_mutex_unlock_iothread();
 
 return NULL;
-- 
1.8.1.2

[Qemu-devel] [RFC PATCH v1: 11/12] mc: register MC qemu-file functions and expose MC tunable capability

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

The capability allows management software to throttle the MC frequency
during VM application transience.

The qemu-file savevm() functions inform the destination that the incoming
traffic is MC-specific traffic and not vanilla live-migration traffic.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 hmp-commands.hx  | 14 ++
 hmp.c|  6 ++
 hmp.h|  1 +
 qapi-schema.json | 13 +
 qmp-commands.hx  | 23 +++
 vl.c |  3 +++
 6 files changed, 60 insertions(+)

diff --git a/hmp-commands.hx b/hmp-commands.hx
index caae5ad..7db0597 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -960,6 +960,20 @@ Set maximum tolerated downtime (in seconds) for migration.
 ETEXI
 
 {
+.name   = migrate-set-mc-delay,
+.args_type  = value:i,
+.params = value,
+.help   = set maximum delay (in milliseconds) between 
micro-checkpoints,
+.mhandler.cmd = hmp_migrate_set_mc_delay,
+},
+
+STEXI
+@item migrate-set-mc-delay @var{value}
+@findex migrate-set-mc-delay
+Set maximum delay (in milliseconds) between micro-checkpoints.
+ETEXI
+
+{
 .name   = migrate_set_capability,
 .args_type  = capability:s,state:b,
 .params = capability state,
diff --git a/hmp.c b/hmp.c
index 43896e9..8e89ac7 100644
--- a/hmp.c
+++ b/hmp.c
@@ -1026,6 +1026,12 @@ void hmp_migrate_set_downtime(Monitor *mon, const QDict 
*qdict)
 qmp_migrate_set_downtime(value, NULL);
 }
 
+void hmp_migrate_set_mc_delay(Monitor *mon, const QDict *qdict)
+{
+int64_t value = qdict_get_int(qdict, value);
+qmp_migrate_set_mc_delay(value, NULL);
+}
+
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict)
 {
 int64_t value = qdict_get_int(qdict, value);
diff --git a/hmp.h b/hmp.h
index 54cf71f..b6548a3 100644
--- a/hmp.h
+++ b/hmp.h
@@ -60,6 +60,7 @@ void hmp_drive_mirror(Monitor *mon, const QDict *qdict);
 void hmp_drive_backup(Monitor *mon, const QDict *qdict);
 void hmp_migrate_cancel(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_downtime(Monitor *mon, const QDict *qdict);
+void hmp_migrate_set_mc_delay(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_speed(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
diff --git a/qapi-schema.json b/qapi-schema.json
index e0a430c..2ed8098 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2135,6 +2135,19 @@
 { 'command': 'migrate_set_downtime', 'data': {'value': 'number'} }
 
 ##
+# @migrate-set-mc-delay
+#
+# Set delay (in milliseconds) between micro checkpoints.
+#
+# @value: maximum delay in milliseconds 
+#
+# Returns: nothing on success
+#
+# Since: 1.6
+##
+{ 'command': 'migrate-set-mc-delay', 'data': {'value': 'int'} }
+
+##
 # @migrate_set_speed
 #
 # Set maximum speed for migration.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index fba15cd..6d7ef2f 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -754,6 +754,29 @@ Example:
 EQMP
 
 {
+.name   = migrate-set-mc-delay,
+.args_type  = value:i,
+.mhandler.cmd_new = qmp_marshal_input_migrate_set_mc_delay,
+},
+
+SQMP
+migrate-set-mc-delay
+
+
+Set maximum delay (in milliseconds) between micro-checkpoints.
+
+Arguments:
+
+- value: maximum delay (json-int)
+
+Example:
+
+- { execute: migrate-set-mc-delay, arguments: { value: 100 } }
+- { return: {} }
+
+EQMP
+
+{
 .name   = client_migrate_info,
 .args_type  = 
protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?,
 .params = protocol hostname port tls-port cert-subject,
diff --git a/vl.c b/vl.c
index 74d52ab..fa23d66 100644
--- a/vl.c
+++ b/vl.c
@@ -29,6 +29,7 @@
 #include sys/time.h
 #include zlib.h
 #include qemu/bitmap.h
+#include migration/qemu-file.h
 
 /* Needed early for CONFIG_BSD etc. */
 #include config-host.h
@@ -4192,6 +4193,8 @@ int main(int argc, char **argv, char **envp)
 default_drive(default_sdcard, snapshot, IF_SD, 0, SD_OPTS);
 
 register_savevm_live(NULL, ram, 0, 4, savevm_ram_handlers, NULL);
+register_savevm(NULL, mc, -1, MC_VERSION, mc_info_save, 
+mc_info_load, NULL); 
 
 if (nb_numa_nodes  0) {
 int i;
-- 
1.8.1.2

[Qemu-devel] [RFC PATCH v1: 05/12] migration: support custom page loading

2013-10-20 Thread mrhines

From: Michael R. Hines mrhi...@us.ibm.com

Just as RDMA has custom routines for saving memory,
this provides us with custom routines for loading memory.

Micro-checkpointing needs this support in order to be able
to handle loading of the latest checkpoint into memory
as they are received from the network.

Signed-off-by: Michael R. Hines mrhi...@us.ibm.com
---
 arch_init.c   | 17 -
 include/migration/migration.h | 12 ++--
 include/migration/qemu-file.h | 16 ++--
 savevm.c  | 27 ---
 4 files changed, 60 insertions(+), 12 deletions(-)

diff --git a/arch_init.c b/arch_init.c
index b139512..9cf7d18 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -684,7 +684,7 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 /* In doubt sent page as normal */
 bytes_sent = -1;
 ret = ram_control_save_page(f, block-offset,
-   offset, TARGET_PAGE_SIZE, bytes_sent);
+   block-host, offset, TARGET_PAGE_SIZE, bytes_sent);
 
 if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
 if (ret != RAM_SAVE_CONTROL_DELAYED) {
@@ -712,9 +712,11 @@ static int ram_save_block(QEMUFile *f, bool last_stage)
 /* XBZRLE overflow or normal page */
 if (bytes_sent == -1) {
 bytes_sent = save_block_hdr(f, block, offset, cont, 
RAM_SAVE_FLAG_PAGE);
-qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
-bytes_sent += TARGET_PAGE_SIZE;
-acct_info.norm_pages++;
+if (ret != RAM_SAVE_CONTROL_DELAYED) {
+qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
+bytes_sent += TARGET_PAGE_SIZE;
+acct_info.norm_pages++;
+}
 }
 
 /* if page is unmodified, continue to the next */
@@ -1133,13 +1135,18 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
 } else if (flags  RAM_SAVE_FLAG_PAGE) {
 void *host;
+int r;
 
 host = host_from_stream_offset(f, addr, flags);
 if (!host) {
 return -EINVAL;
 }
 
-qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
+r = ram_control_load_page(f, host, TARGET_PAGE_SIZE);
+
+if (r == RAM_LOAD_CONTROL_NOT_SUPP) {
+qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
+}
 } else if (flags  RAM_SAVE_FLAG_XBZRLE) {
 void *host = host_from_stream_offset(f, addr, flags);
 if (!host) {
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 3ad06c5..ac1b438 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -169,9 +169,17 @@ void ram_control_load_hook(QEMUFile *f, uint64_t flags);
 
 #define RAM_SAVE_CONTROL_NOT_SUPP -1000
 #define RAM_SAVE_CONTROL_DELAYED  -2000
+#define RAM_LOAD_CONTROL_NOT_SUPP -3000
+#define RAM_LOAD_CONTROL_DELAYED  -4000
 
-size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
- ram_addr_t offset, size_t size,
+#define RDMA_CONTROL_VERSION_CURRENT 1
+
+int ram_control_save_page(QEMUFile *f, ram_addr_t block_offset,
+ uint8_t *host_addr,
+ ram_addr_t offset, long size,
  int *bytes_sent);
 
+int ram_control_load_page(QEMUFile *f,
+ void *host_addr,
+ long size);
 #endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 0f757fb..d396b40 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -76,12 +76,22 @@ typedef int (QEMURamHookFunc)(QEMUFile *f, void *opaque, 
uint64_t flags);
  * This function allows override of where the RAM page
  * is saved (such as RDMA, for example.)
  */
-typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
+typedef int (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
ram_addr_t block_offset,
+   uint8_t *host_addr,
ram_addr_t offset,
-   size_t size,
+   long size,
int *bytes_sent);
 
+/*
+ * This function allows override of where the RAM page
+ * is loaded from (such as RDMA, for example.)
+ */
+typedef int (QEMURamLoadFunc)(QEMUFile *f,
+   void *opaque,
+   void *host_addr,
+   long size);
+
 typedef struct QEMUFileOps {
 QEMUFilePutBufferFunc *put_buffer;
 QEMUFileGetBufferFunc *get_buffer;
@@ -92,12 +102,14 @@ typedef struct QEMUFileOps {
 QEMURamHookFunc *after_ram_iterate;
 QEMURamHookFunc

Re: [Qemu-devel] [PATCH V14 00/11] Add support for binding guest numa nodes to host numa nodes

2013-10-20 Thread Wanlong Gao

Hi folks,

Any more comments?

Thanks,
Wanlong Gao

 As you know, QEMU can't direct its memory allocation now; this may cause
 a guest cross-node access performance regression.
 And, the worse thing is that if PCI-passthrough is used,
 direct-attached-device uses DMA transfer between device and qemu process.
 All pages of the guest will be pinned by get_user_pages().
 
 KVM_ASSIGN_PCI_DEVICE ioctl
   kvm_vm_ioctl_assign_device()
 =kvm_assign_device()
   = kvm_iommu_map_memslots()
 = kvm_iommu_map_pages()
= kvm_pin_pages()
 
 So, with direct-attached-device, every guest page's page count will be +1 and
 any page migration will not work. AutoNUMA won't either.
 
 So, we should set the guest nodes memory allocation policy before
 the pages are really mapped.
 
 According to this patch set, we are able to set guest nodes memory policy
 like following:
 
  -numa node,nodeid=0,cpus=0, \
  -numa mem,size=1024M,policy=membind,host-nodes=0-1 \
  -numa node,nodeid=1,cpus=1 \
  -numa mem,size=1024M,policy=interleave,host-nodes=1
 
 This supports 
 policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N 
 like format.
 
 And add a QMP command query-numa to show numa info through
 this API.
 
 And convert the info numa monitor command to use this
 QMP command query-numa.
 
 This version removes set-mem-policy qmp and hmp commands temporarily
 as Marcelo and Paolo suggested.
 
 V1-V2:
 change to use QemuOpts in numa options (Paolo)
 handle Error in mpol parser (Paolo)
 change qmp command format to mem-policy=membind,mem-hostnode=0-1 like 
 (Paolo)
 V2-V3:
 also handle Error in cpus parser (5/10)
 split out common parser from cpus and hostnode parser (Bandan 6/10)
 V3-V4:
 rebase to request for comments
 V4-V5:
 use OptVisitor and split -numa option (Paolo)
  - s/set-mpol/set-mem-policy (Andreas)
  - s/mem-policy/policy
  - s/mem-hostnode/host-nodes
 fix hmp command process after error (Luiz)
 add qmp command query-numa and convert info numa to it (Luiz)
 V5-V6:
 remove tabs in json file (Laszlo, Paolo)
 add back -numa node,mem=xxx as legacy (Paolo)
 change cpus and host-nodes to array (Laszlo, Eric)
 change nodeid to uint16
 add NumaMemPolicy enum type (Eric)
 rebased on Laszlo's OptsVisitor: support / flatten integer ranges for 
 repeating options patch set, thanks for Laszlo's help
 V6-V7:
 change UInt16 to uint16 (Laszlo)
 fix a typo in adding qmp command set-mem-policy
 V7-V8:
 rebase to current master with Laszlo's V2 of OptsVisitor patch set
 fix an adding white space line error
 V8-V9:
 rebase to current master
 check if total numa memory size is equal to ram_size (Paolo)
 add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo)
 replace the use of numa_num_configured_nodes() (Andrew)
 avoid abusing the fact i==nodeid (Andrew)
 V9-V10:
 rebase to current master
 remove libnuma (Andrew)
 MAX_NODES=64 - MAX_NODES=128 since libnuma selected 128 (Andrew)
 use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew)
 remove a useless clear_bit() operation (Andrew)
 V10-V11:
 rebase to current master
 fix maxnode argument of mbind(2)
 V11-V12:
 rebase to current master
 split patch 02/11 of V11 (Eduardo)
 add some max value check (Eduardo)
 split MAX_NODES change patch (Eduardo)
 V12-V13:
 rebase to current master
 thanks for Luiz's review (Luiz)
 doc hmp command set-mem-policy (Luiz)
 rename: NUMAInfo - NUMANode (Luiz)
 V13-V14:
 remove set-mem-policy qmp and hmp commands (Marcelo, Paolo)
 
 
 *I hope this can catch up the train of 1.7.*
 
 Thanks,
 Wanlong Gao
 
 Wanlong Gao (11):
   NUMA: move numa related code to new file numa.c
   NUMA: check if the total numa memory size is equal to ram_size
   NUMA: Add numa_info structure to contain numa nodes info
   NUMA: convert -numa option to use OptsVisitor
   NUMA: introduce NumaMemOptions
   NUMA: add -numa mem, options
   NUMA: expand MAX_NODES from 64 to 128
   NUMA: parse guest numa nodes memory policy
   NUMA: set guest numa nodes memory policy
   NUMA: add qmp command query-numa
   NUMA: convert hmp command info_numa to use qmp command query_numa
 
  Makefile.target |   2 +-
  cpus.c  |  14 --
  hmp.c   |  57 +++
  hmp.h   |   1 +
  hw/i386/pc.c|   4 +-
  include/sysemu/cpus.h   |   1 -
  include/sysemu/sysemu.h |  18 ++-
  monitor.c   |  21 +--
  numa.c  | 395 
 
  qapi-schema.json| 112 ++
  qemu-options.hx |   6 +-
  qmp-commands.hx |  48 ++
  vl.c| 160 +++-
  13 files changed, 654 insertions(+), 185 deletions(-)
  create mode 100644 numa.c

Re: [Qemu-devel] [PATCH 02/13] block: do not include monitor.h in block.c

2013-10-20 Thread Wenchao Xia


于 2013/10/18 17:36, Paolo Bonzini 写道:

Il 18/10/2013 03:11, Wenchao Xia ha scritto:

block_int.h already included it.

Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
Reviewed-by: Eric Blake ebl...@redhat.com
---
  block.c |1 -
  1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/block.c b/block.c
index 2c15e5d..e92a556 100644
--- a/block.c
+++ b/block.c
@@ -24,7 +24,6 @@
  #include config-host.h
  #include qemu-common.h
  #include trace.h
-#include monitor/monitor.h
  #include block/block_int.h
  #include block/blockjob.h
  #include qemu/module.h


Does this cause problems?  block.c uses monitor_protocol_event, so it's
good to include the file directly instead of relying on other header files.

Paolo


OK, will drop this patch.

Re: [Qemu-devel] [PATCH 03/13] qapi: move MonitorEvent define

2013-10-20 Thread Wenchao Xia


于 2013/10/18 20:38, Eric Blake 写道:

On 10/18/2013 03:36 AM, Paolo Bonzini wrote:

Il 18/10/2013 03:11, Wenchao Xia ha scritto:

Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
---
  include/monitor/monitor.h |   38 +-
  include/qapi/qmp/qevent.h |   66 +
  include/qapi/qmp/types.h  |1 +
  3 files changed, 68 insertions(+), 37 deletions(-)
  create mode 100644 include/qapi/qmp/qevent.h

Please move it qemu-schema.json instead.

qapi-schema.json, but yes, I agree that declaring it as a qapi enum and
letting the code generator create the constants, rather than moving the
hand-maintained constants into a new header, is smarter.


   I will try to use qapi-schema.json in the next version.

Re: [Qemu-devel] [PATCH 05/13] error: define struct Error in only one place

2013-10-20 Thread Wenchao Xia

于 2013/10/18 19:22, Markus Armbruster 写道:
 Paolo Bonzini pbonz...@redhat.com writes:

 Il 18/10/2013 03:11, Wenchao Xia ha scritto:
 Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
 ---
  include/qapi/error.h |5 -
  qobject/qerror.c |7 ---
  util/error.c |6 --
  3 files changed, 4 insertions(+), 14 deletions(-)

 diff --git a/include/qapi/error.h b/include/qapi/error.h
 index 7d4c696..8688aaf 100644
 --- a/include/qapi/error.h
 +++ b/include/qapi/error.h
 @@ -20,7 +20,10 @@
   * A class representing internal errors within QEMU.  An error has a 
 ErrorClass
   * code and a human message.
   */
 -typedef struct Error Error;
 +typedef struct Error {
 +char *msg;
 +ErrorClass err_class;
 +} Error;
 Please add a comment that it should be treated as an opaque type.
 Or keep it opaque here, and complete the type in an internal header.
 But see below.

 Paolo

  
  /**
   * Set an indirect pointer to an error given a ErrorClass value and a
 diff --git a/qobject/qerror.c b/qobject/qerror.c
 index 3aee1cf..5b487f3 100644
 --- a/qobject/qerror.c
 +++ b/qobject/qerror.c
 @@ -97,13 +97,6 @@ void qerror_report(ErrorClass eclass, const char *fmt, 
 ...)
  }
  }
  
 -/* Evil... */
 -struct Error
 -{
 -char *msg;
 -ErrorClass err_class;
 -};
 -
  void qerror_report_err(Error *err)
  {
  QError *qerr;
 qerr = qerror_new();
 loc_save(qerr-loc);
 qerr-err_msg = g_strdup(err-msg);
 qerr-err_class = err-err_class;

 if (monitor_cur_is_qmp()) {
 monitor_set_error(cur_mon, qerr);
 } else {
 qerror_print(qerr);
 QDECREF(qerr);
 }
 }

 This is the only use of the evil duplicate.  I suspect it could be
 cleaned up like this:

 qerr-err_msg = g_strdup(error_get_pretty(err));
 qerr-err_class = error_get_class(err);

 If that's true, the duplicate goes away, and we can keep the type
 opaque.
seems a smart idea, will use it.


 diff --git a/util/error.c b/util/error.c
 index ec0faa6..da0d221 100644
 --- a/util/error.c
 +++ b/util/error.c
 @@ -17,12 +17,6 @@
  #include qapi-types.h
  #include qapi/qmp/qerror.h
  
 -struct Error
 -{
 -char *msg;
 -ErrorClass err_class;
 -};
 -
  void error_set(Error **errp, ErrorClass err_class, const char *fmt, ...)
  {
  Error *err;

Re: [Qemu-devel] [PATCH 08/13] error: don't set sep when print progname

2013-10-20 Thread Wenchao Xia

于 2013/10/18 19:40, Markus Armbruster 写道:
 Paolo Bonzini pbonz...@redhat.com writes:

 Il 18/10/2013 03:11, Wenchao Xia ha scritto:
 Setting sep makes later modification troublesome;
 the logic is not changed by adding a trailing space in fprintf().

 Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
 ---
  util/qemu-error.c |5 ++---
  1 files changed, 2 insertions(+), 3 deletions(-)

 diff --git a/util/qemu-error.c b/util/qemu-error.c
 index 0ccd3e9..d1e858a 100644
 --- a/util/qemu-error.c
 +++ b/util/qemu-error.c
 @@ -161,8 +161,7 @@ static void error_print_loc(void)
  const char *const *argp;
  
  if (!cur_mon  progname) {
 -fprintf(stderr, %s:, progname);
 -sep =  ;
 +fprintf(stderr, %s: , progname);
  }
  switch (cur_loc-kind) {
  case LOC_CMDLINE:
 @@ -181,7 +180,7 @@ static void error_print_loc(void)
  error_printf( );
  break;
  default:
 -error_printf(%s, sep);
 +break;
  }
  }
  

 This changes behavior for LOC_FILE.

 Before:

 $ cat xyz.cfg
 [device abc]
 driver = def
 $ qemu-system-x86_64 -readconfig xyz.cfg
 qemu-system-x86_64:xyz.cfg:2: parse error

 After:

 $ qemu-system-x86_64 -readconfig xyz.cfg
 qemu-system-x86_64: xyz.cfg:2: parse error

 Could even be an improvement, but you need to note it in the commit message.
 No, it is not an improvement.  The old format matches exactly how other
 report errors with location, e.g. jade.  Please leave it that way,

I'll check whether there is way to leave the logic as it was.

Re: [Qemu-devel] [PATCH 09/13] error: print progname with error_vprintf()

2013-10-20 Thread Wenchao Xia


于 2013/10/18 17:44, Paolo Bonzini 写道:

Il 18/10/2013 03:11, Wenchao Xia ha scritto:

This removes the additional code path deciding where to print the error;
error_vprintf() is the only controller now, making future changes
easier.

The logic is not changed since when cur_mon = NULL, error_vprintf()
will still print to stderr.

Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
---
  util/qemu-error.c |   11 ++-
  1 files changed, 10 insertions(+), 1 deletions(-)

diff --git a/util/qemu-error.c b/util/qemu-error.c
index d1e858a..c29fcbd 100644
--- a/util/qemu-error.c
+++ b/util/qemu-error.c
@@ -151,6 +151,15 @@ const char *error_get_progname(void)
  return progname;
  }
  
+static void error_print_progname(const char *fmt, ...)

+{
+va_list ap;
+
+va_start(ap, fmt);
+error_vprintf(fmt, ap);
+va_end(ap);
+}
+
  /*
   * Print current location to current monitor if we have one, else to stderr.
   */
@@ -161,7 +170,7 @@ static void error_print_loc(void)
  const char *const *argp;
  
  if (!cur_mon  progname) {

-fprintf(stderr, %s: , progname);
+error_print_progname(%s: , progname);
  }
  switch (cur_loc-kind) {
  case LOC_CMDLINE:


I agree that using fprintf looks odd, but why not use error_printf directly?

Paolo

I used a custom function because I have a follow-up modification in my
private branch. Since it has not been sent, I will use error_printf(),
which is more straightforward.

Re: [Qemu-devel] [PATCH 10/13] qerror: deref once in qerror_report()

2013-10-20 Thread Wenchao Xia


于 2013/10/18 17:46, Paolo Bonzini 写道:

Il 18/10/2013 03:11, Wenchao Xia ha scritto:

Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
---
  qobject/qerror.c |1 -
  1 files changed, 0 insertions(+), 1 deletions(-)

diff --git a/qobject/qerror.c b/qobject/qerror.c
index 5b487f3..685167a 100644
--- a/qobject/qerror.c
+++ b/qobject/qerror.c
@@ -77,7 +77,6 @@ static void qerror_print(QError *qerror)
  loc_push_restore(qerror-loc);
  error_report(%s, qstring_get_str(qstring));
  loc_pop(qerror-loc);
-QDECREF(qstring);
  }
  
  void qerror_report(ErrorClass eclass, const char *fmt, ...)



Why isn't this a memory leak?

Paolo

My bad, I mistook QDECREF(qstring) for QDECREF(qerror); I will drop this
patch. Sorry for the disturbance.

Re: [Qemu-devel] [PATCH 12/13] monitor: hide *cur_mon in monitor_get_fd()

2013-10-20 Thread Wenchao Xia


于 2013/10/18 17:51, Paolo Bonzini 写道:

Il 18/10/2013 03:11, Wenchao Xia ha scritto:

All existing callers pass *cur_mon as the parameter, and *cur_mon
is an internal variable which is used inside monitor.c. This patch reduces
the exposure of details in monitor.c by introducing a new function,
monitor_get_fd_cur(), and making the old one static.

Signed-off-by: Wenchao Xia xiaw...@linux.vnet.ibm.com
---
  dump.c|2 +-
  include/monitor/monitor.h |2 +-
  migration-fd.c|2 +-
  monitor.c |7 ++-
  qmp.c |2 +-
  stubs/get-fd.c|2 +-
  util/qemu-sockets.c   |4 ++--
  7 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/dump.c b/dump.c
index 846155c..8f5b6b0 100644
--- a/dump.c
+++ b/dump.c
@@ -860,7 +860,7 @@ void qmp_dump_guest_memory(bool paging, const char *file, 
bool has_begin,
  
  #if !defined(WIN32)

  if (strstart(file, fd:, p)) {
-fd = monitor_get_fd(cur_mon, p, errp);
+fd = monitor_get_fd_cur(p, errp);
  if (fd == -1) {
  return;
  }
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 97fcee3..637f7f3 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -35,7 +35,7 @@ int monitor_read_block_device_key(Monitor *mon, const char 
*device,
BlockDriverCompletionFunc *completion_cb,
void *opaque);
  
-int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp);

+int monitor_get_fd_cur(const char *fdname, Error **errp);
  int monitor_handle_fd_param(Monitor *mon, const char *fdname);
  
  void monitor_vprintf(Monitor *mon, const char *fmt, va_list ap)

diff --git a/migration-fd.c b/migration-fd.c
index d2e523a..022bc50 100644
--- a/migration-fd.c
+++ b/migration-fd.c
@@ -33,7 +33,7 @@
  
  void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp)

  {
-int fd = monitor_get_fd(cur_mon, fdname, errp);
+int fd = monitor_get_fd_cur(fdname, errp);
  if (fd == -1) {
  return;
  }
diff --git a/monitor.c b/monitor.c
index 9377834..80a9dfd 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2290,7 +2290,7 @@ static void do_loadvm(Monitor *mon, const QDict *qdict)
  }
  }
  
-int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp)

+static int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp)
  {
  mon_fd_t *monfd;
  
@@ -2315,6 +2315,11 @@ int monitor_get_fd(Monitor *mon, const char *fdname, Error **errp)

  return -1;
  }
  
+int monitor_get_fd_cur(const char *fdname, Error **errp)

+{
+return monitor_get_fd(cur_mon, fdname, errp);
+}
+
  static void monitor_fdset_cleanup(MonFdset *mon_fdset)
  {
  MonFdsetFd *mon_fdset_fd;
diff --git a/qmp.c b/qmp.c
index 4c149b3..a02804b 100644
--- a/qmp.c
+++ b/qmp.c
@@ -493,7 +493,7 @@ void qmp_add_client(const char *protocol, const char 
*fdname,
  CharDriverState *s;
  int fd;
  
-fd = monitor_get_fd(cur_mon, fdname, errp);

+fd = monitor_get_fd_cur(fdname, errp);
  if (fd < 0) {
  return;
  }
diff --git a/stubs/get-fd.c b/stubs/get-fd.c
index 9f2c65c..7d9ec3b 100644
--- a/stubs/get-fd.c
+++ b/stubs/get-fd.c
@@ -1,7 +1,7 @@
  #include "qemu-common.h"
  #include "monitor/monitor.h"
  
-int monitor_get_fd(Monitor *mon, const char *name, Error **errp)

+int monitor_get_fd_cur(const char *name, Error **errp)
  {
  error_setg(errp, "only QEMU supports file descriptor passing");
  return -1;
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 6b97dc1..9cd85dd 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -902,7 +902,7 @@ int socket_connect(SocketAddress *addr, Error **errp,
  break;
  
  case SOCKET_ADDRESS_KIND_FD:

-fd = monitor_get_fd(cur_mon, addr->fd->str, errp);
+fd = monitor_get_fd_cur(addr->fd->str, errp);
  if (fd >= 0 && callback) {
  qemu_set_nonblock(fd);
  callback(fd, opaque);
@@ -934,7 +934,7 @@ int socket_listen(SocketAddress *addr, Error **errp)
  break;
  
  case SOCKET_ADDRESS_KIND_FD:

-fd = monitor_get_fd(cur_mon, addr->fd->str, errp);
+fd = monitor_get_fd_cur(addr->fd->str, errp);
  break;
  
  default:



Doesn't seem like an improvement.  It would be if you could then make
cur_mon static.

Paolo


OK, I will check all code using cur_mon and make it static.

[Qemu-devel] can we create complete image or start a vm from a snapshot point

2013-10-20 Thread yue-kvm

Hi all,
Can we create a complete image, or start a VM, from a snapshot point?


thanks

60 matches

Mail list logo