diff -ruN net-next-p1/drivers/vhost/net.c net-next-p2/drivers/vhost/net.c
--- net-next-p1/drivers/vhost/net.c	2010-03-01 11:44:22.000000000 -0800
+++ net-next-p2/drivers/vhost/net.c	2010-03-02 13:01:34.000000000 -0800
@@ -109,7 +109,6 @@
 	};
 	size_t len, total_len = 0;
 	int err, wmem;
-	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
 	if (!sock)
 		return;
@@ -124,7 +123,6 @@
 
 	if (wmem < sock->sk->sk_sndbuf * 2)
 		tx_poll_stop(net);
-	hdr_size = vq->hdr_size;
 
 	for (;;) {
 		head.iov_base = (void *)vhost_get_vq_desc(&net->dev, vq,
@@ -148,25 +146,45 @@
 			       "out %d, int %d\n", out, in);
 			break;
 		}
+		if (vq->guest_hlen > vq->sock_hlen) {
+			if (msg.msg_iov[0].iov_len == vq->guest_hlen)
+				msg.msg_iov[0].iov_len = vq->sock_hlen;
+			else if (out == ARRAY_SIZE(vq->iov))
+				vq_err(vq, "handle_tx iov overflow!\n");
+			else {
+				int i;
+
+				/* give header its own iov: shift iov[0..out-1] up */
+				for (i = out; i > 0; --i)
+					msg.msg_iov[i] = msg.msg_iov[i - 1];
+				msg.msg_iov[0].iov_len = vq->sock_hlen;
+				msg.msg_iov[1].iov_base += vq->guest_hlen;
+				msg.msg_iov[1].iov_len -= vq->guest_hlen;
+				out++;
+			}
+		}
 		/* Skip header. TODO: support TSO. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, out);
 		msg.msg_iovlen = out;
 		head.iov_len = len = iov_length(vq->iov, out);
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for TX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       len, vq->guest_hlen);
 			break;
 		}
 		/* TODO: Check specific error and bomb out unless ENOBUFS? */
 		err = sock->ops->sendmsg(NULL, sock, &msg, len);
 		if (unlikely(err < 0)) {
-			vhost_discard(vq, 1);
-			tx_poll_start(net, sock);
+			if (err == -EAGAIN) {
+				tx_poll_start(net, sock);
+			} else {
+				vq_err(vq, "sendmsg: errno %d\n", -err);
+				/* drop packet; do not discard/resend */
+				vhost_add_used_and_signal(&net->dev,vq,&head,1);
+			}
 			break;
-		}
-		if (err != len)
+		} else if (err != len)
 			pr_err("Truncated TX packet: "
 			       " len %d != %zd\n", err, len);
 		vhost_add_used_and_signal(&net->dev, vq, &head, 1);
@@ -207,14 +225,8 @@
 		.msg_flags = MSG_DONTWAIT,
 	};
 
-	struct virtio_net_hdr hdr = {
-		.flags = 0,
-		.gso_type = VIRTIO_NET_HDR_GSO_NONE
-	};
-
 	size_t len, total_len = 0;
 	int err, headcount, datalen;
-	size_t hdr_size;
 	struct socket *sock = rcu_dereference(vq->private_data);
 
 	if (!sock || !skb_head_len(&sock->sk->sk_receive_queue))
@@ -223,7 +235,6 @@
 	use_mm(net->dev.mm);
 	mutex_lock(&vq->mutex);
 	vhost_disable_notify(vq);
-	hdr_size = vq->hdr_size;
 
 	vq_log = unlikely(vhost_has_feature(&net->dev, VHOST_F_LOG_ALL)) ?
 		vq->log : NULL;
@@ -232,25 +243,18 @@
 		headcount = vhost_get_heads(vq, datalen, &in, vq_log, &log);
 		/* OK, now we need to know about added descriptors. */
 		if (!headcount) {
-			if (unlikely(vhost_enable_notify(vq))) {
-				/* They have slipped one in as we were
-				 * doing that: check again. */
-				vhost_disable_notify(vq);
-				continue;
-			}
-			/* Nothing new?  Wait for eventfd to tell us
-			 * they refilled. */
+			vhost_enable_notify(vq);
 			break;
 		}
 		/* Skip header. TODO: support TSO/mergeable rx buffers. */
-		s = move_iovec_hdr(vq->iov, vq->hdr, hdr_size, in);
 		msg.msg_iovlen = in;
 		len = iov_length(vq->iov, in);
+
 		/* Sanity check */
 		if (!len) {
 			vq_err(vq, "Unexpected header len for RX: "
 			       "%zd expected %zd\n",
-			       iov_length(vq->hdr, s), hdr_size);
+			       len, vq->guest_hlen);
 			break;
 		}
 		err = sock->ops->recvmsg(NULL, sock, &msg,
@@ -268,13 +272,7 @@
 			continue;
 		}
 		len = err;
-		err = memcpy_toiovec(vq->hdr, (unsigned char *)&hdr, hdr_size);
-		if (err) {
-			vq_err(vq, "Unable to write vnet_hdr at addr %p: %d\n",
-			       vq->iov->iov_base, err);
-			break;
-		}
-		len += hdr_size;
+		len += vq->guest_hlen - vq->sock_hlen;
 		vhost_add_used_and_signal(&net->dev, vq, vq->heads, headcount);
 		if (unlikely(vq_log))
 			vhost_log_write(vq, vq_log, log, len);
@@ -483,6 +481,13 @@
 	return ERR_PTR(-ENOTSOCK);
 }
 
+/* Nonzero iff @sock has an attached sk of type SOCK_RAW; NULL-safe. */
+static int vhost_sock_is_raw(struct socket *sock)
+{
+	return sock && sock->sk &&
+		sock->sk->sk_type == SOCK_RAW;
+}
+
 static long vhost_net_set_backend(struct vhost_net *n, unsigned index, int fd)
 {
 	struct socket *sock, *oldsock;
@@ -519,6 +524,20 @@
 
 	vhost_net_disable_vq(n, vq);
 	rcu_assign_pointer(vq->private_data, sock);
+
+	if (sock && sock->sk) {
+		if (!vhost_sock_is_raw(sock) ||
+		    vhost_has_feature(&n->dev, VHOST_NET_F_VIRTIO_NET_HDR)) {
+			vq->sock_hlen = sizeof(struct virtio_net_hdr);
+			if (vhost_has_feature(&n->dev, VIRTIO_NET_F_MRG_RXBUF))
+				vq->guest_hlen =
+					sizeof(struct virtio_net_hdr_mrg_rxbuf);
+			else
+				vq->guest_hlen = sizeof(struct virtio_net_hdr);
+		} else
+			vq->guest_hlen = vq->sock_hlen = 0;
+	} else
+		vq_err(vq, "vhost_net_set_backend: sock->sk is NULL");
 	vhost_net_enable_vq(n, vq);
 	mutex_unlock(&vq->mutex);
 done:
@@ -566,8 +585,17 @@
 	n->dev.acked_features = features;
 	smp_wmb();
 	for (i = 0; i < VHOST_NET_VQ_MAX; ++i) {
-		mutex_lock(&n->vqs[i].mutex);
-		n->vqs[i].hdr_size = hdr_size;
+		struct vhost_virtqueue *vq = n->vqs + i;
+		struct socket *sock = vq->private_data;
+
+		mutex_lock(&vq->mutex);
+		if (features & (1 << VIRTIO_NET_F_MRG_RXBUF))
+			vq->sock_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
+		else if (features & (1 << VHOST_NET_F_VIRTIO_NET_HDR) ||
+			 !vhost_sock_is_raw(sock))
+			vq->sock_hlen = sizeof(struct virtio_net_hdr);
+		else
+			vq->sock_hlen = 0;
 		mutex_unlock(&n->vqs[i].mutex);
 	}
 	vhost_net_flush(n);
diff -ruN net-next-p1/drivers/vhost/vhost.c net-next-p2/drivers/vhost/vhost.c
--- net-next-p1/drivers/vhost/vhost.c	2010-03-01 11:44:06.000000000 -0800
+++ net-next-p2/drivers/vhost/vhost.c	2010-03-02 12:53:02.000000000 -0800
@@ -113,7 +113,8 @@
 	vq->used_flags = 0;
 	vq->log_used = false;
 	vq->log_addr = -1ull;
-	vq->hdr_size = 0;
+	vq->guest_hlen = 0;
+	vq->sock_hlen = 0;
 	vq->private_data = NULL;
 	vq->log_base = NULL;
 	vq->error_ctx = NULL;
@@ -848,20 +849,85 @@
 	return 0;
 }
 
+/* Fetch the first RX buffer into iov[1..] and set iov[0] up as the vnet
+ * header slot when guest and socket header lengths differ. Returns 1 (the
+ * iov slot taken by the header) or 0 if no usable buffer was obtained.
+ */
+static int vhost_get_hdr(struct vhost_virtqueue *vq, int *in,
+			 struct vhost_log *log, unsigned int *log_num)
+{
+	struct iovec *heads = vq->heads;
+	struct iovec *iov = vq->iov;
+	int out;
+
+	*in = 0;
+	iov[0].iov_len = 0;
+	/* get buffer, starting from iov[1] */
+	heads[0].iov_base = (void *)vhost_get_vq_desc(vq->dev, vq,
+		vq->iov+1, ARRAY_SIZE(vq->iov)-1, &out, in, log, log_num);
+	/* vq->num means nothing was fetched: check before using out/in */
+	if (heads[0].iov_base == (void *)vq->num)
+		return 0;
+	if (out || *in <= 0) {
+		vq_err(vq, "unexpected descriptor format for RX: out %d, "
+			"in %d\n", out, *in);
+		return 0;
+	}
+	/* make iov[0] the header */
+	if (!vq->guest_hlen && vq->sock_hlen) {
+		static struct virtio_net_hdr junk; /* bit bucket */
+
+		iov[0].iov_base = &junk;
+		iov[0].iov_len = sizeof(junk);
+	}
+	if (vq->sock_hlen < vq->guest_hlen) {
+		iov[0].iov_base = iov[1].iov_base;
+		iov[0].iov_len = vq->sock_hlen;
+		/* guest_hlen bytes are carved off iov[1] below */
+		if (iov[1].iov_len < vq->guest_hlen) {
+			vq_err(vq, "can't fit header in one buffer!\n");
+			vhost_discard(vq, 1);
+			return 0;
+		}
+		if (!vq->sock_hlen) {
+			static const struct virtio_net_hdr_mrg_rxbuf hdr = {
+				.hdr.flags = 0,
+				.hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE
+			};
+			/* NOTE(review): should this be copy_to_user()? */
+			memcpy(iov[0].iov_base, &hdr, vq->guest_hlen);
+		}
+		iov[1].iov_base += vq->guest_hlen;
+		iov[1].iov_len -= vq->guest_hlen;
+	}
+	return 1;
+}
+
 unsigned vhost_get_heads(struct vhost_virtqueue *vq, int datalen, int *iovcount,
 	struct vhost_log *log, unsigned int *log_num)
 {
 	struct iovec *heads = vq->heads;
-	int out, in;
+	int out, in = 0;
+	int seg = 0;
 	int hc = 0;
 
+	if (vq->guest_hlen != vq->sock_hlen) {
+		seg = vhost_get_hdr(vq, &in, log, log_num);
+		if (!seg)
+			return 0;
+		hc++;
+		datalen -= iov_length(vq->iov+seg, in);
+		seg += in;
+	}
+
 	while (datalen > 0) {
 		if (hc >= VHOST_NET_MAX_SG) {
 			vhost_discard(vq, hc);
 			return 0;
 		}
 		heads[hc].iov_base = (void *)vhost_get_vq_desc(vq->dev, vq,
-			vq->iov, ARRAY_SIZE(vq->iov), &out, &in, log, log_num);
+			vq->iov+seg, ARRAY_SIZE(vq->iov)-seg, &out, &in,
+			log, log_num);
 		if (heads[hc].iov_base == (void *)vq->num) {
 			vhost_discard(vq, hc);
 			return 0;
@@ -872,11 +938,12 @@
 			vhost_discard(vq, hc);
 			return 0;
 		}
-		heads[hc].iov_len = iov_length(vq->iov, in);
-		hc++;
+		heads[hc].iov_len = iov_length(vq->iov+seg, in);
 		datalen -= heads[hc].iov_len;
+		hc++;
+		seg += in;
 	}
-	*iovcount = in;
+	*iovcount = seg;
 	return hc;
 }
 
diff -ruN net-next-p1/drivers/vhost/vhost.h net-next-p2/drivers/vhost/vhost.h
--- net-next-p1/drivers/vhost/vhost.h	2010-03-01 11:42:18.000000000 -0800
+++ net-next-p2/drivers/vhost/vhost.h	2010-03-02 13:02:03.000000000 -0800
@@ -82,10 +82,9 @@
 	u64 log_addr;
 
 	struct iovec indirect[VHOST_NET_MAX_SG];
-	struct iovec iov[VHOST_NET_MAX_SG];
-	struct iovec hdr[VHOST_NET_MAX_SG];
+	struct iovec iov[VHOST_NET_MAX_SG+1]; /* an extra for vnet hdr */
 	struct iovec heads[VHOST_NET_MAX_SG];
-	size_t hdr_size;
+	size_t guest_hlen, sock_hlen;
 	/* We use a kind of RCU to access private pointer.
 	 * All readers access it from workqueue, which makes it possible to
 	 * flush the workqueue instead of synchronize_rcu. Therefore readers do
