Hi,

> Subject: [Qemu-devel] [PATCH 07/10] virtio: combine the read of a descriptor
>
> Compared to vring, virtio has a performance penalty of 10%.  Fix it
> by combining all the reads for a descriptor in a single address_space_read
> call.  This also simplifies the code nicely.
>
> Reviewed-by: Cornelia Huck <cornelia.h...@de.ibm.com>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
>  hw/virtio/virtio.c | 86 ++++++++++++++++++++++--------------------------
>  1 file changed, 35 insertions(+), 51 deletions(-)
>
Unbelievable! After applying this patch, the virtio-crypto speed can reach
74 MiB/sec, and the host CPU overhead is 180% (the main thread 100% and the
vcpu threads 80%).

Testing AES-128-CBC cipher:
 Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.12 MiB/sec (1523475 packets)
 Encrypting in chunks of 256 bytes: done. 369.85 MiB in 5.01 secs: 73.88 MiB/sec (1514900 packets)
 Encrypting in chunks of 256 bytes: done. 371.07 MiB in 5.02 secs: 73.97 MiB/sec (1519914 packets)
 Encrypting in chunks of 256 bytes: done. 371.66 MiB in 5.02 secs: 74.09 MiB/sec (1522309 packets)
 Encrypting in chunks of 256 bytes: done. 371.79 MiB in 5.02 secs: 74.12 MiB/sec (1522868 packets)
 Encrypting in chunks of 256 bytes: done. 371.94 MiB in 5.02 secs: 74.15 MiB/sec (1523457 packets)
 Encrypting in chunks of 256 bytes: done. 371.90 MiB in 5.02 secs: 74.14 MiB/sec (1523317 packets)
 Encrypting in chunks of 256 bytes: done. 371.71 MiB in 5.02 secs: 74.10 MiB/sec (1522522 packets)

A perf profile of the host during the test:

 15.95%  qemu-kvm            [.] address_space_translate
  6.98%  qemu-kvm            [.] qemu_get_ram_ptr
  4.87%  libpthread-2.19.so  [.] __pthread_mutex_unlock_usercnt
  4.40%  qemu-kvm            [.] qemu_ram_addr_from_host
  3.79%  qemu-kvm            [.] address_space_map
  3.41%  libc-2.19.so        [.] _int_malloc
  3.29%  libc-2.19.so        [.] _int_free
  3.07%  libc-2.19.so        [.] malloc
  2.95%  libpthread-2.19.so  [.] pthread_mutex_lock
  2.94%  qemu-kvm            [.] phys_page_find
  2.73%  qemu-kvm            [.] address_space_translate_internal
  2.65%  libc-2.19.so        [.] malloc_consolidate
  2.35%  libc-2.19.so        [.] __memcpy_sse2_unaligned
  1.72%  qemu-kvm            [.] find_next_zero_bit
  1.38%  qemu-kvm            [.] address_space_rw
  1.34%  qemu-kvm            [.] object_unref
  1.30%  qemu-kvm            [.] object_ref
  1.28%  qemu-kvm            [.] virtqueue_pop
  1.20%  libc-2.19.so        [.] memset
  1.11%  qemu-kvm            [.] virtio_notify
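The gain makes sense: the old code issued four separate virtio_ld*_phys()
loads per descriptor, each doing its own guest-memory access, while the new
vring_desc_read() fetches the whole 16-byte descriptor with one
address_space_read() and byteswaps the fields in place. For anyone reading
along, here is a minimal standalone sketch of that pattern -- toy code, not
QEMU's: the struct layout follows the virtio spec, memcpy() over a local
buffer stands in for address_space_read(), and le*toh() stands in for the
virtio_tswap*() helpers:

#define _DEFAULT_SOURCE
#include <endian.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct vring_desc {     /* descriptor layout per the virtio spec */
    uint64_t addr;      /* guest-physical address of the buffer */
    uint32_t len;       /* buffer length in bytes */
    uint16_t flags;     /* VRING_DESC_F_NEXT / _WRITE / _INDIRECT */
    uint16_t next;      /* index of the chained descriptor, if any */
};

/* One bulk copy for the whole 16-byte descriptor, then in-place
 * endian fixups -- instead of four separately translated loads. */
static void desc_read(struct vring_desc *desc,
                      const uint8_t *table, unsigned int i)
{
    memcpy(desc, table + i * sizeof(*desc), sizeof(*desc));
    desc->addr  = le64toh(desc->addr);
    desc->len   = le32toh(desc->len);
    desc->flags = le16toh(desc->flags);
    desc->next  = le16toh(desc->next);
}

int main(void)
{
    /* A fake one-entry descriptor table in little-endian byte order:
     * addr = 0x1000, len = 256, flags = 1 (NEXT), next = 2. */
    const uint8_t table[16] = {
        0x00, 0x10, 0, 0, 0, 0, 0, 0,   /* addr  */
        0x00, 0x01, 0, 0,               /* len   */
        0x01, 0,                        /* flags */
        0x02, 0,                        /* next  */
    };
    struct vring_desc desc;

    desc_read(&desc, table, 0);
    printf("addr=0x%llx len=%u flags=%u next=%u\n",
           (unsigned long long)desc.addr, desc.len, desc.flags, desc.next);
    return 0;
}

One guest-memory access per descriptor instead of four is where the saving
comes from; the in-place byteswaps are cheap by comparison.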
Thank you so much!

Regards,
-Gonglei

> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 79a635f..2433866 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -107,35 +107,15 @@ void virtio_queue_update_rings(VirtIODevice *vdev, int n)
>                        vring->align);
>  }
>  
> -static inline uint64_t vring_desc_addr(VirtIODevice *vdev, hwaddr desc_pa,
> -                                       int i)
> +static void vring_desc_read(VirtIODevice *vdev, VRingDesc *desc,
> +                            hwaddr desc_pa, int i)
>  {
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, addr);
> -    return virtio_ldq_phys(vdev, pa);
> -}
> -
> -static inline uint32_t vring_desc_len(VirtIODevice *vdev, hwaddr desc_pa, int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, len);
> -    return virtio_ldl_phys(vdev, pa);
> -}
> -
> -static inline uint16_t vring_desc_flags(VirtIODevice *vdev, hwaddr desc_pa,
> -                                        int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, flags);
> -    return virtio_lduw_phys(vdev, pa);
> -}
> -
> -static inline uint16_t vring_desc_next(VirtIODevice *vdev, hwaddr desc_pa,
> -                                       int i)
> -{
> -    hwaddr pa;
> -    pa = desc_pa + sizeof(VRingDesc) * i + offsetof(VRingDesc, next);
> -    return virtio_lduw_phys(vdev, pa);
> +    address_space_read(&address_space_memory, desc_pa + i * sizeof(VRingDesc),
> +                       MEMTXATTRS_UNSPECIFIED, (void *)desc, sizeof(VRingDesc));
> +    virtio_tswap64s(vdev, &desc->addr);
> +    virtio_tswap32s(vdev, &desc->len);
> +    virtio_tswap16s(vdev, &desc->flags);
> +    virtio_tswap16s(vdev, &desc->next);
>  }
>  
>  static inline uint16_t vring_avail_flags(VirtQueue *vq)
> @@ -345,18 +325,18 @@ static unsigned int virtqueue_get_head(VirtQueue *vq, unsigned int idx)
>      return head;
>  }
>  
> -static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
> -                                    unsigned int i, unsigned int max)
> +static unsigned virtqueue_read_next_desc(VirtIODevice *vdev, VRingDesc *desc,
> +                                         hwaddr desc_pa, unsigned int max)
>  {
>      unsigned int next;
>  
>      /* If this descriptor says it doesn't chain, we're done. */
> -    if (!(vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_NEXT)) {
> +    if (!(desc->flags & VRING_DESC_F_NEXT)) {
>          return max;
>      }
>  
>      /* Check they're not leading us off end of descriptors. */
> -    next = vring_desc_next(vdev, desc_pa, i);
> +    next = desc->next;
>      /* Make sure compiler knows to grab that: we don't want it changing! */
>      smp_wmb();
>  
> @@ -365,6 +345,7 @@ static unsigned virtqueue_next_desc(VirtIODevice *vdev, hwaddr desc_pa,
>          exit(1);
>      }
>  
> +    vring_desc_read(vdev, desc, desc_pa, next);
>      return next;
>  }
>  
> @@ -381,6 +362,7 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>      while (virtqueue_num_heads(vq, idx)) {
>          VirtIODevice *vdev = vq->vdev;
>          unsigned int max, num_bufs, indirect = 0;
> +        VRingDesc desc;
>          hwaddr desc_pa;
>          int i;
>  
> @@ -388,9 +370,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>          num_bufs = total_bufs;
>          i = virtqueue_get_head(vq, idx++);
>          desc_pa = vq->vring.desc;
> +        vring_desc_read(vdev, &desc, desc_pa, i);
>  
> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> -            if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> +        if (desc.flags & VRING_DESC_F_INDIRECT) {
> +            if (desc.len % sizeof(VRingDesc)) {
>                  error_report("Invalid size for indirect buffer table");
>                  exit(1);
>              }
> @@ -403,9 +386,10 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>  
>              /* loop over the indirect descriptor table */
>              indirect = 1;
> -            max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> -            desc_pa = vring_desc_addr(vdev, desc_pa, i);
> +            max = desc.len / sizeof(VRingDesc);
> +            desc_pa = desc.addr;
>              num_bufs = i = 0;
> +            vring_desc_read(vdev, &desc, desc_pa, i);
>          }
>  
>          do {
> @@ -415,15 +399,15 @@ void virtqueue_get_avail_bytes(VirtQueue *vq, unsigned int *in_bytes,
>                  exit(1);
>              }
>  
> -            if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
> -                in_total += vring_desc_len(vdev, desc_pa, i);
> +            if (desc.flags & VRING_DESC_F_WRITE) {
> +                in_total += desc.len;
>              } else {
> -                out_total += vring_desc_len(vdev, desc_pa, i);
> +                out_total += desc.len;
>              }
>              if (in_total >= max_in_bytes && out_total >= max_out_bytes) {
>                  goto done;
>              }
> -        } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> +        } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
>  
>          if (!indirect)
>              total_bufs = num_bufs;
> @@ -545,6 +529,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>      unsigned out_num, in_num;
>      hwaddr addr[VIRTQUEUE_MAX_SIZE];
>      struct iovec iov[VIRTQUEUE_MAX_SIZE];
> +    VRingDesc desc;
>  
>      if (!virtqueue_num_heads(vq, vq->last_avail_idx)) {
>          return NULL;
>      }
> @@ -560,33 +545,32 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>          vring_set_avail_event(vq, vq->last_avail_idx);
>      }
>  
> -    if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_INDIRECT) {
> -        if (vring_desc_len(vdev, desc_pa, i) % sizeof(VRingDesc)) {
> +    vring_desc_read(vdev, &desc, desc_pa, i);
> +    if (desc.flags & VRING_DESC_F_INDIRECT) {
> +        if (desc.len % sizeof(VRingDesc)) {
>              error_report("Invalid size for indirect buffer table");
>              exit(1);
>          }
>  
>          /* loop over the indirect descriptor table */
> -        max = vring_desc_len(vdev, desc_pa, i) / sizeof(VRingDesc);
> -        desc_pa = vring_desc_addr(vdev, desc_pa, i);
> +        max = desc.len / sizeof(VRingDesc);
> +        desc_pa = desc.addr;
>          i = 0;
> +        vring_desc_read(vdev, &desc, desc_pa, i);
>      }
>  
>      /* Collect all the descriptors */
>      do {
> -        hwaddr pa = vring_desc_addr(vdev, desc_pa, i);
> -        size_t len = vring_desc_len(vdev, desc_pa, i);
> -
> -        if (vring_desc_flags(vdev, desc_pa, i) & VRING_DESC_F_WRITE) {
> +        if (desc.flags & VRING_DESC_F_WRITE) {
>              virtqueue_map_desc(&in_num, addr + out_num, iov + out_num,
> -                               VIRTQUEUE_MAX_SIZE - out_num, true, pa, len);
> +                               VIRTQUEUE_MAX_SIZE - out_num, true, desc.addr, desc.len);
>          } else {
>              if (in_num) {
>                  error_report("Incorrect order for descriptors");
>                  exit(1);
>              }
>              virtqueue_map_desc(&out_num, addr, iov,
> -                               VIRTQUEUE_MAX_SIZE, false, pa, len);
> +                               VIRTQUEUE_MAX_SIZE, false, desc.addr, desc.len);
>          }
>  
>          /* If we've got too many, that implies a descriptor loop. */
> @@ -594,7 +578,7 @@ void *virtqueue_pop(VirtQueue *vq, size_t sz)
>              error_report("Looped descriptor");
>              exit(1);
>          }
> -    } while ((i = virtqueue_next_desc(vdev, desc_pa, i, max)) != max);
> +    } while ((i = virtqueue_read_next_desc(vdev, &desc, desc_pa, max)) != max);
>  
>      /* Now copy what we have collected and mapped */
>      elem = virtqueue_alloc_element(sz, out_num, in_num);
> -- 
> 2.5.0