This patch introduces a PCI DMA API and some generic code to support other DMA APIs. It introduces an IOVector type that contains physical address/length pairs. These vectors can be translated by the PCI layer and passed either to generic copying functions or directly to the block or network subsystems.
This enables zero-copy IO to be performed without introducing assumptions about phys_ram_base. This API is at the PCI device level to enable support of per-device IOMMU remapping. Since v1, I've renamed PhysIOVector to IOVector and removed the concept of a mapped vector. I've added comments and provided an API for using IOVectors with the network and block layers. It's not optimized at the moment as enabling true zero-copy will require more patches at a later time. Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]> diff --git a/Makefile.target b/Makefile.target index 5ac29a7..94f3e58 100644 --- a/Makefile.target +++ b/Makefile.target @@ -173,7 +173,7 @@ all: $(PROGS) ######################################################### # cpu emulator library LIBOBJS=exec.o kqemu.o translate-all.o cpu-exec.o\ - translate.o host-utils.o + translate.o host-utils.o iovector.o ifndef CONFIG_NO_DYNGEN_OP LIBOBJS+=op.o endif diff --git a/block.c b/block.c index 0730954..58cb6cc 100644 --- a/block.c +++ b/block.c @@ -570,6 +570,51 @@ int bdrv_write(BlockDriverState *bs, int64_t sector_num, } } +#ifndef QEMU_IMG +int bdrv_readv(BlockDriverState *bs, int64_t sector_num, + IOVector *iovec) +{ + char *buffer; + size_t size; + int ret; + + size = iovector_size(iovec); + buffer = qemu_malloc(size); + if (buffer == NULL) + return -ENOMEM; + + ret = bdrv_read(bs, sector_num, buffer, size / 512); + + if (ret >= 0) + memcpy_to_iovector(iovec, 0, size, buffer); + + qemu_free(buffer); + + return ret; +} + +int bdrv_writev(BlockDriverState *bs, int64_t sector_num, + const IOVector *iovec) +{ + char *buffer; + size_t size; + int ret; + + size = iovector_size(iovec); + buffer = qemu_malloc(size); + if (buffer == NULL) + return -ENOMEM; + + memcpy_from_iovector(buffer, 0, size, iovec); + + ret = bdrv_write(bs, sector_num, buffer, size / 512); + + qemu_free(buffer); + + return ret; +} +#endif + static int bdrv_pread_em(BlockDriverState *bs, int64_t offset, uint8_t *buf, int count1) { diff 
--git a/block.h b/block.h index b730505..9d30db2 100644 --- a/block.h +++ b/block.h @@ -1,6 +1,8 @@ #ifndef BLOCK_H #define BLOCK_H +#include "iovector.h" + /* block.c */ typedef struct BlockDriver BlockDriver; @@ -67,6 +69,9 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, int nb_sectors); int bdrv_write(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors); +int bdrv_readv(BlockDriverState *bs, int64_t sector_num, IOVector *iovec); +int bdrv_writev(BlockDriverState *bs, int64_t sector_num, + const IOVector *iovec); int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int count); int bdrv_pwrite(BlockDriverState *bs, int64_t offset, diff --git a/cpu-all.h b/cpu-all.h index 9e5d33b..3cbc718 100644 --- a/cpu-all.h +++ b/cpu-all.h @@ -835,6 +835,7 @@ void cpu_register_physical_memory(target_phys_addr_t start_addr, unsigned long size, unsigned long phys_offset); ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr); +void *cpu_map_physical_page(target_phys_addr_t addr); ram_addr_t qemu_ram_alloc(unsigned int size); void qemu_ram_free(ram_addr_t addr); int cpu_register_io_memory(int io_index, diff --git a/exec.c b/exec.c index c25872d..b2d2af4 100644 --- a/exec.c +++ b/exec.c @@ -2085,6 +2085,21 @@ ram_addr_t cpu_get_physical_page_desc(target_phys_addr_t addr) return p->phys_offset; } +void *cpu_map_physical_page(target_phys_addr_t addr) +{ + ram_addr_t phys_offset; + + /* DMA'ing to MMIO, just skip */ + phys_offset = cpu_get_physical_page_desc(addr); + if ((phys_offset & ~TARGET_PAGE_MASK) != IO_MEM_RAM) + return NULL; + + phys_offset &= TARGET_PAGE_MASK; + phys_offset += addr & ~TARGET_PAGE_MASK; + + return phys_ram_base + phys_offset; +} + /* XXX: better than nothing */ ram_addr_t qemu_ram_alloc(unsigned int size) { diff --git a/hw/pci.c b/hw/pci.c index bc55989..c09b5f8 100644 --- a/hw/pci.c +++ b/hw/pci.c @@ -145,6 +145,34 @@ int pci_device_load(PCIDevice *s, QEMUFile *f) return 0; } +/* Return a 
translated IOVector suitable for DMA. At the moment, we perform + * no translation. */ +IOVector *pci_device_dma_map(PCIDevice *s, const IOVector *iovec) +{ + return (IOVector *)iovec; +} + +/* Unmap a translated IOVector and update dirty bits if necessary. */ +void pci_device_dma_unmap(PCIDevice *s, const IOVector *orig, + IOVector *mapped, int write) +{ + int i; + + if (!write) + return; + + /* mark memory as dirty if necessary */ + for (i = 0; i < orig->num; i++) { + size_t offset; + + for (offset = 0; + offset < orig->sg[i].len; + offset += TARGET_PAGE_SIZE) { + cpu_physical_memory_set_dirty(orig->sg[i].base + offset); + } + } +} + /* -1 for devfn means auto assign */ PCIDevice *pci_register_device(PCIBus *bus, const char *name, int instance_size, int devfn, diff --git a/hw/pci.h b/hw/pci.h index e870987..b86d8cb 100644 --- a/hw/pci.h +++ b/hw/pci.h @@ -4,6 +4,8 @@ /* PCI includes legacy ISA access. */ #include "isa.h" +#include "iovector.h" + /* PCI bus */ extern target_phys_addr_t pci_mem_base; @@ -81,6 +83,10 @@ void pci_default_write_config(PCIDevice *d, void pci_device_save(PCIDevice *s, QEMUFile *f); int pci_device_load(PCIDevice *s, QEMUFile *f); +IOVector *pci_device_dma_map(PCIDevice *s, const IOVector *iovec); +void pci_device_dma_unmap(PCIDevice *s, const IOVector *orig, + IOVector *mapped, int write); + typedef void (*pci_set_irq_fn)(qemu_irq *pic, int irq_num, int level); typedef int (*pci_map_irq_fn)(PCIDevice *pci_dev, int irq_num); PCIBus *pci_register_bus(pci_set_irq_fn set_irq, pci_map_irq_fn map_irq, diff --git a/iovector.c b/iovector.c new file mode 100644 index 0000000..7002656 --- /dev/null +++ b/iovector.c @@ -0,0 +1,137 @@ +/* + * IO Vectors + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <[EMAIL PROTECTED]> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ * + */ + +#include "qemu-common.h" +#include "iovector.h" + +static size_t iovector_rw(void *buffer, size_t size, IOVector *iov, int read) +{ + uint8_t *ptr = buffer; + size_t offset = 0; + int i; + + for (i = 0; i < iov->num; i++) { + size_t len; + void *addr; + + len = MIN(iov->sg[i].len, size - offset); + + addr = cpu_map_physical_page(iov->sg[i].base); + + if (read) + memcpy(ptr + offset, addr, len); + else + memcpy(addr, ptr + offset, len); + + offset += len; + } + + return offset; +} + +size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size, + const IOVector *iov) +{ + IOVector *sg; + size_t len; + + if (offset) + sg = iovector_trim(iov, offset, size); + else + sg = (IOVector *)iov; + + len = iovector_rw(buffer, size, sg, 1); + + if (offset) + qemu_free(sg); + + return len; +} + +size_t memcpy_to_iovector(IOVector *iovec, size_t offset, size_t size, + const void *buffer) +{ + IOVector *sg; + size_t len; + + if (offset) + sg = iovector_trim(iovec, offset, size); + else + sg = iovec; + + len = iovector_rw((void *)buffer, size, sg, 0); + + if (offset) + qemu_free(sg); + + return len; +} + +IOVector *iovector_new(int num) +{ + IOVector *ret; + + ret = qemu_malloc(sizeof(IOVector) + sizeof(IOVectorElement) * num); + if (ret == NULL) + return NULL; + + ret->num = num; + + return ret; +} + +IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size) +{ + IOVector *ret; + size_t off, total_size; + int i; + + ret = iovector_new(iov->num); + if (ret == NULL) + return NULL; + + total_size = 0; + ret->num = 0; + off = 0; + for (i = 0; i < iov->num; i++) { + if (off >= offset || offset < (off + iov->sg[i].len)) { + size_t fudge = 0; + if (off < offset) + fudge = offset - off; + + ret->sg[ret->num].base = iov->sg[i].base + fudge; + ret->sg[ret->num].len = MIN(iov->sg[i].len - fudge, + size - total_size); + total_size += ret->sg[ret->num].len; + ret->num++; + + if (total_size == size) + break; + } + + off += iov->sg[i].len; + } + + return ret; 
+} + +size_t iovector_size(const IOVector *iov) +{ + size_t size = 0; + int i; + + for (i = 0; i < iov->num; i++) + size += iov->sg[i].len; + + return size; +} diff --git a/iovector.h b/iovector.h new file mode 100644 index 0000000..fac7236 --- /dev/null +++ b/iovector.h @@ -0,0 +1,49 @@ +/* + * IO Vectors + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <[EMAIL PROTECTED]> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + */ + +#ifndef _QEMU_IOVECTOR_H +#define _QEMU_IOVECTOR_H + +typedef struct IOVectorElement IOVectorElement; + +typedef struct IOVector +{ + int num; + struct IOVectorElement { + uint64_t base; + size_t len; + } sg[0]; +} IOVector; + +/* Copy from an IOVector to a flat buffer. Be careful to pass in a fully + * translated IOVector here. */ +size_t memcpy_from_iovector(void *buffer, size_t offset, size_t size, + const IOVector *iov); + +/* Copy to an IOVector from a flat buffer. Be careful to pass in a fully + * translated IOVector here. */ +size_t memcpy_to_iovector(IOVector *iovec, size_t offset, size_t size, + const void *buffer); + +/* Return a new IOVector that's a subset of the passed in IOVector. It should + * be freed with qemu_free when you are done with it. */ +IOVector *iovector_trim(const IOVector *iov, size_t offset, size_t size); + +/* Returns the size of an IOVector in bytes */ +size_t iovector_size(const IOVector *iov); + +/* Returns a new IOVector with num elements. 
iov->num will be set to num on + * return */ +IOVector *iovector_new(int num); + +#endif diff --git a/net.h b/net.h index 2dfff8d..0b3a155 100644 --- a/net.h +++ b/net.h @@ -1,6 +1,8 @@ #ifndef QEMU_NET_H #define QEMU_NET_H +#include "iovector.h" + /* VLANs support */ typedef struct VLANClientState VLANClientState; @@ -30,6 +32,7 @@ VLANClientState *qemu_new_vlan_client(VLANState *vlan, void *opaque); int qemu_can_send_packet(VLANClientState *vc); void qemu_send_packet(VLANClientState *vc, const uint8_t *buf, int size); +void qemu_sendv_packet(VLANClientState *vc, const IOVector *iovec); void qemu_handler_true(void *opaque); void do_info_network(void); diff --git a/vl.c b/vl.c index 61eb191..342ef79 100644 --- a/vl.c +++ b/vl.c @@ -3731,6 +3731,22 @@ void qemu_send_packet(VLANClientState *vc1, const uint8_t *buf, int size) } } +void qemu_sendv_packet(VLANClientState *vc, const IOVector *iovec) +{ + size_t size; + uint8_t *data; + + size = iovector_size(iovec); + data = qemu_malloc(size); + if (data == NULL) + return; + + memcpy_from_iovector(data, 0, size, iovec); + qemu_send_packet(vc, data, size); + + qemu_free(data); +} + #if defined(CONFIG_SLIRP) /* slirp network adapter */ ------------------------------------------------------------------------- This SF.net email is sponsored by the 2008 JavaOne(SM) Conference Register now and save $200. Hurry, offer ends at 11:59 p.m., Monday, April 7! Use priority code J8TLD2. http://ad.doubleclick.net/clk;198757673;13503038;p?http://java.sun.com/javaone _______________________________________________ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel