dmabuf is a Linux kernel feature that allows DMA buffer sharing
between two drivers. Common examples of usage are streaming video
devices and NIC-to-GPU transfers. Prior to dmabuf, users had to load
proprietary drivers to expose the DMA mappings; with dmabuf, those
proprietary drivers are no longer required.

A new API function, rte_extmem_register_dmabuf, is introduced to create
the mapping from a dmabuf file descriptor. A dmabuf is referenced by a
file descriptor that has been pre-opened with the kernel, plus an
offset; the kernel uses the file descriptor to map the buffer to a VA
pointer. To avoid ABI changes, a static struct is kept inside
eal_common_memory.c, and lookups are done on this struct rather than on
the rte_memseg_list.
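
For illustration, a minimal usage sketch from the application side.
How the dmabuf fd is obtained depends on the exporting driver;
gpu_export_dmabuf() and gpu_dev below are placeholders for that step,
and error handling is abbreviated:

    size_t page_sz = 4096;
    size_t len = 64 * page_sz;
    uint64_t dmabuf_offset = 0;

    /* Placeholder: the exporting driver provides the dmabuf fd. */
    int fd = gpu_export_dmabuf(gpu_dev, &dmabuf_offset);

    /* Map the dmabuf into this process's address space. */
    void *va = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (va == MAP_FAILED)
        return -1;

    /* Register with DPDK; IOVAs are unknown here, so pass NULL. */
    if (rte_extmem_register_dmabuf(va, len, fd, dmabuf_offset,
            NULL, 0, page_sz) != 0)
        return -1;

    /* DMA mapping remains the caller's responsibility, e.g. via
     * rte_dev_dma_map(dev, va, iova, len).
     */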

Ideally we would like to add both the dmabuf file descriptor and offset
to rte_memseg_list, but it's not clear whether existing fields can be
reused when the dmabuf API is in use.

Alternatively, we could rename the "external" flag to a more generic
"properties" field, where "external" is the lowest bit; the second bit
would then indicate the presence of dmabuf. When the dmabuf bit is set,
the base_va address field could be reused for the dmabuf offset, and
the socket_id field for the file descriptor, as in the sketch below.
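
A sketch of that encoding (macro and field names are illustrative
only, not part of this patch):

    /* msl->external (single flag) becomes msl->properties (bit field) */
    #define RTE_MSL_PROP_EXTERNAL (1u << 0)
    #define RTE_MSL_PROP_DMABUF   (1u << 1)

    if (msl->properties & RTE_MSL_PROP_DMABUF) {
        int fd = (int)msl->socket_id;                     /* dmabuf fd */
        uint64_t off = (uint64_t)(uintptr_t)msl->base_va; /* dmabuf offset */
        /* ... hand fd/off to the driver ... */
    }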

Which option is preferred?

Signed-off-by: Cliff Burdick <[email protected]>
---
 .mailmap                           |   1 +
 lib/eal/common/eal_common_memory.c | 168 +++++++++++++++++++++++++++++
 lib/eal/common/eal_memalloc.h      |  21 ++++
 lib/eal/common/malloc_heap.c       |  27 +++++
 lib/eal/common/malloc_heap.h       |   5 +
 lib/eal/include/rte_memory.h       | 125 +++++++++++++++++++++
 6 files changed, 347 insertions(+)

diff --git a/.mailmap b/.mailmap
index 2f089326ff..4c2b2f921d 100644
--- a/.mailmap
+++ b/.mailmap
@@ -291,6 +291,7 @@ Cian Ferriter <[email protected]>
 Ciara Loftus <[email protected]>
 Ciara Power <[email protected]>
 Claire Murphy <[email protected]>
+Cliff Burdick <[email protected]>
 Clemens Famulla-Conrad <[email protected]>
 Cody Doucette <[email protected]>
 Congwen Zhang <[email protected]>
diff --git a/lib/eal/common/eal_common_memory.c b/lib/eal/common/eal_common_memory.c
index c62edf5e55..304ed18396 100644
--- a/lib/eal/common/eal_common_memory.c
+++ b/lib/eal/common/eal_common_memory.c
@@ -45,6 +45,18 @@
 static void *next_baseaddr;
 static uint64_t system_page_sz;
 
+/* Internal storage for dmabuf info, indexed by memseg list index.
+ * This keeps dmabuf metadata out of the public rte_memseg_list structure
+ * to preserve ABI compatibility.
+ */
+static struct {
+       int fd;          /**< dmabuf fd, -1 if not dmabuf backed */
+       uint64_t offset; /**< offset within dmabuf */
+} dmabuf_info[RTE_MAX_MEMSEG_LISTS] = {
+       [0 ... RTE_MAX_MEMSEG_LISTS - 1] = { .fd = -1, .offset = 0 }
+};
+
+
 #define MAX_MMAP_WITH_DEFINED_ADDR_TRIES 5
 void *
 eal_get_virtual_area(void *requested_addr, size_t *size,
@@ -930,6 +942,109 @@ rte_memseg_get_fd_offset(const struct rte_memseg *ms, size_t *offset)
        return ret;
 }
 
+/* Internal dmabuf info functions */
+int
+eal_memseg_list_set_dmabuf_info(int list_idx, int fd, uint64_t offset)
+{
+       if (list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS)
+               return -EINVAL;
+
+       dmabuf_info[list_idx].fd = fd;
+       dmabuf_info[list_idx].offset = offset;
+       return 0;
+}
+
+int
+eal_memseg_list_get_dmabuf_fd(int list_idx)
+{
+       if (list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS)
+               return -EINVAL;
+
+       return dmabuf_info[list_idx].fd;
+}
+
+int
+eal_memseg_list_get_dmabuf_offset(int list_idx, uint64_t *offset)
+{
+       if (list_idx < 0 || list_idx >= RTE_MAX_MEMSEG_LISTS || offset == NULL)
+               return -EINVAL;
+
+       *offset = dmabuf_info[list_idx].offset;
+       return 0;
+}
+
+/* Public dmabuf info API functions */
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_fd_thread_unsafe)
+int
+rte_memseg_list_get_dmabuf_fd_thread_unsafe(const struct rte_memseg_list *msl)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int msl_idx;
+
+       if (msl == NULL) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       msl_idx = msl - mcfg->memsegs;
+       if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       return dmabuf_info[msl_idx].fd;
+}
+
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_fd)
+int
+rte_memseg_list_get_dmabuf_fd(const struct rte_memseg_list *msl)
+{
+       int ret;
+
+       rte_mcfg_mem_read_lock();
+       ret = rte_memseg_list_get_dmabuf_fd_thread_unsafe(msl);
+       rte_mcfg_mem_read_unlock();
+
+       return ret;
+}
+
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_offset_thread_unsafe)
+int
+rte_memseg_list_get_dmabuf_offset_thread_unsafe(const struct rte_memseg_list *msl,
+               uint64_t *offset)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       int msl_idx;
+
+       if (msl == NULL || offset == NULL) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       msl_idx = msl - mcfg->memsegs;
+       if (msl_idx < 0 || msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+
+       *offset = dmabuf_info[msl_idx].offset;
+       return 0;
+}
+
+RTE_EXPORT_SYMBOL(rte_memseg_list_get_dmabuf_offset)
+int
+rte_memseg_list_get_dmabuf_offset(const struct rte_memseg_list *msl,
+               uint64_t *offset)
+{
+       int ret;
+
+       rte_mcfg_mem_read_lock();
+       ret = rte_memseg_list_get_dmabuf_offset_thread_unsafe(msl, offset);
+       rte_mcfg_mem_read_unlock();
+
+       return ret;
+}
+
 RTE_EXPORT_SYMBOL(rte_extmem_register)
 int
 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
@@ -980,6 +1095,59 @@ rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
        return ret;
 }
 
+RTE_EXPORT_SYMBOL(rte_extmem_register_dmabuf)
+int
+rte_extmem_register_dmabuf(void *va_addr, size_t len,
+               int dmabuf_fd, uint64_t dmabuf_offset,
+               rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       unsigned int socket_id, n;
+       int ret = 0;
+
+       if (va_addr == NULL || page_sz == 0 || len == 0 ||
+                       !rte_is_power_of_2(page_sz) ||
+                       RTE_ALIGN(len, page_sz) != len ||
+                       ((len / page_sz) != n_pages && iova_addrs != NULL) ||
+                       !rte_is_aligned(va_addr, page_sz) ||
+                       dmabuf_fd < 0) {
+               rte_errno = EINVAL;
+               return -1;
+       }
+       rte_mcfg_mem_write_lock();
+
+       /* make sure the segment doesn't already exist */
+       if (malloc_heap_find_external_seg(va_addr, len) != NULL) {
+               rte_errno = EEXIST;
+               ret = -1;
+               goto unlock;
+       }
+
+       /* get next available socket ID */
+       socket_id = mcfg->next_socket_id;
+       if (socket_id > INT32_MAX) {
+               EAL_LOG(ERR, "Cannot assign new socket ID's");
+               rte_errno = ENOSPC;
+               ret = -1;
+               goto unlock;
+       }
+
+       /* we can create a new memseg with dma-buf info */
+       n = len / page_sz;
+       if (malloc_heap_create_external_seg_dmabuf(va_addr, iova_addrs, n,
+                       page_sz, "extmem_dmabuf", socket_id,
+                       dmabuf_fd, dmabuf_offset) == NULL) {
+               ret = -1;
+               goto unlock;
+       }
+
+       /* memseg list successfully created - increment next socket ID */
+       mcfg->next_socket_id++;
+unlock:
+       rte_mcfg_mem_write_unlock();
+       return ret;
+}
+
 RTE_EXPORT_SYMBOL(rte_extmem_unregister)
 int
 rte_extmem_unregister(void *va_addr, size_t len)
diff --git a/lib/eal/common/eal_memalloc.h b/lib/eal/common/eal_memalloc.h
index 0c267066d9..bb2cfa0717 100644
--- a/lib/eal/common/eal_memalloc.h
+++ b/lib/eal/common/eal_memalloc.h
@@ -90,6 +90,27 @@ eal_memalloc_set_seg_list_fd(int list_idx, int fd);
 int
 eal_memalloc_get_seg_fd_offset(int list_idx, int seg_idx, size_t *offset);
 
+/*
+ * Set dmabuf info for a memseg list.
+ * Returns 0 on success, -errno on failure.
+ */
+int
+eal_memseg_list_set_dmabuf_info(int list_idx, int fd, uint64_t offset);
+
+/*
+ * Get dmabuf fd for a memseg list.
+ * Returns fd (>= 0) on success, -1 if not dmabuf backed, -errno on error.
+ */
+int
+eal_memseg_list_get_dmabuf_fd(int list_idx);
+
+/*
+ * Get dmabuf offset for a memseg list.
+ * Returns 0 on success, -errno on failure.
+ */
+int
+eal_memseg_list_get_dmabuf_offset(int list_idx, uint64_t *offset);
+
 int
 eal_memalloc_init(void)
        __rte_requires_shared_capability(rte_mcfg_mem_get_lock());
diff --git a/lib/eal/common/malloc_heap.c b/lib/eal/common/malloc_heap.c
index 39240c261c..fd0376d13b 100644
--- a/lib/eal/common/malloc_heap.c
+++ b/lib/eal/common/malloc_heap.c
@@ -1232,6 +1232,33 @@ malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
        msl->version = 0;
        msl->external = 1;
 
+       /* initialize dmabuf info to "not dmabuf backed" */
+       eal_memseg_list_set_dmabuf_info(i, -1, 0);
+
+       return msl;
+}
+
+struct rte_memseg_list *
+malloc_heap_create_external_seg_dmabuf(void *va_addr, rte_iova_t iova_addrs[],
+               unsigned int n_pages, size_t page_sz, const char *seg_name,
+               unsigned int socket_id, int dmabuf_fd, uint64_t dmabuf_offset)
+{
+       struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+       struct rte_memseg_list *msl;
+       int msl_idx;
+
+       /* Create the base external segment */
+       msl = malloc_heap_create_external_seg(va_addr, iova_addrs, n_pages,
+                       page_sz, seg_name, socket_id);
+       if (msl == NULL)
+               return NULL;
+
+       /* Get memseg list index */
+       msl_idx = msl - mcfg->memsegs;
+
+       /* Set dma-buf info in the internal side-table */
+       eal_memseg_list_set_dmabuf_info(msl_idx, dmabuf_fd, dmabuf_offset);
+
        return msl;
 }
 
diff --git a/lib/eal/common/malloc_heap.h b/lib/eal/common/malloc_heap.h
index dfc56d4ae3..87525d1a68 100644
--- a/lib/eal/common/malloc_heap.h
+++ b/lib/eal/common/malloc_heap.h
@@ -51,6 +51,11 @@ malloc_heap_create_external_seg(void *va_addr, rte_iova_t iova_addrs[],
                unsigned int n_pages, size_t page_sz, const char *seg_name,
                unsigned int socket_id);
 
+struct rte_memseg_list *
+malloc_heap_create_external_seg_dmabuf(void *va_addr, rte_iova_t iova_addrs[],
+               unsigned int n_pages, size_t page_sz, const char *seg_name,
+               unsigned int socket_id, int dmabuf_fd, uint64_t dmabuf_offset);
+
 struct rte_memseg_list *
 malloc_heap_find_external_seg(void *va_addr, size_t len);
 
diff --git a/lib/eal/include/rte_memory.h b/lib/eal/include/rte_memory.h
index b6e97ad695..d1c2fc8aa5 100644
--- a/lib/eal/include/rte_memory.h
+++ b/lib/eal/include/rte_memory.h
@@ -405,6 +405,82 @@ int
 rte_memseg_get_fd_offset_thread_unsafe(const struct rte_memseg *ms,
                size_t *offset);
 
+/**
+ * Get dma-buf file descriptor associated with a memseg list.
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ *       be used within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf fd.
+ *
+ * @return
+ *   Valid dma-buf file descriptor (>= 0) in case of success.
+ *   -1 if not dma-buf backed or in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ */
+int
+rte_memseg_list_get_dmabuf_fd(const struct rte_memseg_list *msl);
+
+/**
+ * Get dma-buf file descriptor associated with a memseg list.
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ *       from within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf fd.
+ *
+ * @return
+ *   Valid dma-buf file descriptor (>= 0) in case of success.
+ *   -1 if not dma-buf backed or in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ */
+int
+rte_memseg_list_get_dmabuf_fd_thread_unsafe(const struct rte_memseg_list *msl);
+
+/**
+ * Get dma-buf offset associated with a memseg list.
+ *
+ * @note This function read-locks the memory hotplug subsystem, and thus cannot
+ *       be used within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf offset.
+ * @param offset
+ *   A pointer to offset value where the result will be stored.
+ *
+ * @return
+ *   0 on success.
+ *   -1 in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ *     - EINVAL  - ``offset`` pointer was NULL
+ */
+int
+rte_memseg_list_get_dmabuf_offset(const struct rte_memseg_list *msl,
+               uint64_t *offset);
+
+/**
+ * Get dma-buf offset associated with a memseg list.
+ *
+ * @note This function does not perform any locking, and is only safe to call
+ *       from within memory-related callback functions.
+ *
+ * @param msl
+ *   A pointer to memseg list for which to get dma-buf offset.
+ * @param offset
+ *   A pointer to offset value where the result will be stored.
+ *
+ * @return
+ *   0 on success.
+ *   -1 in case of error, with ``rte_errno`` set to:
+ *     - EINVAL  - ``msl`` pointer was NULL or did not point to a valid memseg list
+ *     - EINVAL  - ``offset`` pointer was NULL
+ */
+int
+rte_memseg_list_get_dmabuf_offset_thread_unsafe(const struct rte_memseg_list *msl,
+               uint64_t *offset);
+
 /**
  * Register external memory chunk with DPDK.
  *
@@ -443,6 +519,55 @@ int
 rte_extmem_register(void *va_addr, size_t len, rte_iova_t iova_addrs[],
                unsigned int n_pages, size_t page_sz);
 
+/**
+ * Register external memory chunk backed by a dma-buf with DPDK.
+ *
+ * This is similar to rte_extmem_register() but additionally stores dma-buf
+ * file descriptor information, allowing drivers to use dma-buf based
+ * memory registration (e.g., ibv_reg_dmabuf_mr for RDMA devices).
+ *
+ * @note Using this API is mutually exclusive with ``rte_malloc`` family of
+ *   API's.
+ *
+ * @note This API will not perform any DMA mapping. It is expected that user
+ *   will do that themselves via rte_dev_dma_map().
+ *
+ * @note Before accessing this memory in other processes, it needs to be
+ *   attached in each of those processes by calling ``rte_extmem_attach`` in
+ *   each other process.
+ *
+ * @param va_addr
+ *   Start of virtual area to register (mmap'd address of the dma-buf).
+ *   Must be aligned by ``page_sz``.
+ * @param len
+ *   Length of virtual area to register. Must be aligned by ``page_sz``.
+ *   This is independent of dmabuf_offset.
+ * @param dmabuf_fd
+ *   File descriptor of the dma-buf.
+ * @param dmabuf_offset
+ *   Offset within the dma-buf where the registered region starts.
+ * @param iova_addrs
+ *   Array of page IOVA addresses corresponding to each page in this memory
+ *   area. Can be NULL, in which case page IOVA addresses will be set to
+ *   RTE_BAD_IOVA.
+ * @param n_pages
+ *   Number of elements in the iova_addrs array. Ignored if ``iova_addrs``
+ *   is NULL.
+ * @param page_sz
+ *   Page size of the underlying memory
+ *
+ * @return
+ *   - 0 on success
+ *   - -1 in case of error, with rte_errno set to one of the following:
+ *     EINVAL - one of the parameters was invalid
+ *     EEXIST - memory chunk is already registered
+ *     ENOSPC - no more space in internal config to store a new memory chunk
+ */
+int
+rte_extmem_register_dmabuf(void *va_addr, size_t len,
+               int dmabuf_fd, uint64_t dmabuf_offset,
+               rte_iova_t iova_addrs[], unsigned int n_pages, size_t page_sz);
+
 /**
  * Unregister external memory chunk with DPDK.
  *
-- 
2.52.0
