On Mon, Mar 16, 2026 at 5:59 AM Jiri Pirko <[email protected]> wrote:
>
> From: Jiri Pirko <[email protected]>
>
> Add a new "system_cc_decrypted" dma-buf heap to allow userspace to
> allocate decrypted (shared) memory for confidential computing (CoCo)
> VMs.
>
> On CoCo VMs, guest memory is encrypted by default. The hardware uses an
> encryption bit in page table entries (C-bit on AMD SEV, "shared" bit on
> Intel TDX) to control whether a given memory access is encrypted or
> decrypted. The kernel's direct map is set up with encryption enabled,
> so pages returned by alloc_pages() are encrypted in the direct map
> by default. To make this memory usable for devices that do not support
> DMA to encrypted memory (no TDISP support), it has to be explicitly
> decrypted. A couple of things are needed to properly handle
> decrypted memory for the dma-buf use case:
>
> - set_memory_decrypted() on the direct map after allocation:
>   Besides clearing the encryption bit in the direct map PTEs, this
>   also notifies the hypervisor about the page state change. On free,
>   the inverse set_memory_encrypted() must be called before returning
>   pages to the allocator. If re-encryption fails, pages
>   are intentionally leaked to prevent decrypted memory from being
>   reused as private.
>
> - pgprot_decrypted() for userspace and kernel virtual mappings:
>   Any new mapping of the decrypted pages, be it to userspace via
>   mmap or to kernel vmalloc space via vmap, creates PTEs independent
>   of the direct map. These must also have the encryption bit cleared,
>   otherwise accesses through them would see encrypted (garbage) data.
>
> - DMA_ATTR_CC_DECRYPTED for DMA mapping:
>   Since the pages are already decrypted, the DMA API needs to be
>   informed via DMA_ATTR_CC_DECRYPTED so it can map them correctly
>   as unencrypted for device access.
>
> On non-CoCo VMs, the system_cc_decrypted heap is not registered
> to prevent misuse by userspace that does not understand
> the security implications of explicitly decrypted memory.
>
> Signed-off-by: Jiri Pirko <[email protected]>
> ---
> v2->v3:
> - removed couple of leftovers from headers
> v1->v2:
> - fixed build errors on s390 by including mem_encrypt.h
> - converted system heap flag implementation to a separate heap
> ---
>  drivers/dma-buf/heaps/system_heap.c | 103 ++++++++++++++++++++++++++--
>  1 file changed, 98 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/dma-buf/heaps/system_heap.c 
> b/drivers/dma-buf/heaps/system_heap.c
> index b3650d8fd651..a525e9aaaffa 100644
> --- a/drivers/dma-buf/heaps/system_heap.c
> +++ b/drivers/dma-buf/heaps/system_heap.c
> @@ -10,17 +10,25 @@
>   *     Andrew F. Davis <[email protected]>
>   */
>
> +#include <linux/cc_platform.h>
>  #include <linux/dma-buf.h>
>  #include <linux/dma-mapping.h>
>  #include <linux/dma-heap.h>
>  #include <linux/err.h>
>  #include <linux/highmem.h>
> +#include <linux/mem_encrypt.h>
>  #include <linux/mm.h>
> +#include <linux/set_memory.h>
>  #include <linux/module.h>
> +#include <linux/pgtable.h>
>  #include <linux/scatterlist.h>
>  #include <linux/slab.h>
>  #include <linux/vmalloc.h>
>
> +struct system_heap_priv {
> +       bool decrypted;
> +};

Hi Jiri,

I wonder if it'd be better to call this cc_decrypted (or perhaps
cc_shared, based on Robin's comment on the previous patch), like the
DMA attr? There's a separate effort for "restricted" heaps with TEE for
(encrypted) video playback, which doesn't involve VMs or RDMA. I think
the cc_ prefix might help avoid any confusion between the use case here
and restricted heaps.



> +
>  struct system_heap_buffer {
>         struct dma_heap *heap;
>         struct list_head attachments;
> @@ -29,6 +37,7 @@ struct system_heap_buffer {
>         struct sg_table sg_table;
>         int vmap_cnt;
>         void *vaddr;
> +       bool decrypted;
>  };
>
>  struct dma_heap_attachment {
> @@ -36,6 +45,7 @@ struct dma_heap_attachment {
>         struct sg_table table;
>         struct list_head list;
>         bool mapped;
> +       bool decrypted;
>  };
>
>  #define LOW_ORDER_GFP (GFP_HIGHUSER | __GFP_ZERO)
> @@ -52,6 +62,34 @@ static gfp_t order_flags[] = {HIGH_ORDER_GFP, 
> HIGH_ORDER_GFP, LOW_ORDER_GFP};
>  static const unsigned int orders[] = {8, 4, 0};
>  #define NUM_ORDERS ARRAY_SIZE(orders)
>
> +static int system_heap_set_page_decrypted(struct page *page)
> +{
> +       unsigned long addr = (unsigned long)page_address(page);
> +       unsigned int nr_pages = 1 << compound_order(page);
> +       int ret;
> +
> +       ret = set_memory_decrypted(addr, nr_pages);
> +       if (ret)
> +               pr_warn_ratelimited("dma-buf system heap: failed to decrypt 
> page at %p\n",
> +                                   page_address(page));
> +
> +       return ret;
> +}
> +
> +static int system_heap_set_page_encrypted(struct page *page)
> +{
> +       unsigned long addr = (unsigned long)page_address(page);
> +       unsigned int nr_pages = 1 << compound_order(page);
> +       int ret;
> +
> +       ret = set_memory_encrypted(addr, nr_pages);
> +       if (ret)
> +               pr_warn_ratelimited("dma-buf system heap: failed to 
> re-encrypt page at %p, leaking memory\n",
> +                                   page_address(page));
> +
> +       return ret;
> +}
> +
>  static int dup_sg_table(struct sg_table *from, struct sg_table *to)
>  {
>         struct scatterlist *sg, *new_sg;
> @@ -90,6 +128,7 @@ static int system_heap_attach(struct dma_buf *dmabuf,
>         a->dev = attachment->dev;
>         INIT_LIST_HEAD(&a->list);
>         a->mapped = false;
> +       a->decrypted = buffer->decrypted;
>
>         attachment->priv = a;
>
> @@ -119,9 +158,11 @@ static struct sg_table *system_heap_map_dma_buf(struct 
> dma_buf_attachment *attac
>  {
>         struct dma_heap_attachment *a = attachment->priv;
>         struct sg_table *table = &a->table;
> +       unsigned long attrs;
>         int ret;
>
> -       ret = dma_map_sgtable(attachment->dev, table, direction, 0);
> +       attrs = a->decrypted ? DMA_ATTR_CC_DECRYPTED : 0;
> +       ret = dma_map_sgtable(attachment->dev, table, direction, attrs);
>         if (ret)
>                 return ERR_PTR(ret);
>
> @@ -188,8 +229,13 @@ static int system_heap_mmap(struct dma_buf *dmabuf, 
> struct vm_area_struct *vma)
>         unsigned long addr = vma->vm_start;
>         unsigned long pgoff = vma->vm_pgoff;
>         struct scatterlist *sg;
> +       pgprot_t prot;
>         int i, ret;
>
> +       prot = vma->vm_page_prot;
> +       if (buffer->decrypted)
> +               prot = pgprot_decrypted(prot);
> +
>         for_each_sgtable_sg(table, sg, i) {
>                 unsigned long n = sg->length >> PAGE_SHIFT;
>
> @@ -206,8 +252,7 @@ static int system_heap_mmap(struct dma_buf *dmabuf, 
> struct vm_area_struct *vma)
>                 if (addr + size > vma->vm_end)
>                         size = vma->vm_end - addr;
>
> -               ret = remap_pfn_range(vma, addr, page_to_pfn(page),
> -                               size, vma->vm_page_prot);
> +               ret = remap_pfn_range(vma, addr, page_to_pfn(page), size, 
> prot);
>                 if (ret)
>                         return ret;
>
> @@ -225,6 +270,7 @@ static void *system_heap_do_vmap(struct 
> system_heap_buffer *buffer)
>         struct page **pages = vmalloc(sizeof(struct page *) * npages);
>         struct page **tmp = pages;
>         struct sg_page_iter piter;
> +       pgprot_t prot;
>         void *vaddr;
>
>         if (!pages)
> @@ -235,7 +281,10 @@ static void *system_heap_do_vmap(struct 
> system_heap_buffer *buffer)
>                 *tmp++ = sg_page_iter_page(&piter);
>         }
>
> -       vaddr = vmap(pages, npages, VM_MAP, PAGE_KERNEL);
> +       prot = PAGE_KERNEL;
> +       if (buffer->decrypted)
> +               prot = pgprot_decrypted(prot);
> +       vaddr = vmap(pages, npages, VM_MAP, prot);
>         vfree(pages);
>
>         if (!vaddr)
> @@ -296,6 +345,14 @@ static void system_heap_dma_buf_release(struct dma_buf 
> *dmabuf)
>         for_each_sgtable_sg(table, sg, i) {
>                 struct page *page = sg_page(sg);
>
> +               /*
> +                * Intentionally leak pages that cannot be re-encrypted
> +                * to prevent decrypted memory from being reused.
> +                */
> +               if (buffer->decrypted &&
> +                   system_heap_set_page_encrypted(page))
> +                       continue;
> +
>                 __free_pages(page, compound_order(page));
>         }
>         sg_free_table(table);
> @@ -347,6 +404,8 @@ static struct dma_buf *system_heap_allocate(struct 
> dma_heap *heap,
>         DEFINE_DMA_BUF_EXPORT_INFO(exp_info);
>         unsigned long size_remaining = len;
>         unsigned int max_order = orders[0];
> +       struct system_heap_priv *priv = dma_heap_get_drvdata(heap);
> +       bool decrypted = priv->decrypted;
>         struct dma_buf *dmabuf;
>         struct sg_table *table;
>         struct scatterlist *sg;
> @@ -362,6 +421,7 @@ static struct dma_buf *system_heap_allocate(struct 
> dma_heap *heap,
>         mutex_init(&buffer->lock);
>         buffer->heap = heap;
>         buffer->len = len;
> +       buffer->decrypted = decrypted;
>
>         INIT_LIST_HEAD(&pages);
>         i = 0;
> @@ -396,6 +456,14 @@ static struct dma_buf *system_heap_allocate(struct 
> dma_heap *heap,
>                 list_del(&page->lru);
>         }
>
> +       if (decrypted) {
> +               for_each_sgtable_sg(table, sg, i) {
> +                       ret = system_heap_set_page_decrypted(sg_page(sg));
> +                       if (ret)
> +                               goto free_pages;
> +               }
> +       }
> +
>         /* create the dmabuf */
>         exp_info.exp_name = dma_heap_get_name(heap);
>         exp_info.ops = &system_heap_buf_ops;
> @@ -413,6 +481,13 @@ static struct dma_buf *system_heap_allocate(struct 
> dma_heap *heap,
>         for_each_sgtable_sg(table, sg, i) {
>                 struct page *p = sg_page(sg);
>
> +               /*
> +                * Intentionally leak pages that cannot be re-encrypted
> +                * to prevent decrypted memory from being reused.
> +                */
> +               if (buffer->decrypted &&
> +                   system_heap_set_page_encrypted(p))
> +                       continue;
>                 __free_pages(p, compound_order(p));
>         }
>         sg_free_table(table);
> @@ -428,6 +503,14 @@ static const struct dma_heap_ops system_heap_ops = {
>         .allocate = system_heap_allocate,
>  };
>
> +static struct system_heap_priv system_heap_priv = {
> +       .decrypted = false,
> +};
> +
> +static struct system_heap_priv system_heap_cc_decrypted_priv = {
> +       .decrypted = true,
> +};
> +
>  static int __init system_heap_create(void)
>  {
>         struct dma_heap_export_info exp_info;
> @@ -435,8 +518,18 @@ static int __init system_heap_create(void)
>
>         exp_info.name = "system";
>         exp_info.ops = &system_heap_ops;
> -       exp_info.priv = NULL;
> +       exp_info.priv = &system_heap_priv;
> +
> +       sys_heap = dma_heap_add(&exp_info);
> +       if (IS_ERR(sys_heap))
> +               return PTR_ERR(sys_heap);
> +
> +       if (IS_ENABLED(CONFIG_HIGHMEM) ||
> +           !cc_platform_has(CC_ATTR_MEM_ENCRYPT))
> +               return 0;
>
> +       exp_info.name = "system_cc_decrypted";
> +       exp_info.priv = &system_heap_cc_decrypted_priv;
>         sys_heap = dma_heap_add(&exp_info);
>         if (IS_ERR(sys_heap))
>                 return PTR_ERR(sys_heap);
> --
> 2.51.1
>

Reply via email to