On Fri, Sep 04, 2015 at 09:59:00AM -0700, Jesse Barnes wrote:
> New file with VT-d SVM and PASID handling functions and page table
> management.  This belongs in the IOMMU code (along with some extra bits
> for waiting for invalidations and page faults to complete, flushing the
> device IOTLB, etc.)
> 
> FIXME:
>   need work queue for re-submitting contexts
>   TE bit handling on SKL
> ---
>  drivers/gpu/drm/i915/Makefile           |    5 +-
>  drivers/gpu/drm/i915/i915_drv.h         |   43 ++
>  drivers/gpu/drm/i915/i915_gem.c         |    3 +
>  drivers/gpu/drm/i915/i915_gem_context.c |    3 +
>  drivers/gpu/drm/i915/i915_irq.c         |    7 +
>  drivers/gpu/drm/i915/i915_reg.h         |   47 ++
>  drivers/gpu/drm/i915/i915_svm.c         | 1102 +++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/intel_lrc.c        |  120 +++-
>  drivers/gpu/drm/i915/intel_lrc.h        |    1 +
>  9 files changed, 1299 insertions(+), 32 deletions(-)
>  create mode 100644 drivers/gpu/drm/i915/i915_svm.c
> 
> diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile
> index 44d290a..e4883a7 100644
> --- a/drivers/gpu/drm/i915/Makefile
> +++ b/drivers/gpu/drm/i915/Makefile
> @@ -38,7 +38,8 @@ i915-y += i915_cmd_parser.o \
>         intel_lrc.o \
>         intel_mocs.o \
>         intel_ringbuffer.o \
> -       intel_uncore.o
> +       intel_uncore.o \
> +       i915_svm.o

Correct me if I am wrong, but it looks like i915_svm.c implements the
low-level interface with the hardware, so by convention it should be
intel_svm.c.

>  # general-purpose microcontroller (GuC) support
>  i915-y += intel_guc_loader.o \
> @@ -93,6 +94,8 @@ i915-y += dvo_ch7017.o \
>  # virtual gpu code
>  i915-y += i915_vgpu.o
>  
> +i915-$(CONFIG_MMU_NOTIFIER) += i915_svm.o

Added twice?

> +
>  # legacy horrors
>  i915-y += i915_dma.o
>  
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 20beb51..ca38a7a 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -47,6 +47,7 @@
>  #include <drm/drm_gem.h>
>  #include <linux/backlight.h>
>  #include <linux/hashtable.h>
> +#include <linux/mmu_notifier.h>
>  #include <linux/intel-iommu.h>
>  #include <linux/kref.h>
>  #include <linux/pm_qos.h>
> @@ -848,6 +849,13 @@ struct i915_ctx_hang_stats {
>       bool banned;
>  };
>  
> +struct intel_mm_struct {
> +     struct kref kref;
> +     struct mmu_notifier notifier;
> +     struct drm_i915_private *dev_priv;
> +     struct list_head context_list;
> +};

Doesn't this look kind of familiar? struct i915_mm_struct perhaps?

> +
>  /* This must match up with the value previously used for execbuf2.rsvd1. */
>  #define DEFAULT_CONTEXT_HANDLE 0
>  
> @@ -874,6 +882,9 @@ struct i915_ctx_hang_stats {
>  struct intel_context {
>       struct kref ref;
>       int user_handle;
> +     bool is_svm; /* shares x86 page tables */
> +     u32 pasid; /* 20 bits */
> +     struct intel_mm_struct *ims;
>       uint8_t remap_slice;
>       struct drm_i915_private *i915;
>       int flags;
> @@ -895,6 +906,9 @@ struct intel_context {
>               int pin_count;
>       } engine[I915_NUM_RINGS];
>  
> +     struct list_head mm_list;

This is a link, name it so.
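
Something like

	struct list_head mm_link;

here, say, with the corresponding list head in intel_mm_struct called
ctx_list.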

> +     struct task_struct *tsk;

One task? A context can be passed by the device fd to another process.
Do we inherit the VM along with the context? I don't see anything to
prevent that.

> +static void gpu_mm_segv(struct task_struct *tsk, unsigned long address,
> +                     int si_code)
> +{
> +     siginfo_t info;
> +
> +     /* Need specific signal info here */
> +     info.si_signo   = SIGSEGV;
> +     info.si_errno   = EIO;
> +     info.si_code    = si_code;
> +     info.si_addr    = (void __user *)address;
> +
> +     force_sig_info(SIGSEGV, &info, tsk);

force_sig_info() is not exported -- ah, you build i915_svm.c in.

> +}
> +
> +/*
> + * Read the fault descriptor and handle the fault:
> + *   get PML4 from PASID
> + *   get mm struct
> + *   get the vma
> + *   verify the address is valid
> + *   call handle_mm_fault after taking the mm->mmap_sem
> + */
> +void intel_gpu_fault_work(struct work_struct *work)
> +{
> +     struct i915_svm_state *svm = container_of(work, struct i915_svm_state,
> +                                               work);
> +     struct drm_i915_private *dev_priv =
> +             container_of(svm, struct drm_i915_private, svm);
> +     struct drm_device *dev = dev_priv->dev;
> +     struct intel_ringbuffer *ringbuf;
> +     struct page_request_dsc desc;
> +     struct page_group_response_dsc resp;
> +     struct intel_context *ctx;
> +     struct task_struct *tsk;
> +     struct mm_struct *mm;
> +     struct vm_area_struct *vma;
> +     u64 address;
> +     int ret;
> +
> +     DRM_ERROR("PRQ updated, head 0x%08x, tail 0x%08x\n",
> +               I915_READ(SVM_PRQ_HEAD), I915_READ(SVM_PRQ_TAIL));
> +     prq_read_descriptor(dev, &desc);
> +     DRM_ERROR("page fault on addr 0x%016llx, PASID %d, srr %d\n",
> +               (u64)(desc.addr << PAGE_SHIFT), desc.pasid, desc.srr);
> +
> +     spin_lock(&dev_priv->svm.lock);
> +     ctx = dev_priv->svm.pasid_ctx[desc.pasid];
> +     tsk = ctx->tsk;
> +     mm = tsk->mm;
> +     address = desc.addr << PAGE_SHIFT;
> +     ringbuf = ctx->engine[RCS].ringbuf;
> +     spin_unlock(&dev_priv->svm.lock);

All of the above can disappear at any time after the unlock?
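
Something along these lines, perhaps (untested, purely to illustrate
taking references while the lock is still held):

	spin_lock(&dev_priv->svm.lock);
	ctx = dev_priv->svm.pasid_ctx[desc.pasid];
	if (!ctx) {
		spin_unlock(&dev_priv->svm.lock);
		return;
	}
	i915_gem_context_reference(ctx);
	tsk = ctx->tsk;
	get_task_struct(tsk);
	ringbuf = ctx->engine[RCS].ringbuf;
	spin_unlock(&dev_priv->svm.lock);

	mm = get_task_mm(tsk);	/* NULL once the mm has gone away */
	if (!mm)
		goto out_put;

with matching mmput()/put_task_struct()/context unreference on every
exit path.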

> +
> +     down_read_trylock(&mm->mmap_sem);
> +     vma = find_extend_vma(mm, address);
> +     if (!vma || address < vma->vm_start) {
> +             DRM_ERROR("bad VMA or address out of range\n");
> +             gpu_mm_segv(tsk, address, SEGV_MAPERR);
> +             goto out_unlock; /* need to kill process */
> +     }
> +
> +     ret = handle_mm_fault(mm, vma, address,
> +                           desc.wr_req ? FAULT_FLAG_WRITE : 0);
> +     if (ret & VM_FAULT_ERROR) {
> +             gpu_mm_segv(tsk, address, SEGV_ACCERR); /* ? */
> +             goto out_unlock;
> +     }
> +
> +     if (ret & VM_FAULT_MAJOR)
> +             tsk->maj_flt++;
> +     else
> +             tsk->min_flt++;
> +
> +     if (desc.srr)
> +             resp.dsc_type = PAGE_STREAM_RESP_DSC;
> +     else
> +             resp.dsc_type = PAGE_GRP_RESP_DSC;
> +     resp.pasid = desc.pasid;
> +     resp.pasid_present = 1;
> +     resp.requestor_id = PCI_DEVID(0, PCI_DEVFN(2,0));
> +     resp.resp_code = RESP_CODE_SUCCESS;
> +     resp.prg_index = desc.prg_index;
> +     resp.private = desc.private;
> +     ivq_write_resp_descriptor(dev, &resp);
> +out_unlock:
> +     up_read(&mm->mmap_sem);
> +
> +     /* FIXME: wait for page response to be serviced */
> +
> +     /* FIXME: queue context for re-submit */
> +     /* execlists_context_queue(req); */
> +}

> +/* Make sure GPU writes can't hit the mm that's about to go away */
> +static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm)
> +{
> +     struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +                                                notifier);
> +     struct drm_i915_private *dev_priv = ims->dev_priv;
> +     struct drm_device *dev = dev_priv->dev;
> +     struct intel_context *ctx;
> +
> +     /*
> +      * Wait for any outstanding activity and unbind the mm.  Since
> +      * each context has its own ring, we can simply wait for the ring
> +      * to idle before invalidating the PASID and flushing the TLB.
> +      */
> +     mutex_lock(&dev->struct_mutex);
> +     list_for_each_entry(ctx, &ims->context_list, mm_list) {
> +             intel_ring_idle(ctx->engine[RCS].ringbuf->ring);
> +     }
> +
> +     intel_iommu_tlb_flush(dev_priv->dev);
> +     mutex_unlock(&dev->struct_mutex);

Erm, what! So you halt the GPU every time? But you've already invalidated
the shadow PTE -- ah, invalidate-range looks to be a WIP.

> +static void intel_flush_page_locked(struct drm_device *dev, int pasid,
> +                                 unsigned long address)
> +{
> +     struct ext_iotlb_inv_dsc dsc = { 0 };
> +
> +     dsc.dsc_type = EXT_IOTLB_INV_DSC;
> +     dsc.g = EXT_IOTLB_INV_G_PASID_PAGE_SELECT;
> +     dsc.pasid = pasid;
> +     dsc.ih = 0;
> +     dsc.addr = address;
> +     dsc.am = 1;
> +     ivq_write_ext_iotlb_inv_descriptor(dev, &dsc);
> +}
> +
> +static void intel_change_pte(struct mmu_notifier *mn, struct mm_struct *mm,
> +                          unsigned long address, pte_t pte)
> +{
> +     struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +                                                notifier);
> +     struct drm_i915_private *dev_priv = ims->dev_priv;
> +     struct drm_device *dev = dev_priv->dev;
> +
> +     struct intel_context *ctx;
> +
> +     mutex_lock(&dev->struct_mutex);
> +     list_for_each_entry(ctx, &ims->context_list, mm_list)
> +             intel_flush_page_locked(dev, ctx->pasid, address);
> +     mutex_unlock(&dev->struct_mutex);

Suggests you really want an ims->spinlock for context_list instead.
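
i.e. (with ims->lock being a new spinlock_t in intel_mm_struct):

	spin_lock(&ims->lock);
	list_for_each_entry(ctx, &ims->context_list, mm_list)
		intel_flush_page_locked(dev, ctx->pasid, address);
	spin_unlock(&ims->lock);

rather than grabbing struct_mutex from an mmu notifier callback.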

> +}
> +
> +static void intel_invalidate_page(struct mmu_notifier *mn,
> +                               struct mm_struct *mm,
> +                               unsigned long address)
> +{
> +     struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +                                                notifier);
> +     struct drm_i915_private *dev_priv = ims->dev_priv;
> +     struct drm_device *dev = dev_priv->dev;
> +     struct intel_context *ctx;
> +
> +     mutex_lock(&dev->struct_mutex);
> +     list_for_each_entry(ctx, &ims->context_list, mm_list)
> +             intel_flush_page_locked(dev, ctx->pasid, address);
> +     mutex_unlock(&dev->struct_mutex);
> +}
> +
> +/* Need to unmap this range and make sure it doesn't get re-faulted */
> +static void intel_invalidate_range_start(struct mmu_notifier *mn,
> +                                      struct mm_struct *mm,
> +                                      unsigned long start, unsigned long end)
> +{
> +     struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +                                                notifier);
> +     struct drm_i915_private *dev_priv = ims->dev_priv;
> +     struct drm_device *dev = dev_priv->dev;
> +
> +     /* FIXME: invalidate page only */
> +     intel_iommu_tlb_flush(dev);
> +}
> +
> +/* Pages have been freed at this point */
> +static void intel_invalidate_range_end(struct mmu_notifier *mn,
> +                                    struct mm_struct *mm,
> +                                    unsigned long start, unsigned long end)
> +{
> +     struct intel_mm_struct *ims = container_of(mn, struct intel_mm_struct,
> +                                                notifier);
> +     struct drm_i915_private *dev_priv = ims->dev_priv;
> +     struct drm_device *dev = dev_priv->dev;
> +
> +     /* FIXME: invalidate page only */
> +     intel_iommu_tlb_flush(dev);
> +}
> +
> +static const struct mmu_notifier_ops intel_mmuops = {
> +     .release = intel_mm_release,
> +     /* no clear_flush_young, we just share the x86 bits */
> +     /* no test_young, we just share the x86 bits */
> +     .change_pte = intel_change_pte,
> +     .invalidate_page = intel_invalidate_page,
> +     .invalidate_range_start = intel_invalidate_range_start,
> +     .invalidate_range_end = intel_invalidate_range_end,
> +};
> +
> +struct intel_mm_struct *intel_bind_mm(struct drm_device *dev,
> +                                   struct intel_context *ctx)
> +{
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     struct intel_mm_struct *ims;
> +     struct mmu_notifier *mn;
> +     int ret;
> +
> +     WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +     mn = mmu_find_ops(current->mm, &intel_mmuops);

Magic function, I am missing its definition
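
Guessing, it would have to live in mm/mmu_notifier.c and look roughly
like:

	struct mmu_notifier *mmu_find_ops(struct mm_struct *mm,
					  const struct mmu_notifier_ops *ops)
	{
		struct mmu_notifier *mn, *found = NULL;

		if (!mm_has_notifiers(mm))
			return NULL;

		spin_lock(&mm->mmu_notifier_mm->lock);
		hlist_for_each_entry(mn, &mm->mmu_notifier_mm->list, hlist) {
			if (mn->ops == ops) {
				found = mn;
				break;
			}
		}
		spin_unlock(&mm->mmu_notifier_mm->lock);

		return found;
	}

but nothing like that is in this series.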

> +     if (mn) {
> +             ims = container_of(mn, struct intel_mm_struct, notifier);
> +             kref_get(&ims->kref);
> +             goto out;
> +     }
> +
> +     ims = kzalloc(sizeof(*ims), GFP_KERNEL);
> +     if (!ims) {
> +             ret = -ENOMEM;
> +             goto error;
> +     }
> +     INIT_LIST_HEAD(&ims->context_list);
> +
> +     ims->notifier.ops = &intel_mmuops;
> +
> +     ret = mmu_notifier_register(&ims->notifier, current->mm);

This has lock inversion between struct_mutex and mm->mmap_sem.
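
mmu_notifier_register() takes mmap_sem, and we are already holding
struct_mutex here, while the notifier callbacks above can be called
with mmap_sem held and then take struct_mutex. One way out would be to
register before taking struct_mutex, roughly:

	/* in the caller, before mutex_lock(&dev->struct_mutex) */
	ims = intel_bind_mm(dev, ctx);	/* does mmu_notifier_register() */
	if (IS_ERR(ims))
		return PTR_ERR(ims);

	mutex_lock(&dev->struct_mutex);
	list_add(&ctx->mm_list, &ims->context_list);
	mutex_unlock(&dev->struct_mutex);

(and drop struct_mutex from the callbacks themselves, per the
ims->spinlock suggestion above).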

> +     if (ret)
> +             goto error;
> +
> +     ims->dev_priv = dev->dev_private;
> +
> +out:
> +     list_add(&ctx->mm_list, &ims->context_list);
> +     return ims;
> +error:
> +     kfree(ims);
> +     return ERR_PTR(ret);
> +}
> +
> +static void intel_mm_free(struct kref *ims_ref)
> +{
> +     struct intel_mm_struct *ims =
> +             container_of(ims_ref, struct intel_mm_struct, kref);
> +
> +     mmu_notifier_unregister(&ims->notifier, current->mm);

More lock inversion.
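
Also note that by the time the last kref is dropped, current->mm is not
necessarily the mm this notifier was registered against, so the mm
probably needs stashing in the ims at bind time, e.g.:

	static void intel_mm_free(struct kref *ims_ref)
	{
		struct intel_mm_struct *ims =
			container_of(ims_ref, struct intel_mm_struct, kref);

		/* ims->mm would be saved at mmu_notifier_register() time */
		mmu_notifier_unregister(&ims->notifier, ims->mm);
		kfree(ims);
	}

and the unregister still must not be called under struct_mutex.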

> +     kfree(ims);
> +}
> +
> +void intel_unbind_mm(struct intel_context *ctx)
> +{
> +     struct drm_i915_private *dev_priv = ctx->ims->dev_priv;
> +
> +     WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +     list_del(&ctx->mm_list);
> +     kref_put(&ctx->ims->kref, intel_mm_free);
> +
> +     return;
> +}
> +
> +int intel_exec_mm_ioctl(struct drm_device *dev, void *data,
> +                     struct drm_file *file)
> +{
> +//   struct drm_i915_exec_mm *exec_mm = data;
> +//   struct drm_i915_private *dev_priv = dev->dev_private;
> +
> +     /* Load new context into context reg */

Ah, there is a modicum of user API here.

> +     return 0;
> +}
> +
> +/*
> + * The PASID table has 32 entries in the current config, rotate through
> + * them as needed.
> + */
> +int intel_alloc_pasid(struct drm_device *dev, struct intel_context *ctx)
> +{
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     struct pasid_table_entry *table;
> +     int i;
> +
> +     WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +     spin_lock(&dev_priv->svm.lock);
> +     table = dev_priv->svm.pasid_table;
> +
> +     for (i = 0; i < PASID_COUNT; i++) {
> +             if (!table[i].present)
> +                     goto found;
> +     }
> +
> +     spin_unlock(&dev_priv->svm.lock);
> +     return -1;
> +
> +found:
> +     table[i].pml4 = __pa(current->mm->pgd) >> PAGE_SHIFT;
> +     table[i].present = 1;
> +
> +     ctx->pasid = i;
> +     dev_priv->svm.pasid_ctx[ctx->pasid] = NULL;
> +     spin_unlock(&dev_priv->svm.lock);
> +
> +     intel_iommu_tlb_flush(dev);
> +
> +     return 0;
> +}
> +
> +void intel_free_pasid(struct drm_device *dev, struct intel_context *ctx)
> +{
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     struct pasid_table_entry *table;
> +
> +     WARN_ON(!mutex_is_locked(&dev_priv->dev->struct_mutex));
> +
> +     if (ctx->pasid >= PASID_COUNT)
> +             return;
> +
> +     spin_lock(&dev_priv->svm.lock);
> +     table = dev_priv->svm.pasid_table;
> +     memset(&table[ctx->pasid], 0, sizeof(struct pasid_table_entry));
> +     dev_priv->svm.pasid_ctx[ctx->pasid] = NULL;
> +     ctx->pasid = -1;
> +     spin_unlock(&dev_priv->svm.lock);
> +
> +     intel_iommu_tlb_flush(dev);
> +}
> +
> +/*
> + * Each root table entry is 16 bytes wide.  In legacy mode, only
> + * the lower 64 bits are used:
> + *   Bits 38:12: context table pointer
> + *   Bit 0: present
> + *   all other bits reserved
> + * In extended mode (what we use for SVM):
> + *   Bits 102:76: upper context table pointer
> + *   Bit 64: upper present
> + *   Bits 38:12: lower context table pointer
> + *   Bit 0: lower present
> + *   all other bits reserved
> + *
> + * The context entries are 128 bit in legacy mode:
> + *   Bits 87:72: Domain ID
> + *   Bits 70:67: Available
> + *   Bits 66:64: Address width
> + *   Bits 38:12: Page table pointer
> + *   Bits 3:2: Translation type
> + *     00 - only untranslated DMA requests go through this table
> + *          translated and translation requests are blocked
> + *     01 - untranslated, translated, and translation requests supported
> + *     10 - untranslated requests are treated as pass through (HPA == GPA),
> + *          translated DMA requests and translation requests are blocked
> + *     11 - reserved
> + *   Bit 1: fault disable
> + *   Bit 0: Present
> + * and 256 bit in extended:
> + *   Bits 230:204: PASID state table pointer
> + *   Bits 166:140: PASID table pointer
> + *   Bits 131:128: PASID table size
> + *   Bits 127:96: Page table attribute (PAT)
> + *   Bit 92: SL64KPE
> + *   Bit 91: SLEE
> + *   Bit 90: ERE
> + *   Bit 89: SRE
> + *   Bit 88: SMEP
> + *   Bits 87:72: Domain ID
> + *   Bit 71: Extended memory type enable
> + *   Bit 70: cache disable (CD)
> + *   Bit 69: write protect (WP)
> + *   Bit 68: no execute enable (NXE)
> + *   Bit 67: page global enable (PGE)
> + *   Bits 66:64: address width
> + *   Bits 38:12: 2nd level (VT-d) page table pointer
> + *   Bit 11: PASID enable
> + *   Bit 10: Nesting enable
> + *   Bit 9: Page Request enable
> + *   Bit 8: Lazy-Invalidate enable
> + *   Bits 7:5: Extended Memory Type (VT-d)
> + *   Bits 4:2: Translation type
> + *     000 - Only Untranslated DMA requests are translated through this page
> + *           table. Translated DMA requests and Translation Requests are
> + *           blocked.  Untranslated requests-without-PASID are remapped using
> + *           the second-level page-table referenced through SLPTPTR field.
> + *           If PASIDE field is Set, Untranslated requests-with-PASID are
> + *           remapped using the PASID Table referenced through PASIDPTPTR
> + *           field. If PASIDE field is Clear, Untranslated requests-with-PASID
> + *           are blocked.  Translation requests (with or without PASID), and
> + *           Translated Requests are blocked.
> + *     001 - Un-translated and Translation requests without PASID supported
> + *           (and with PASID supported, if PASID Enable Set); Translate
> + *           requests bypass address translation.  Untranslated
> + *           requests-without-PASID and Translation requests-without-PASID are
> + *           remapped using the second level page-table referenced through
> + *           SLPTPTR field. If PASIDE field is Set, Untranslated
> + *           requests-with-PASID and Translation requests-with-PASID are
> + *           remapped using the PASID Table referenced through PASIDPTPTR
> + *           field. If PASIDE field is Clear, Untranslated requests-with-PASID,
> + *           and Translation requests-with-PASID, are blocked. Translated
> + *           requests bypass address translation.
> + *     010 - If Pass-through Supported (GT supports pass-through),
> + *           Un-translated requests without PASID bypass address translation;
> + *           All other requests (with or without PASID) blocked. Untranslated
> + *           requests-without-PASID bypass address translation and are
> + *           processed as passthrough. SLPTPTR field is ignored by hardware.
> + *           Untranslated requests-with-PASID, Translation requests (with or
> + *           without PASID), and Translated requests are blocked.
> + *     011 - Reserved.
> + *     100 - Un-translated requests without PASID bypass address translation;
> + *           Un-translated requests with PASID supported, if PASID Enable Set;
> + *           All other requests blocked. Untranslated requests-without-PASID
> + *           bypass address translation and are processed as passthrough.
> + *           SLPTPTR field is ignored by hardware. Untranslated
> + *           requests-with-PASID are remapped using the PASID Table referenced
> + *           through PASIDPTPTR field. Translation requests (with or without
> + *           PASID) and Translated requests are blocked.
> + *     101 - Un-translated and Translation requests without PASID bypass
> + *           address translation; Un-translated and Translation requests with
> + *           PASID supported, if PASID Enable Set; Translated requests bypass
> + *           address translation.  Untranslated requests-without-PASID bypass
> + *           address translation and are processed as passthrough. SLPTPTR
> + *           field is ignored by hardware.  Translation requests-without-PASID
> + *           are responded with Untranslated access only bit Set (U=1) along
> + *           with read and write permissions (R=W=1). SLPTPTR field is ignored
> + *           by hardware. Untranslated requests-with-PASID, and Translation
> + *           requests-with-PASID are remapped using the PASID Table referenced
> + *           through PASIDPTPTR field.  Translated requests bypass address
> + *           translation.
> + *     110 - Un-translated requests without PASID are blocked; Un-translated
> + *           requests with PASID supported, if PASID Enable Set; All other
> + *           requests blocked – Not applicable to GFX, GT should treat this as
> + *           reserved.
> + *     111 - Un-translated and Translation requests without PASID blocked;
> + *           Un-translated and Translation requests with PASID supported, if
> + *           PASID Enable Set; Translated requests bypass address translation.
> + *           Note: Not applicable to GFX, GT should treat this as reserved.
> + *   Bit 1: Fault disable
> + *   Bit 0: Present
> + *
> + * Page walks for graphics addresses can go through one or two levels of
> + * translation, depending on whether VT-d is enabled.
> + *
> + * If we're in driver mode (currently the only supported mode), we always
> + * use a single level of translation, meaning the second level page table
> + * pointer (if present) is ignored.
> + *
> + * The full walk starts at the root table, which indexes into the upper
> + * and lower context tables.  Those tables point to PASID mapping and state
> + * tables and potentially a second level page table for VT-d (which, as noted
> + * above, is unused currently).  The PASID mapping table points to a PML4
> + * (x86 compatible) page table, while the state table indicates other
> + * information about the PASID involved in the request, which ultimately comes
> + * from the execlist port submission of the context descriptor.
> + *
> + * To enable a shared CPU/GPU address space, we can use a couple of different
> + * translation types, either 101 or 01 w/o nesting.  The main requirement
> + * is that requests with PASID are translated through the page tables provided,
> + * potentially with nesting if we're running in a VT-d context (which we
> + * don't currently support).
> + */
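
For reference, I read the extended root entry as something like the
following (guessing at the actual struct in the patch, which isn't in
this hunk):

	struct extended_root_table_entry {
		u64 lo_present:1;
		u64 rsvd0:11;
		u64 lo_ctx_addr:27;	/* bits 38:12 */
		u64 rsvd1:25;
		u64 hi_present:1;
		u64 rsvd2:11;
		u64 hi_ctx_addr:27;	/* bits 102:76 */
		u64 rsvd3:25;
	};
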
> +#define CONTEXT_OFFSET (PAGE_SIZE * 1)
> +#define PASID_OFFSET (PAGE_SIZE * 2)
> +#define PASID_STATE_OFFSET (PAGE_SIZE * 3)
> +#define PRQ_OFFSET (PAGE_SIZE * 4)
> +#define IVQ_OFFSET (PAGE_SIZE * 5)
> +static void intel_init_svm_root_table(struct drm_device *dev,
> +                                   drm_dma_handle_t *tables)
> +{
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     struct extended_root_table_entry *root_table;
> +     struct extended_context_table_entry *context;
> +     struct pasid_table_entry *pasid_table;
> +     struct pasid_state_table_entry *pasid_state_table;
> +     u64 *tmp;
> +
> +     root_table = tables->vaddr;
> +     context = tables->vaddr + CONTEXT_OFFSET;
> +        pasid_table = tables->vaddr + PASID_OFFSET;
> +     pasid_state_table = tables->vaddr + PASID_STATE_OFFSET;
> +
> +     DRM_ERROR("programmed PASID table, vaddr %p, busaddr 0x%16llx\n",
> +               pasid_table, tables->busaddr + PASID_OFFSET);
> +
> +     /* Context entry for gfx device */
> +     context[16].pat = 0x66666666;
> +     context[16].ere = 1;
> +     context[16].sre = 1;
> +     context[16].smep = 1;
> +     context[16].domain_id = 1;
> +     context[16].addr_width = AGAW_48; /* full x86 walk */
> +     context[16].pasid_en = 1;
> +     context[16].nesting_en = 0; /* not yet */
> +     context[16].pg_req_en = 1;
> +     context[16].lazy_invalidate_en = 1;
> +     context[16].ext_mem_type = EXTENDED_MTYPE_WB;
> +     context[16].translation_type = EXTENDED_TTYPE_UT_TR_PASID_PT;
> +     context[16].fault_disable = 0;
> +     context[16].present = 1;
> +     context[16].pasid_state_table_addr = (tables->busaddr + PASID_STATE_OFFSET) >> PAGE_SHIFT;
> +     context[16].pasid_table_addr = (tables->busaddr + PASID_OFFSET) >>
> +             PAGE_SHIFT;
> +     context[16].pasid_table_size = 0; /* 2^(5+x) */
> +
> +     tmp = (u64 *)&context[16];
> +     DRM_ERROR("root entry: 0x%016llx%016llx\n", tmp[1], tmp[0]);
> +
> +     DRM_ERROR("programmed context table, vaddr %p, busaddr 0x%16llx\n",
> +               context, tables->busaddr + CONTEXT_OFFSET);
> +
> +     /* Root table */
> +     root_table[0].lo_ctx_addr = (tables->busaddr + CONTEXT_OFFSET) >>
> +             PAGE_SHIFT;
> +     root_table[0].lo_present = 1;
> +     root_table[0].hi_present = 0;
> +
> +     tmp = (u64 *)&root_table[0];
> +     DRM_ERROR("root entry: 0x%016llx%016llx\n", tmp[1], tmp[0]);
> +
> +     dev_priv->svm.root_table = root_table;
> +     dev_priv->svm.context = context;
> +        dev_priv->svm.pasid_table = pasid_table;
> +     dev_priv->svm.pasid_state_table = pasid_state_table;
> +     dev_priv->svm.prq_ring = tables->vaddr + PRQ_OFFSET;
> +     dev_priv->svm.ivq_ring = tables->vaddr + IVQ_OFFSET;
> +
> +     /* Enable the page request queue */
> +     I915_WRITE64(SVM_PRQA, tables->busaddr + PRQ_OFFSET);
> +     I915_WRITE(SVM_PRQ_HEAD, 0);
> +     I915_WRITE(SVM_PRQ_TAIL, 0);
> +     I915_WRITE(SVM_PRECTL, 0);
> +
> +     /* Set up the invalidation request queue */
> +     I915_WRITE64(SVM_IQA, tables->busaddr + IVQ_OFFSET);
> +     I915_WRITE(SVM_IVQ_HEAD, 0);
> +     I915_WRITE(SVM_IVQ_TAIL, 0);
> +     I915_WRITE(SVM_IECTL, 0);
> +
> +     I915_WRITE(SVM_GCMD, GCMD_QIE);
> +     if (wait_for(I915_READ(SVM_GSTS) & GSTS_QIES, 500))
> +             DRM_ERROR("timed out waiting for queued invalidation enable\n");
> +
> +     /* All set, program the root */
> +     I915_WRITE(SVM_RTADDR, tables->busaddr | SVM_RTT_TYPE_EXT);
> +
> +     I915_WRITE(SVM_GCMD, GCMD_SRTP);
> +     if (wait_for(I915_READ(SVM_GSTS) & GSTS_RTPS, 500))
> +             DRM_ERROR("timed out waiting for root table to load\n");
> +
> +     DRM_ERROR("programmed SVM root, vaddr %p, busaddr 0x%16llx\n",
> +               tables->vaddr, tables->busaddr);
> +
> +     intel_iommu_tlb_flush(dev);
> +}
> +
> +/*
> + * Probe for SVM capability.  If found:
> + *  - try to switch to driver mode
> + *  - set up root PASID table
> + *  - enable page fault and error handling interrupts
> + *  - allow SVM ioctls
> + */
> +void intel_init_svm(struct drm_device *dev)
> +{
> +     struct drm_i915_private *dev_priv = dev->dev_private;
> +     drm_dma_handle_t *tables;
> +     u32 dev_mode;
> +     int num_tables = 6;
> +
> +     dev_mode = I915_READ(BDW_SVM_DEV_MODE_CNFG);
> +     I915_WRITE(BDW_SVM_DEV_MODE_CNFG, dev_mode | BDW_SVM_MODE_DRIVER);
> +     dev_mode = I915_READ(BDW_SVM_DEV_MODE_CNFG);
> +#if defined(CONFIG_INTEL_IOMMU) || defined(IOMMU_SUPPORT)
> +#error must disable IOMMU support
> +#endif
> +     if (!dev_mode & BDW_SVM_MODE_DRIVER) {
> +             DRM_ERROR("driver mode not available, disabling SVM\n");
> +             goto err;
> +     }
> +
> +     tables = drm_pci_alloc(dev, PAGE_SIZE*num_tables, PAGE_SIZE);
> +     if (!tables) {
> +             DRM_ERROR("table alloc failed, disabling SVM\n");
> +             goto err;
> +     }
> +
> +     memset(tables->vaddr, 0, PAGE_SIZE*num_tables);
> +
> +     intel_init_svm_root_table(dev, tables);
> +
> +     spin_lock_init(&dev_priv->svm.lock);
> +
> +#if 0
> +     I915_WRITE(SVM_GCMD, GCMD_TE);
> +     if (wait_for(I915_READ(SVM_GSTS) & GSTS_TES, 500))
> +             DRM_ERROR("timed out waiting for translation enable\n");
> +#endif
> +     INIT_WORK(&dev_priv->svm.work, intel_gpu_fault_work);
> +
> +     DRM_ERROR("SVM driver mode enabled\n");
> +     dev_priv->svm.svm_available = true;
> +     return;
> +
> +err:
> +     dev_priv->svm.svm_available = false;
> +     return;
> +}
> diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
> index 40cbba4..1450491 100644
> --- a/drivers/gpu/drm/i915/intel_lrc.c
> +++ b/drivers/gpu/drm/i915/intel_lrc.c
> @@ -217,6 +217,7 @@ enum {
>       FAULT_AND_STREAM,
>       FAULT_AND_CONTINUE /* Unsupported */
>  };
> +#define GEN8_CTX_FAULT_SHIFT 6
>  #define GEN8_CTX_ID_SHIFT 32
>  #define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT  0x17
>  
> @@ -289,12 +290,21 @@ uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
>       WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
>  
>       desc = GEN8_CTX_VALID;
> -     desc |= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
> -     if (IS_GEN8(ctx_obj->base.dev))
> -             desc |= GEN8_CTX_L3LLC_COHERENT;
> -     desc |= GEN8_CTX_PRIVILEGE;
> -     desc |= lrca;
> -     desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
> +     if (ctx->is_svm) {
> +             desc |= ADVANCED_CONTEXT << GEN8_CTX_ADDRESSING_MODE_SHIFT;
> +             desc |= FAULT_AND_STREAM << GEN8_CTX_FAULT_SHIFT;
> +             desc |= lrca;
> +             desc |= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
> +     } else {
> +             desc |= GEN8_CTX_ADDRESSING_MODE(dev) <<
> +                     GEN8_CTX_ADDRESSING_MODE_SHIFT;
> +             if (IS_GEN8(ctx_obj->base.dev))
> +                     desc |= GEN8_CTX_L3LLC_COHERENT;
> +             desc |= GEN8_CTX_PRIVILEGE;
> +             desc |= lrca;
> +             desc |= (u64)intel_execlists_ctx_id(ctx_obj) <<
> +                     GEN8_CTX_ID_SHIFT;
> +     }
>  
>       /* TODO: WaDisableLiteRestore when we start using semaphore
>        * signalling between Command Streamers */
> @@ -545,7 +555,7 @@ void intel_lrc_irq_handler(struct intel_engine_cs *ring)
>                  _MASKED_FIELD(0x07 << 8, ((u32)ring->next_context_status_buffer & 0x07) << 8));
>  }
>  
> -static int execlists_context_queue(struct drm_i915_gem_request *request)
> +int execlists_context_queue(struct drm_i915_gem_request *request)
>  {
>       struct intel_engine_cs *ring = request->ring;
>       struct drm_i915_gem_request *cursor;
> @@ -2273,31 +2283,40 @@ populate_lr_context(struct intel_context *ctx, struct drm_i915_gem_object *ctx_o
>       reg_state[CTX_LRI_HEADER_1] |= MI_LRI_FORCE_POSTED;
>       reg_state[CTX_CTX_TIMESTAMP] = ring->mmio_base + 0x3a8;
>       reg_state[CTX_CTX_TIMESTAMP+1] = 0;
> -     reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
> -     reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
> -     reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
> -     reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
> -     reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
> -     reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
> -     reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
> -     reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
> -
> -     if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
> -             /* 64b PPGTT (48bit canonical)
> -              * PDP0_DESCRIPTOR contains the base address to PML4 and
> -              * other PDP Descriptors are ignored.
> -              */
> -             ASSIGN_CTX_PML4(ppgtt, reg_state);
> +
> +     if (ctx->is_svm) {
> +             reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
> +             reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
> +             reg_state[CTX_PDP0_UDW+1] = 0;
> +             reg_state[CTX_PDP0_LDW+1] = ctx->pasid;
>       } else {
> -             /* 32b PPGTT
> -              * PDP*_DESCRIPTOR contains the base address of space supported.
> -              * With dynamic page allocation, PDPs may not be allocated at
> -              * this point. Point the unallocated PDPs to the scratch page
> -              */
> -             ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> -             ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> -             ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> -             ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> +             reg_state[CTX_PDP3_UDW] = GEN8_RING_PDP_UDW(ring, 3);
> +             reg_state[CTX_PDP3_LDW] = GEN8_RING_PDP_LDW(ring, 3);
> +             reg_state[CTX_PDP2_UDW] = GEN8_RING_PDP_UDW(ring, 2);
> +             reg_state[CTX_PDP2_LDW] = GEN8_RING_PDP_LDW(ring, 2);
> +             reg_state[CTX_PDP1_UDW] = GEN8_RING_PDP_UDW(ring, 1);
> +             reg_state[CTX_PDP1_LDW] = GEN8_RING_PDP_LDW(ring, 1);
> +             reg_state[CTX_PDP0_UDW] = GEN8_RING_PDP_UDW(ring, 0);
> +             reg_state[CTX_PDP0_LDW] = GEN8_RING_PDP_LDW(ring, 0);
> +
> +             if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
> +                     /* 64b PPGTT (48bit canonical)
> +                      * PDP0_DESCRIPTOR contains the base address to PML4 and
> +                      * other PDP Descriptors are ignored.
> +                      */
> +                     ASSIGN_CTX_PML4(ppgtt, reg_state);
> +             } else {
> +                     /* 32b PPGTT
> +                      * PDP*_DESCRIPTOR contains the base address of space
> +                      * supported. With dynamic page allocation, PDPs may
> +                      * not be allocated at this point. Point the
> +                      * unallocated PDPs to the scratch page
> +                      */
> +                     ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
> +                     ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
> +                     ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
> +                     ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
> +             }
>       }
>  
>       if (ring->id == RCS) {
> @@ -2327,6 +2346,12 @@ void intel_lr_context_free(struct intel_context *ctx)
>  {
>       int i;
>  
> +        if (ctx->is_svm) {
> +                intel_free_pasid(ctx->ims->dev_priv->dev, ctx);
> +                intel_unbind_mm(ctx);
> +             put_task_struct(ctx->tsk);
> +       }
> +
>       for (i = 0; i < I915_NUM_RINGS; i++) {
>               struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
>  
> @@ -2480,6 +2505,37 @@ int intel_lr_context_deferred_create(struct intel_context *ctx,
>  
>       }
>  
> +     if (ctx->is_svm) {
> +             /* FIXME: just skip here, don't bail and trash the ctx */
> +             if (ring->id != RCS) {
> +                     DRM_DEBUG_DRIVER("svm context only allowed on RCS\n");

That's fairly useless then :)
-Chris

-- 
Chris Wilson, Intel Open Source Technology Centre
_______________________________________________
Intel-gfx mailing list
[email protected]
http://lists.freedesktop.org/mailman/listinfo/intel-gfx
