[PATCH RFC 3/3] x86/sgx: Implement EAUG population with MAP_POPULATE
With SGX1 an enclave needs to be created with its maximum memory demands pre-allocated. Pages cannot be added to an enclave after it is initialized. SGX2 introduces a new function, ENCLS[EAUG] for adding pages to an initialized enclave. Add support for dynamically adding pages to an initialized enclave with mmap() by populating pages with EAUG. Use f_ops->populate() callback to achieve this behaviour. Signed-off-by: Jarkko Sakkinen --- arch/x86/kernel/cpu/sgx/driver.c | 129 +++ 1 file changed, 129 insertions(+) diff --git a/arch/x86/kernel/cpu/sgx/driver.c b/arch/x86/kernel/cpu/sgx/driver.c index aa9b8b868867..0e97e7476076 100644 --- a/arch/x86/kernel/cpu/sgx/driver.c +++ b/arch/x86/kernel/cpu/sgx/driver.c @@ -9,6 +9,7 @@ #include #include "driver.h" #include "encl.h" +#include "encls.h" u64 sgx_attributes_reserved_mask; u64 sgx_xfrm_reserved_mask = ~0x3; @@ -101,6 +102,133 @@ static int sgx_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int sgx_encl_augment_page(struct sgx_encl *encl, unsigned long offset) +{ + struct sgx_pageinfo pginfo = {0}; + struct sgx_encl_page *encl_page; + struct sgx_epc_page *epc_page; + struct sgx_va_page *va_page; + u64 secinfo_flags; + int ret; + + /* +* Ignore internal permission checking for dynamically added pages. +* They matter only for data added during the pre-initialization phase. +* The enclave decides the permissions by the means of EACCEPT, +* EACCEPTCOPY and EMODPE. +*/ + secinfo_flags = SGX_SECINFO_R | SGX_SECINFO_W | SGX_SECINFO_X; + encl_page = sgx_encl_page_alloc(encl, offset, secinfo_flags); + if (IS_ERR(encl_page)) + return PTR_ERR(encl_page); + + epc_page = sgx_alloc_epc_page(encl_page, true); + if (IS_ERR(epc_page)) { + ret = PTR_ERR(epc_page); + goto err_alloc_epc_page; + } + + va_page = sgx_encl_grow(encl); + if (IS_ERR(va_page)) { + ret = PTR_ERR(va_page); + goto err_grow; + } + + mutex_lock(>lock); + + /* +* Adding to encl->va_pages must be done under encl->lock. 
Ditto for +* deleting (via sgx_encl_shrink()) in the error path. +*/ + if (va_page) + list_add(&va_page->list, &encl->va_pages); + + /* +* Insert prior to EADD in case of OOM. EADD modifies MRENCLAVE, i.e. +* can't be gracefully unwound, while failure on EADD/EXTEND is limited +* to userspace errors (or kernel/hardware bugs). +*/ + ret = xa_insert(&encl->page_array, PFN_DOWN(encl_page->desc), + encl_page, GFP_KERNEL); + + /* +* If ret == -EBUSY then page was created in another flow while +* running without encl->lock +*/ + if (ret) + goto err_xa_insert; + + pginfo.secs = (unsigned long)sgx_get_epc_virt_addr(encl->secs.epc_page); + pginfo.addr = encl_page->desc & PAGE_MASK; + pginfo.metadata = 0; + + ret = __eaug(&pginfo, sgx_get_epc_virt_addr(epc_page)); + if (ret) + goto err_eaug; + + encl_page->encl = encl; + encl_page->epc_page = epc_page; + encl_page->type = SGX_PAGE_TYPE_REG; + encl->secs_child_cnt++; + + sgx_mark_page_reclaimable(encl_page->epc_page); + + mutex_unlock(&encl->lock); + + return 0; + +err_eaug: + xa_erase(&encl->page_array, PFN_DOWN(encl_page->desc)); + +err_xa_insert: + sgx_encl_shrink(encl, va_page); + mutex_unlock(&encl->lock); + +err_grow: + sgx_encl_free_epc_page(epc_page); + +err_alloc_epc_page: + kfree(encl_page); + + return VM_FAULT_SIGBUS; +} + +/* + * Add new pages to the enclave sequentially with ENCLS[EAUG]. Note that + * sgx_mmap() validates that the given VMA is within the enclave range. Calling + * here sgx_encl_may_map() second time would be too time consuming. + */ +static int sgx_populate(struct file *file, struct vm_area_struct *vma) +{ + unsigned long length = vma->vm_end - vma->vm_start; + struct sgx_encl *encl = file->private_data; + unsigned long start = encl->base - vma->vm_start; + unsigned long pos; + int ret; + + /* EAUG works only for initialized enclaves.
*/ + if (!test_bit(SGX_ENCL_INITIALIZED, &encl->flags)) + return -EINVAL; + + for (pos = 0 ; pos < length; pos += PAGE_SIZE) { + if (signal_pending(current)) { + if (!pos) + ret = -ERESTARTSYS; + + break; + } + + if (need_resched()) + cond_resched(); + + ret = sgx_encl_augment_page(encl, start + pos); + if (ret) + break; + } + + return ret; +} + static unsigned long sgx_get_unmapped_area(struct file *file,
[PATCH RFC 2/3] x86/sgx: Export sgx_encl_page_alloc()
Move sgx_encl_page_alloc() to encl.c and export it so that it can be used in the implementation for MAP_POPULATE, which requires to allocate new enclave pages. Signed-off-by: Jarkko Sakkinen --- arch/x86/kernel/cpu/sgx/encl.c | 38 + arch/x86/kernel/cpu/sgx/encl.h | 3 +++ arch/x86/kernel/cpu/sgx/ioctl.c | 38 - 3 files changed, 41 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 89aeed798ffb..79e39bd99c09 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -914,6 +914,44 @@ int sgx_encl_test_and_clear_young(struct mm_struct *mm, return ret; } +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags) +{ + struct sgx_encl_page *encl_page; + unsigned long prot; + + encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); + if (!encl_page) + return ERR_PTR(-ENOMEM); + + encl_page->desc = encl->base + offset; + encl_page->encl = encl; + + prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | + _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); + + /* +* TCS pages must always RW set for CPU access while the SECINFO +* permissions are *always* zero - the CPU ignores the user provided +* values and silently overwrites them with zero permissions. +*/ + if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) + prot |= PROT_READ | PROT_WRITE; + + /* Calculate maximum of the VM flags for the page. */ + encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); + + /* +* At time of allocation, the runtime protection bits are the same +* as the maximum protection bits. 
+*/ + encl_page->vm_run_prot_bits = encl_page->vm_max_prot_bits; + + return encl_page; +} + /** * sgx_zap_enclave_ptes() - remove PTEs mapping the address from enclave * @encl: the enclave diff --git a/arch/x86/kernel/cpu/sgx/encl.h b/arch/x86/kernel/cpu/sgx/encl.h index 1b6ce1da7c92..3df0d3faf3a1 100644 --- a/arch/x86/kernel/cpu/sgx/encl.h +++ b/arch/x86/kernel/cpu/sgx/encl.h @@ -113,6 +113,9 @@ int sgx_encl_get_backing(struct sgx_encl *encl, unsigned long page_index, void sgx_encl_put_backing(struct sgx_backing *backing, bool do_write); int sgx_encl_test_and_clear_young(struct mm_struct *mm, struct sgx_encl_page *page); +struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, + unsigned long offset, + u64 secinfo_flags); void sgx_zap_enclave_ptes(struct sgx_encl *encl, unsigned long addr); struct sgx_epc_page *sgx_alloc_va_page(void); unsigned int sgx_alloc_va_slot(struct sgx_va_page *va_page); diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index d8c3c07badb3..3e3ca27a6f72 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -169,44 +169,6 @@ static long sgx_ioc_enclave_create(struct sgx_encl *encl, void __user *arg) return ret; } -static struct sgx_encl_page *sgx_encl_page_alloc(struct sgx_encl *encl, -unsigned long offset, -u64 secinfo_flags) -{ - struct sgx_encl_page *encl_page; - unsigned long prot; - - encl_page = kzalloc(sizeof(*encl_page), GFP_KERNEL); - if (!encl_page) - return ERR_PTR(-ENOMEM); - - encl_page->desc = encl->base + offset; - encl_page->encl = encl; - - prot = _calc_vm_trans(secinfo_flags, SGX_SECINFO_R, PROT_READ) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_W, PROT_WRITE) | - _calc_vm_trans(secinfo_flags, SGX_SECINFO_X, PROT_EXEC); - - /* -* TCS pages must always RW set for CPU access while the SECINFO -* permissions are *always* zero - the CPU ignores the user provided -* values and silently overwrites them with zero permissions. 
-*/ - if ((secinfo_flags & SGX_SECINFO_PAGE_TYPE_MASK) == SGX_SECINFO_TCS) - prot |= PROT_READ | PROT_WRITE; - - /* Calculate maximum of the VM flags for the page. */ - encl_page->vm_max_prot_bits = calc_vm_prot_bits(prot, 0); - - /* -* At time of allocation, the runtime protection bits are the same -* as the maximum protection bits. -*/ - encl_page->vm_run_prot_bits = encl_page->vm_max_prot_bits; - - return encl_page; -} - static int sgx_validate_secinfo(struct sgx_secinfo *secinfo) { u64 perm = secinfo->flags &
[PATCH RFC 1/3] mm: Add f_ops->populate()
Sometimes you might want to use MAP_POPULATE to ask a device driver to initialize the device memory in some specific manner. SGX driver can use this to request more memory by issuing ENCLS[EAUG] x86 opcode for each page in the address range. Add f_ops->populate() with the same parameters as f_ops->mmap() and make it conditionally called inside call_mmap(). Update call sites accodingly. --- Signed-off-by: Jarkko Sakkinen v3: - if (!ret && do_populate && file->f_op->populate) + if (!ret && do_populate && file->f_op->populate && + !!(vma->vm_flags & (VM_IO | VM_PFNMAP))) (reported by Matthew Wilcox) v2: - if (!ret && do_populate) + if (!ret && do_populate && file->f_op->populate) (reported by Jan Harkes) --- arch/mips/kernel/vdso.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 2 +- fs/coda/file.c | 2 +- fs/overlayfs/file.c| 2 +- include/linux/fs.h | 12 ++-- include/linux/mm.h | 2 +- ipc/shm.c | 2 +- mm/mmap.c | 10 +- mm/nommu.c | 4 ++-- 9 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 3d0cf471f2fe..89f3f3da9abd 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL); + 0, NULL, false); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 1b526039a60d..4c71f64d6a79 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -107,7 +107,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (!obj->base.filp) return -ENODEV; - ret = call_mmap(obj->base.filp, vma); + ret = call_mmap(obj->base.filp, vma, false); if (ret) return ret; diff --git a/fs/coda/file.c b/fs/coda/file.c index 
29dd87be2fb8..e14f312fdbf8 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -173,7 +173,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) spin_unlock(>c_lock); vma->vm_file = get_file(host_file); - ret = call_mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma, false); if (ret) { /* if call_mmap fails, our caller will put host_file so we diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..b963a9397e80 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -503,7 +503,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma) vma_set_file(vma, realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma, false); revert_creds(old_cred); ovl_file_accessed(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..2909e2d14af8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -1993,6 +1994,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + int (*populate)(struct file *, struct vm_area_struct *); unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); @@ -2074,9 +2076,15 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, return file->f_op->write_iter(kio, iter); } -static inline int call_mmap(struct file *file, struct vm_area_struct *vma) +static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate) { - return file->f_op->mmap(file, vma); + int ret = file->f_op->mmap(file, vma); + + if (!ret && do_populate && file->f_op->populate && + !!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + ret = file->f_op->populate(file, vma); + + 
return ret; } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..6c8c036f423b 100644 --- a/include/linux/mm.h +++
[PATCH RFC 0/3] MAP_POPULATE for device memory
For device memory (aka VM_IO | VM_PFNMAP) MAP_POPULATE does nothing. Allow to use that for initializing the device memory by providing a new callback f_ops->populate() for the purpose. SGX patches are provided to show the callback in context. An obvious alternative is a ioctl but it is less elegant and requires two syscalls (mmap + ioctl) per memory range, instead of just one (mmap). Jarkko Sakkinen (3): mm: Add f_ops->populate() x86/sgx: Export sgx_encl_page_alloc() x86/sgx: Implement EAUG population with MAP_POPULATE arch/mips/kernel/vdso.c| 2 +- arch/x86/kernel/cpu/sgx/driver.c | 129 + arch/x86/kernel/cpu/sgx/encl.c | 38 ++ arch/x86/kernel/cpu/sgx/encl.h | 3 + arch/x86/kernel/cpu/sgx/ioctl.c| 38 -- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 2 +- fs/coda/file.c | 2 +- fs/overlayfs/file.c| 2 +- include/linux/fs.h | 12 +- include/linux/mm.h | 2 +- ipc/shm.c | 2 +- mm/mmap.c | 10 +- mm/nommu.c | 4 +- 13 files changed, 193 insertions(+), 53 deletions(-) -- 2.35.1
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 06:25:52AM +0200, Jarkko Sakkinen wrote: > > Are you deliberately avoiding the question? I'm not asking about > > implementation. I'm asking about the semantics of MAP_POPULATE with > > your driver. > > No. I just noticed a bug in the guard from your comment that I wanted > point out. > > With the next version I post the corresponding change to the driver, > in order to see this in context. Oh, good grief. Don't bother. NAK.
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 04:19:26AM +, Matthew Wilcox wrote: > On Sun, Mar 06, 2022 at 06:11:21AM +0200, Jarkko Sakkinen wrote: > > On Sun, Mar 06, 2022 at 03:52:12AM +, Matthew Wilcox wrote: > > > On Sun, Mar 06, 2022 at 05:21:11AM +0200, Jarkko Sakkinen wrote: > > > > On Sun, Mar 06, 2022 at 02:57:55AM +, Matthew Wilcox wrote: > > > > > On Sun, Mar 06, 2022 at 04:15:33AM +0200, Jarkko Sakkinen wrote: > > > > > > Sometimes you might want to use MAP_POPULATE to ask a device driver > > > > > > to > > > > > > initialize the device memory in some specific manner. SGX driver > > > > > > can use > > > > > > this to request more memory by issuing ENCLS[EAUG] x86 opcode for > > > > > > each > > > > > > page in the address range. > > > > > > > > > > > > Add f_ops->populate() with the same parameters as f_ops->mmap() and > > > > > > make > > > > > > it conditionally called inside call_mmap(). Update call sites > > > > > > accodingly. > > > > > > > > > > Your device driver has a ->mmap operation. Why does it need another > > > > > one? More explanation required here. > > > > > > > > f_ops->mmap() would require an additional parameter, which results > > > > heavy refactoring. > > > > > > > > struct file_operations has 1125 references in the kernel tree, so I > > > > decided to check this way around first. > > > > > > Are you saying that your device driver behaves differently if > > > MAP_POPULATE is set versus if it isn't? That seems hideously broken. > > > > MAP_POPULATE does not do anything (according to __mm_populate in mm/gup.c) > > with VMA's that have some sort of device/IO memory, i.e. vm_flags > > intersecting with VM_PFNMAP | VM_IO. > > > > I can extend the guard obviously to: > > > > if (!ret && do_populate && file->f_op->populate && > > !!(vma->vm_flags & (VM_IO | VM_PFNMAP))) > > file->f_op->populate(file, vma); > > Are you deliberately avoiding the question? I'm not asking about > implementation. I'm asking about the semantics of MAP_POPULATE with > your driver. 
No. I just noticed a bug in the guard from your comment that I wanted to point out. With the next version I post the corresponding change to the driver, in order to see this in context. BR, Jarkko
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 06:11:21AM +0200, Jarkko Sakkinen wrote: > On Sun, Mar 06, 2022 at 03:52:12AM +, Matthew Wilcox wrote: > > On Sun, Mar 06, 2022 at 05:21:11AM +0200, Jarkko Sakkinen wrote: > > > On Sun, Mar 06, 2022 at 02:57:55AM +, Matthew Wilcox wrote: > > > > On Sun, Mar 06, 2022 at 04:15:33AM +0200, Jarkko Sakkinen wrote: > > > > > Sometimes you might want to use MAP_POPULATE to ask a device driver to > > > > > initialize the device memory in some specific manner. SGX driver can > > > > > use > > > > > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each > > > > > page in the address range. > > > > > > > > > > Add f_ops->populate() with the same parameters as f_ops->mmap() and > > > > > make > > > > > it conditionally called inside call_mmap(). Update call sites > > > > > accodingly. > > > > > > > > Your device driver has a ->mmap operation. Why does it need another > > > > one? More explanation required here. > > > > > > f_ops->mmap() would require an additional parameter, which results > > > heavy refactoring. > > > > > > struct file_operations has 1125 references in the kernel tree, so I > > > decided to check this way around first. > > > > Are you saying that your device driver behaves differently if > > MAP_POPULATE is set versus if it isn't? That seems hideously broken. > > MAP_POPULATE does not do anything (according to __mm_populate in mm/gup.c) > with VMA's that have some sort of device/IO memory, i.e. vm_flags > intersecting with VM_PFNMAP | VM_IO. > > I can extend the guard obviously to: > > if (!ret && do_populate && file->f_op->populate && > !!(vma->vm_flags & (VM_IO | VM_PFNMAP))) > file->f_op->populate(file, vma); Are you deliberately avoiding the question? I'm not asking about implementation. I'm asking about the semantics of MAP_POPULATE with your driver.
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 03:52:12AM +, Matthew Wilcox wrote: > On Sun, Mar 06, 2022 at 05:21:11AM +0200, Jarkko Sakkinen wrote: > > On Sun, Mar 06, 2022 at 02:57:55AM +, Matthew Wilcox wrote: > > > On Sun, Mar 06, 2022 at 04:15:33AM +0200, Jarkko Sakkinen wrote: > > > > Sometimes you might want to use MAP_POPULATE to ask a device driver to > > > > initialize the device memory in some specific manner. SGX driver can use > > > > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each > > > > page in the address range. > > > > > > > > Add f_ops->populate() with the same parameters as f_ops->mmap() and make > > > > it conditionally called inside call_mmap(). Update call sites > > > > accodingly. > > > > > > Your device driver has a ->mmap operation. Why does it need another > > > one? More explanation required here. > > > > f_ops->mmap() would require an additional parameter, which results > > heavy refactoring. > > > > struct file_operations has 1125 references in the kernel tree, so I > > decided to check this way around first. > > Are you saying that your device driver behaves differently if > MAP_POPULATE is set versus if it isn't? That seems hideously broken. MAP_POPULATE does not do anything (according to __mm_populate in mm/gup.c) with VMA's that have some sort of device/IO memory, i.e. vm_flags intersecting with VM_PFNMAP | VM_IO. I can extend the guard obviously to: if (!ret && do_populate && file->f_op->populate && !!(vma->vm_flags & (VM_IO | VM_PFNMAP))) file->f_op->populate(file, vma); BR, Jarkko
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 05:21:11AM +0200, Jarkko Sakkinen wrote: > On Sun, Mar 06, 2022 at 02:57:55AM +, Matthew Wilcox wrote: > > On Sun, Mar 06, 2022 at 04:15:33AM +0200, Jarkko Sakkinen wrote: > > > Sometimes you might want to use MAP_POPULATE to ask a device driver to > > > initialize the device memory in some specific manner. SGX driver can use > > > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each > > > page in the address range. > > > > > > Add f_ops->populate() with the same parameters as f_ops->mmap() and make > > > it conditionally called inside call_mmap(). Update call sites > > > accodingly. > > > > Your device driver has a ->mmap operation. Why does it need another > > one? More explanation required here. > > f_ops->mmap() would require an additional parameter, which results > heavy refactoring. > > struct file_operations has 1125 references in the kernel tree, so I > decided to check this way around first. Are you saying that your device driver behaves differently if MAP_POPULATE is set versus if it isn't? That seems hideously broken.
Re: Report 2 in ext4 and journal based on v5.17-rc1
On Sat, Mar 05, 2022 at 11:55:34PM +0900, Byungchul Park wrote: > > that is why some of the DEPT reports were completely incomprehensible > > It's because you are blinded to blame at it without understanding how > Dept works at all. I will fix those that must be fixed. Don't worry. Users of DEPT must not have to understand how DEPT works in order to understand and use DEPT reports. If you think I don't understand how DEPT work, I'm going to gently suggest that this means DEPT reports are clear enough, and/or DEPT documentation needs to be *substantially* improved, or both --- and these needs to happen before DEPT is ready to be merged. > > So if DEPT is issuing lots of reports about apparently circular > > dependencies, please try to be open to the thought that the fault is > > No one was convinced that Dept doesn't have a fault. I think your > worries are too much. In that case, may I ask that you add back a RFC to the subject prefix (e.g., [PATCH RFC -v5]?) Or maybe even add the subject prefix NOT YET READY? I have seen cases when after a patch series get to PATCH -v22, and then people assume that it *must* be ready, as opposed what it really means, which is "the author is just persistently reposting and rebasing the patch series over and over again". It would be helpful if you directly acknowledge, in each patch submission, that it is not yet ready for prime time. After all, right now, DEPT has generated two reports in ext4, both of which were false positives, and both of which have required a lot of maintainer times to prove to you that they were in fact false positives. So are we all agreed that DEPT is not ready for prime time? > No one argued that their code must be buggy, either. So I don't think > you have to worry about what's never happened. 
Well, you kept on insisting that ext4 must have a circular dependency, and that depending on a "rescue wakeup" is bad programming practice, but you'll reluctantly agree to make DEPT accept "rescue wakeups" if that is the will of the developers. My concern here is the fundamental concept of "rescue wakeups" is wrong; I don't see how you can distinguish between a valid wakeup and one that you and DEPT is going to somehow characterize as dodgy. Consider: a process can first subscribe to multiple wait queues, and arrange to be woken up by a timeout, and then call schedule() to go to sleep. So it is not waiting on a single wait channel, but potentially *multiple* wakeup sources. If you are going to prove that kernel is not going to make forward progress, you need to prove that *all* ways that process might not wake up aren't going to happen for some reason. Just because one wakeup source seems to form a circular dependency proves nothing, since another wakeup source might be the designed and architected way that code makes forward progress. You seem to be assuming that one wakeup source is somehow the "correct" one, and the other ways that process could be woken up is a "rescue wakeup source" and you seem to believe that relying on a "rescue wakeup source" is bad. But in the case of a process which has called prepare-to-wait on more than one wait queue, how is DEPT going to distinguish between your "morally correct" wakeup source, and the "rescue wakeup source"? > No doubt. I already think so. But it doesn't mean that I have to keep > quiet without discussing to imporve Dept. I will keep improving Dept in > a reasonable way. Well, I don't want to be in a position of having to prove that every single DEPT report in my code that doesn't make sense to me is nonsense, or else DEPT will get merged. So maybe we need to reverse the burden of proof. 
Instead of just sending a DEPT report, and then asking the maintainers to explain why it is a false positive, how about if *you* use the DEPT report to examine the subsystem code, and then explain in plain English, how you think this could trigger in real life, or cause a performance problem in real life or perhaps provide a script or C reproducer that triggers the supposed deadlock? Yes, that means you will need to understand the code in question, but hopefully the DEPT reports should be clear enough that someone who isn't a deep expert in the code should be able to spot the bug. If not, and if only a few deep experts of the code in question will be able to decipher the DEPT report and figure out a fix, that's really not ideal. If DEPT can find a real bug and you can show that Lockdep wouldn't have been able to find it, then that would be proof that it is making a real contribution. That would be a real benefit. At the same time, DEPT will hopefully be able to demonstrate a false positive rate which is low enough that the benefits clearly outweigh the costs. At the moment, I believe the scoreboard for DEPT with respect to ext4 is zero real bugs found, and two false positives, both of which have required significant rounds of e-mail before the subsystem maintainers were able to prove to you that it
[PATCH RFC v2] mm: Add f_ops->populate()
Sometimes you might want to use MAP_POPULATE to ask a device driver to initialize the device memory in some specific manner. SGX driver can use this to request more memory by issuing ENCLS[EAUG] x86 opcode for each page in the address range. Add f_ops->populate() with the same parameters as f_ops->mmap() and make it conditionally called inside call_mmap(). Update call sites accodingly. Signed-off-by: Jarkko Sakkinen --- v2: - if (!ret && do_populate) + if (!ret && do_populate && file->f_op->populate) (reported by Jan Harkes) --- arch/mips/kernel/vdso.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 2 +- fs/coda/file.c | 2 +- fs/overlayfs/file.c| 2 +- include/linux/fs.h | 10 -- include/linux/mm.h | 2 +- ipc/shm.c | 2 +- mm/mmap.c | 10 +- mm/nommu.c | 4 ++-- 9 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 3d0cf471f2fe..89f3f3da9abd 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL); + 0, NULL, false); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 1b526039a60d..4c71f64d6a79 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -107,7 +107,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (!obj->base.filp) return -ENODEV; - ret = call_mmap(obj->base.filp, vma); + ret = call_mmap(obj->base.filp, vma, false); if (ret) return ret; diff --git a/fs/coda/file.c b/fs/coda/file.c index 29dd87be2fb8..e14f312fdbf8 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -173,7 +173,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) spin_unlock(>c_lock); vma->vm_file 
= get_file(host_file); - ret = call_mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma, false); if (ret) { /* if call_mmap fails, our caller will put host_file so we diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..b963a9397e80 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -503,7 +503,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma) vma_set_file(vma, realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma, false); revert_creds(old_cred); ovl_file_accessed(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..4c6a3339373d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1993,6 +1993,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + int (*populate)(struct file *, struct vm_area_struct *); unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); @@ -2074,9 +2075,14 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, return file->f_op->write_iter(kio, iter); } -static inline int call_mmap(struct file *file, struct vm_area_struct *vma) +static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate) { - return file->f_op->mmap(file, vma); + int ret = file->f_op->mmap(file, vma); + + if (!ret && do_populate && file->f_op->populate) + ret = file->f_op->populate(file, vma); + + return ret; } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..6c8c036f423b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2683,7 +2683,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned 
long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf); + struct list_head *uf, bool do_populate);
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 02:57:55AM +, Matthew Wilcox wrote: > On Sun, Mar 06, 2022 at 04:15:33AM +0200, Jarkko Sakkinen wrote: > > Sometimes you might want to use MAP_POPULATE to ask a device driver to > > initialize the device memory in some specific manner. SGX driver can use > > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each > > page in the address range. > > > > Add f_ops->populate() with the same parameters as f_ops->mmap() and make > > it conditionally called inside call_mmap(). Update call sites > > accodingly. > > Your device driver has a ->mmap operation. Why does it need another > one? More explanation required here. f_ops->mmap() would require an additional parameter, which results heavy refactoring. struct file_operations has 1125 references in the kernel tree, so I decided to check this way around first. BR, Jarkko
Re: [PATCH RFC] mm: Add f_ops->populate()
On Sun, Mar 06, 2022 at 04:15:33AM +0200, Jarkko Sakkinen wrote: > Sometimes you might want to use MAP_POPULATE to ask a device driver to > initialize the device memory in some specific manner. SGX driver can use > this to request more memory by issuing ENCLS[EAUG] x86 opcode for each > page in the address range. > > Add f_ops->populate() with the same parameters as f_ops->mmap() and make > it conditionally called inside call_mmap(). Update call sites > accodingly. Your device driver has a ->mmap operation. Why does it need another one? More explanation required here.
[PATCH RFC] mm: Add f_ops->populate()
Sometimes you might want to use MAP_POPULATE to ask a device driver to initialize the device memory in some specific manner. SGX driver can use this to request more memory by issuing ENCLS[EAUG] x86 opcode for each page in the address range. Add f_ops->populate() with the same parameters as f_ops->mmap() and make it conditionally called inside call_mmap(). Update call sites accodingly. Signed-off-by: Jarkko Sakkinen --- arch/mips/kernel/vdso.c| 2 +- drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c | 2 +- fs/coda/file.c | 2 +- fs/overlayfs/file.c| 2 +- include/linux/fs.h | 10 -- include/linux/mm.h | 2 +- ipc/shm.c | 2 +- mm/mmap.c | 10 +- mm/nommu.c | 4 ++-- 9 files changed, 21 insertions(+), 15 deletions(-) diff --git a/arch/mips/kernel/vdso.c b/arch/mips/kernel/vdso.c index 3d0cf471f2fe..89f3f3da9abd 100644 --- a/arch/mips/kernel/vdso.c +++ b/arch/mips/kernel/vdso.c @@ -102,7 +102,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) base = mmap_region(NULL, STACK_TOP, PAGE_SIZE, VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC, - 0, NULL); + 0, NULL, false); if (IS_ERR_VALUE(base)) { ret = base; goto out; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c index 1b526039a60d..4c71f64d6a79 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_dmabuf.c @@ -107,7 +107,7 @@ static int i915_gem_dmabuf_mmap(struct dma_buf *dma_buf, struct vm_area_struct * if (!obj->base.filp) return -ENODEV; - ret = call_mmap(obj->base.filp, vma); + ret = call_mmap(obj->base.filp, vma, false); if (ret) return ret; diff --git a/fs/coda/file.c b/fs/coda/file.c index 29dd87be2fb8..e14f312fdbf8 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -173,7 +173,7 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) spin_unlock(>c_lock); vma->vm_file = get_file(host_file); - ret = call_mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma, false); if 
(ret) { /* if call_mmap fails, our caller will put host_file so we diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index fa125feed0ff..b963a9397e80 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -503,7 +503,7 @@ static int ovl_mmap(struct file *file, struct vm_area_struct *vma) vma_set_file(vma, realfile); old_cred = ovl_override_creds(file_inode(file)->i_sb); - ret = call_mmap(vma->vm_file, vma); + ret = call_mmap(vma->vm_file, vma, false); revert_creds(old_cred); ovl_file_accessed(file); diff --git a/include/linux/fs.h b/include/linux/fs.h index e2d892b201b0..fb90284e1c82 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1993,6 +1993,7 @@ struct file_operations { long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long); int (*mmap) (struct file *, struct vm_area_struct *); + int (*populate)(struct file *, struct vm_area_struct *); unsigned long mmap_supported_flags; int (*open) (struct inode *, struct file *); int (*flush) (struct file *, fl_owner_t id); @@ -2074,9 +2075,14 @@ static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio, return file->f_op->write_iter(kio, iter); } -static inline int call_mmap(struct file *file, struct vm_area_struct *vma) +static inline int call_mmap(struct file *file, struct vm_area_struct *vma, bool do_populate) { - return file->f_op->mmap(file, vma); + int ret = file->f_op->mmap(file, vma); + + if (!ret && do_populate) + ret = file->f_op->populate(file, vma); + + return ret; } extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 213cc569b192..6c8c036f423b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2683,7 +2683,7 @@ extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned lo extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, 
unsigned long pgoff, - struct list_head *uf); + struct list_head *uf, bool do_populate); extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags,
[CI 2/2] drm/i915/gem: Don't try to map and fence large scanout buffers (v9)
On platforms capable of allowing 8K (7680 x 4320) modes, pinning 2 or more framebuffers/scanout buffers results in only one that is mappable/ fenceable. Therefore, pageflipping between these 2 FBs where only one is mappable/fenceable creates latencies large enough to miss alternate vblanks thereby producing less optimal framerate. This mainly happens because when i915_gem_object_pin_to_display_plane() is called to pin one of the FB objs, the associated vma is identified as misplaced and therefore i915_vma_unbind() is called which unbinds and evicts it. This misplaced vma gets subsequently pinned only when i915_gem_object_ggtt_pin_ww() is called without PIN_MAPPABLE. This results in a latency of ~10ms and happens every other vblank/repaint cycle. Therefore, to fix this issue, we try to see if there is space to map at-least two objects of a given size and return early if there isn't. This would ensure that we do not try with PIN_MAPPABLE for any objects that are too big to map thereby preventing unnecessary unbind. Testcase: Running Weston and weston-simple-egl on an Alderlake_S (ADLS) platform with a 8K@60 mode results in only ~40 FPS. Since upstream Weston submits a frame ~7ms before the next vblank, the latencies seen between atomic commit and flip event are 7, 24 (7 + 16.66), 7, 24. suggesting that it misses the vblank every other frame. 
Here is the ftrace snippet that shows the source of the ~10ms latency: i915_gem_object_pin_to_display_plane() { 0.102 us |i915_gem_object_set_cache_level(); i915_gem_object_ggtt_pin_ww() { 0.390 us | i915_vma_instance(); 0.178 us | i915_vma_misplaced(); i915_vma_unbind() { __i915_active_wait() { 0.082 us |i915_active_acquire_if_busy(); 0.475 us | } intel_runtime_pm_get() { 0.087 us |intel_runtime_pm_acquire(); 0.259 us | } __i915_active_wait() { 0.085 us |i915_active_acquire_if_busy(); 0.240 us | } __i915_vma_evict() { ggtt_unbind_vma() { gen8_ggtt_clear_range() { 10507.255 us |} 10507.689 us | } 10508.516 us | } v2: Instead of using bigjoiner checks, determine whether a scanout buffer is too big by checking to see if it is possible to map two of them into the ggtt. v3 (Ville): - Count how many fb objects can be fit into the available holes instead of checking for a hole twice the object size. - Take alignment constraints into account. - Limit this large scanout buffer check to >= Gen 11 platforms. v4: - Remove existing heuristic that checks just for size. (Ville) - Return early if we find space to map at-least two objects. (Tvrtko) - Slightly update the commit message. v5: (Tvrtko) - Rename the function to indicate that the object may be too big to map into the aperture. - Account for guard pages while calculating the total size required for the object. - Do not subject all objects to the heuristic check and instead consider objects only of a certain size. - Do the hole walk using the rbtree. - Preserve the existing PIN_NONBLOCK logic. - Drop the PIN_MAPPABLE check while pinning the VMA. v6: (Tvrtko) - Return 0 on success and the specific error code on failure to preserve the existing behavior. v7: (Ville) - Drop the HAS_GMCH(i915), DISPLAY_VER(i915) < 11 and size < ggtt->mappable_end / 4 checks. - Drop the redundant check that is based on previous heuristic. v8: - Make sure that we are holding the mutex associated with ggtt vm as we traverse the hole nodes. 
v9: (Tvrtko) - Use mutex_lock_interruptible_nested() instead of mutex_lock(). Cc: Ville Syrjälä Cc: Maarten Lankhorst Cc: Tvrtko Ursulin Cc: Manasi Navare Reviewed-by: Tvrtko Ursulin Signed-off-by: Vivek Kasireddy --- drivers/gpu/drm/i915/i915_gem.c | 128 +++- 1 file changed, 94 insertions(+), 34 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 2e10187cd0a0..4bef9eaa8b2e 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -49,6 +49,7 @@ #include "gem/i915_gem_pm.h" #include "gem/i915_gem_region.h" #include "gem/i915_gem_userptr.h" +#include "gem/i915_gem_tiling.h" #include "gt/intel_engine_user.h" #include "gt/intel_gt.h" #include "gt/intel_gt_pm.h" @@ -879,6 +880,96 @@ static void discard_ggtt_vma(struct i915_vma *vma) spin_unlock(>vma.lock); } +static int +i915_gem_object_fits_in_aperture(struct drm_i915_gem_object *obj, +u64 alignment, u64 flags) +{ + struct drm_i915_private *i915 = to_i915(obj->base.dev); + struct i915_ggtt *ggtt = to_gt(i915)->ggtt; + struct drm_mm_node *hole; + u64 hole_start, hole_end, start, end; + u64 fence_size, fence_alignment; + unsigned int count = 0; + int err = 0; + + /* +* If the
[CI 0/2] drm/mm: Add an iterator to optimally walk over holes suitable for an allocation
The first patch is a drm core patch that replaces the for loop in drm_mm_insert_node_in_range() with the iterator and would not cause any functional changes. The second patch is a i915 driver specific patch that also uses the iterator but solves a different problem. v2: - Added a new patch to this series to fix a potential NULL dereference. - Fixed a typo associated with the iterator introduced in the drm core patch. - Added locking around the snippet in the i915 patch that traverses the GGTT hole nodes. v3: (Tvrtko) - Replaced mutex_lock with mutex_lock_interruptible_nested() in the i915 patch. v4: (Tvrtko) - Dropped the patch added in v2 as it was deemed unnecessary. v5: (Tvrtko) - Fixed yet another typo in the drm core patch: should have passed caller_mode instead of mode to the iterator. Cc: Tvrtko Ursulin Cc: Nirmoy Das Cc: Christian König Vivek Kasireddy (2): drm/mm: Add an iterator to optimally walk over holes for an allocation (v5) drm/i915/gem: Don't try to map and fence large scanout buffers (v9) drivers/gpu/drm/drm_mm.c| 32 drivers/gpu/drm/i915/i915_gem.c | 128 +++- include/drm/drm_mm.h| 36 + 3 files changed, 145 insertions(+), 51 deletions(-) -- 2.35.1
[CI 1/2] drm/mm: Add an iterator to optimally walk over holes for an allocation (v5)
This iterator relies on drm_mm_first_hole() and drm_mm_next_hole() functions to identify suitable holes for an allocation of a given size by efficiently traversing the rbtree associated with the given allocator. It replaces the for loop in drm_mm_insert_node_in_range() and can also be used by drm drivers to quickly identify holes of a certain size within a given range. v2: (Tvrtko) - Prepend a double underscore for the newly exported first/next_hole - s/each_best_hole/each_suitable_hole/g - Mask out DRM_MM_INSERT_ONCE from the mode before calling first/next_hole and elsewhere. v3: (Tvrtko) - Reduce the number of hunks by retaining the "mode" variable name v4: - Typo: s/__drm_mm_next_hole(.., hole/__drm_mm_next_hole(.., pos v5: (Tvrtko) - Fixed another typo: should pass caller_mode instead of mode to the iterator in drm_mm_insert_node_in_range(). Reviewed-by: Tvrtko Ursulin Acked-by: Christian König Suggested-by: Tvrtko Ursulin Signed-off-by: Vivek Kasireddy --- drivers/gpu/drm/drm_mm.c | 32 +++- include/drm/drm_mm.h | 36 2 files changed, 51 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c index 8257f9d4f619..6ff98a0e4df3 100644 --- a/drivers/gpu/drm/drm_mm.c +++ b/drivers/gpu/drm/drm_mm.c @@ -352,10 +352,10 @@ static struct drm_mm_node *find_hole_addr(struct drm_mm *mm, u64 addr, u64 size) return node; } -static struct drm_mm_node * -first_hole(struct drm_mm *mm, - u64 start, u64 end, u64 size, - enum drm_mm_insert_mode mode) +struct drm_mm_node * +__drm_mm_first_hole(struct drm_mm *mm, + u64 start, u64 end, u64 size, + enum drm_mm_insert_mode mode) { switch (mode) { default: @@ -374,6 +374,7 @@ first_hole(struct drm_mm *mm, hole_stack); } } +EXPORT_SYMBOL(__drm_mm_first_hole); /** * DECLARE_NEXT_HOLE_ADDR - macro to declare next hole functions @@ -410,11 +411,11 @@ static struct drm_mm_node *name(struct drm_mm_node *entry, u64 size) \ DECLARE_NEXT_HOLE_ADDR(next_hole_high_addr, rb_left, rb_right) 
DECLARE_NEXT_HOLE_ADDR(next_hole_low_addr, rb_right, rb_left) -static struct drm_mm_node * -next_hole(struct drm_mm *mm, - struct drm_mm_node *node, - u64 size, - enum drm_mm_insert_mode mode) +struct drm_mm_node * +__drm_mm_next_hole(struct drm_mm *mm, + struct drm_mm_node *node, + u64 size, + enum drm_mm_insert_mode mode) { switch (mode) { default: @@ -432,6 +433,7 @@ next_hole(struct drm_mm *mm, return >hole_stack == >hole_stack ? NULL : node; } } +EXPORT_SYMBOL(__drm_mm_next_hole); /** * drm_mm_reserve_node - insert an pre-initialized node @@ -516,11 +518,11 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm, u64 size, u64 alignment, unsigned long color, u64 range_start, u64 range_end, - enum drm_mm_insert_mode mode) + enum drm_mm_insert_mode caller_mode) { struct drm_mm_node *hole; u64 remainder_mask; - bool once; + enum drm_mm_insert_mode mode = caller_mode & ~DRM_MM_INSERT_ONCE; DRM_MM_BUG_ON(range_start > range_end); @@ -533,13 +535,9 @@ int drm_mm_insert_node_in_range(struct drm_mm * const mm, if (alignment <= 1) alignment = 0; - once = mode & DRM_MM_INSERT_ONCE; - mode &= ~DRM_MM_INSERT_ONCE; - remainder_mask = is_power_of_2(alignment) ? alignment - 1 : 0; - for (hole = first_hole(mm, range_start, range_end, size, mode); -hole; -hole = once ? 
NULL : next_hole(mm, hole, size, mode)) { + drm_mm_for_each_suitable_hole(hole, mm, range_start, range_end, + size, caller_mode) { u64 hole_start = __drm_mm_hole_node_start(hole); u64 hole_end = hole_start + hole->hole_size; u64 adj_start, adj_end; diff --git a/include/drm/drm_mm.h b/include/drm/drm_mm.h index ac33ba1b18bc..dff6db627807 100644 --- a/include/drm/drm_mm.h +++ b/include/drm/drm_mm.h @@ -400,6 +400,42 @@ static inline u64 drm_mm_hole_node_end(const struct drm_mm_node *hole_node) 1 : 0; \ pos = list_next_entry(pos, hole_stack)) +struct drm_mm_node * +__drm_mm_first_hole(struct drm_mm *mm, + u64 start, u64 end, u64 size, + enum drm_mm_insert_mode mode); + +struct drm_mm_node * +__drm_mm_next_hole(struct drm_mm *mm, + struct drm_mm_node *node, + u64 size, + enum drm_mm_insert_mode mode); + +/** + * drm_mm_for_each_suitable_hole - iterator to optimally walk over all + *
Re: [PATCH v2 2/2] dt-bindings: gpu: Convert aspeed-gfx bindings to yaml
On 04/03/2022 01:03, Joel Stanley wrote: > Convert the bindings to yaml and add the ast2600 compatible string. > > The legacy mfd description was put in place before the gfx bindings > existed, to document the compatible that is used in the pinctrl > bindings. > > Signed-off-by: Joel Stanley > --- > .../devicetree/bindings/gpu/aspeed,gfx.yaml | 69 +++ > .../devicetree/bindings/gpu/aspeed-gfx.txt| 41 --- > .../devicetree/bindings/mfd/aspeed-gfx.txt| 17 - > 3 files changed, 69 insertions(+), 58 deletions(-) > create mode 100644 Documentation/devicetree/bindings/gpu/aspeed,gfx.yaml > delete mode 100644 Documentation/devicetree/bindings/gpu/aspeed-gfx.txt > delete mode 100644 Documentation/devicetree/bindings/mfd/aspeed-gfx.txt > > diff --git a/Documentation/devicetree/bindings/gpu/aspeed,gfx.yaml > b/Documentation/devicetree/bindings/gpu/aspeed,gfx.yaml > new file mode 100644 > index ..8ddc4fa6e8e4 > --- /dev/null > +++ b/Documentation/devicetree/bindings/gpu/aspeed,gfx.yaml > @@ -0,0 +1,69 @@ > +# SPDX-License-Identifier: GPL-2.0-only > +%YAML 1.2 > +--- > +$id: http://devicetree.org/schemas/gpu/aspeed,gfx.yaml# > +$schema: http://devicetree.org/meta-schemas/core.yaml# > + > +title: ASPEED GFX display device > + > +maintainers: > + - Joel Stanley > + > +properties: > + compatible: > +items: > + - enum: > + - aspeed,ast2400-gfx > + - aspeed,ast2500-gfx > + - aspeed,ast2600-gfx That's different than original bindings - new patch. It's not currently supported, so adding it is more than just updating bindings to current state. > + - const: syscon > + > + reg: > +minItems: 1 maxItems? > + > + interrupts: > +maxItems: 1 > + > + clocks: > +maxItems: 1 > + > + resets: > +maxItems: 1 > + > + memory-region: true > + > + syscon: true You need at least description. I see old bindings did not mention it (except that it is required)... I think you also need a type, because it is not a standard property. > + > + reg-io-width: true Some constraints? 
Can it be anything from syscon schema? Best regards, Krzysztof
[PATCH] drm/msm/a6xx: Fix missing ARRAY_SIZE() check
From: Rob Clark Fixes: f6d62d091cfd ("drm/msm/a6xx: add support for Adreno 660 GPU") Signed-off-by: Rob Clark Reviewed-by: Dmitry Baryshkov --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 02b47977b5c3..83c31b2ad865 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -683,19 +683,23 @@ static void a6xx_set_cp_protect(struct msm_gpu *gpu) { struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); const u32 *regs = a6xx_protect; - unsigned i, count = ARRAY_SIZE(a6xx_protect), count_max = 32; - - BUILD_BUG_ON(ARRAY_SIZE(a6xx_protect) > 32); - BUILD_BUG_ON(ARRAY_SIZE(a650_protect) > 48); + unsigned i, count, count_max; if (adreno_is_a650(adreno_gpu)) { regs = a650_protect; count = ARRAY_SIZE(a650_protect); count_max = 48; + BUILD_BUG_ON(ARRAY_SIZE(a650_protect) > 48); } else if (adreno_is_a660_family(adreno_gpu)) { regs = a660_protect; count = ARRAY_SIZE(a660_protect); count_max = 48; + BUILD_BUG_ON(ARRAY_SIZE(a660_protect) > 48); + } else { + regs = a6xx_protect; + count = ARRAY_SIZE(a6xx_protect); + count_max = 32; + BUILD_BUG_ON(ARRAY_SIZE(a6xx_protect) > 32); } /* -- 2.35.1
Re: [PATCH v3 3/5] drm/print: RFC add choice to use dynamic debug in drm-debug
From: Daniel Vetter Hi Daniel, everyone, Ive substantially updated this patchset, and I thought it useful to reply here. Im noting the biggest changes in context below, trimming heavily otherwise. Also, Ive lost the msg in my gmail-cloud, so this lacks the usual "Daniel said: " attribution, and the "> " marks. Ive prefixed "< " to 1st line of my replies where it helps. The current patchset is here: https://patchwork.freedesktop.org/series/100290/ https://lore.kernel.org/lkml/20220217034829.64395-1-jim.cro...@gmail.com/ Its "failing" patchwork CI cuz of a macro warning in dyndbg, which would be nice for CI to "pass" because its out of DRM tree, and subject to a separate review process, and in the meantime, it would be nice to see it under test. Going item by item: On Wed, Jul 14, 2021 at 11:51:36AM -0600, Jim Cromie wrote: > drm's debug system uses distinct categories of debug messages, encoded > in an enum (DRM_UT_), enum is now renumbered to 0-15 (fits in 4 bits) drm_debug_enabled() does BIT(cat) 15 is unclassed/reserved > Dynamic debug has no concept of category, but we can map the DRM_UT_* now it has "class" keyword, and class_id:4 field. #> echo module drm class 1 +p > control # 1=DRM_UT_KMS iirc This interface is unused yet, DRM is its A-1 customer, are you shopping ? Since its unused, it cannot break anything not using it :-) > CONFIG_DRM_USE_DYNAMIC_DEBUG enables this, and is available if still true. > The indirection/switchover is layered into the macro scheme: Heres where the biggest DRM centric changes (vs old) are: The cDRM_UT_* => prefix-string mapping is gone; while it worked, it made the literal format strings config dependent, and was more complicated to explain. Fitting category into class_id is much cleaner. old way replaced drm*dbg with pr_debug (at surface level) new way adapts drm*dbg to reuse the class-Factory macro under pr_debug. 
This loses pr_debug's log-decorating feature, but DRM has its own conventions, and extra decorations are unlikely to really add anything. OTOH, drm could re-use those flags to optionalize some of its conventions. > 0. A new callback on drm.debug which calls dynamic_debug_exec_queries now in dyndbg, where it belonged. now just uses class query inside. This is really awesome. For merging I think we need to discuss with dyn debug folks whether they're all ok with this, but it's exported already should should be fine. < Your (fresh) endorsement should help :-) Particularly, the new dyndbg features need a user. > > 1. A "converted" or "classy" DRM_UT_* map > repeating, 2nd map is gone, DRM_UT_* is merely renumbered. >DRM_UT_* are unchanged, since theyre used in drm_debug_enabled() >and elsewhere. I think for the production version of these we need to retire/deprecate them, at least for drm core. Otherwise you have an annoying mismatch between drm.debug module option and dyn debug. < I think this isnt relevant now, since symbols are preserved. Also, the __drm_debug var is used directly by the CLASSBITS macro, (and therefore the callbacks), so /sys/module/drm/parameters/debug is preserved at an ABI-ish level. (__drm_debug now ulong, to work with BIT) > > 2. drm_dev_dbg & drm_debug are renamed (prefixed with '_') > >old names are now macros, calling either: > legacy: -> to renamed fn > enabled: -> dev_dbg & pr_debug, with cDRM-prefix # format. > For merging, can we pull out the renames and reorgs from this patch, and then maybe also the reorder the next patch in your series here to be before the dyn debug stuff? < the above adaptation is re-adapted to use dyndbg's new class-Factory macro, other stuff is gone. 
> Signed-off-by: Jim Cromie > --- > drivers/gpu/drm/Kconfig | 13 + > drivers/gpu/drm/drm_print.c | 75 -- > include/drm/drm_print.h | 102 ++-- > 3 files changed, 158 insertions(+), 32 deletions(-) update - drm parts are slightly smaller now :-) [jimc@frodo wk-next]$ git diff --stat master Documentation/admin-guide/dynamic-debug-howto.rst | 7 + drivers/gpu/drm/Kconfig | 12 drivers/gpu/drm/Makefile | 2 ++ drivers/gpu/drm/drm_print.c | 56 ++ include/drm/drm_print.h | 80 + include/linux/dynamic_debug.h | 113 + lib/dynamic_debug.c | 140 + 7 files changed, 323 insertions(+), 87 deletions(-) I really like this, I think you can drop the RFC. A few more things that I think we need: - An overview kerneldoc section which explains the interfaces and how it all works together. Essentially your commit message with some light markup to make it look good. < not sure anything worth documenting has survived. - I think it would
Re: Report 2 in ext4 and journal based on v5.17-rc1
On Fri, Mar 04, 2022 at 10:40:35PM -0500, Theodore Ts'o wrote: > On Fri, Mar 04, 2022 at 12:20:02PM +0900, Byungchul Park wrote: > > > > I found a point that the two wait channels don't lead a deadlock in > > some cases thanks to Jan Kara. I will fix it so that Dept won't > > complain it. > > I sent my last (admittedly cranky) message before you sent this. I'm > glad you finally understood Jan's explanation. I was trying to tell Not finally. I've understood him whenever he tried to tell me something. > you the same thing, but apparently I failed to communicate in a I don't think so. Your point and Jan's point are different. All he has said make sense. But yours does not. > sufficiently clear manner. In any case, what Jan described is a > fundamental part of how wait queues work, and I'm kind of amazed that > you were able to implement DEPT without understanding it. (But maybe Of course, it was possible because all that Dept has to know for basic work is wait and event. The subtle things like what Jan told me help Dept be better. > that is why some of the DEPT reports were completely incomprehensible It's because you are blinded to blame at it without understanding how Dept works at all. I will fix those that must be fixed. Don't worry. > to me; I couldn't interpret why in the world DEPT was saying there was > a problem.) I can tell you if you really want to understand why. But I can't if you are like this. > In any case, the thing I would ask is a little humility. We regularly > use lockdep, and we run a huge number of stress tests, throughout each > development cycle. Sure. > So if DEPT is issuing lots of reports about apparently circular > dependencies, please try to be open to the thought that the fault is No one was convinced that Dept doesn't have a fault. I think your worries are too much. 
> in DEPT, and don't try to argue with maintainers that their code MUST > be buggy --- but since you don't understand our code, and DEPT must be No one argued that their code must be buggy, either. So I don't think you have to worry about what's never happened. > theoretically perfect, that it is up to the Maintainers to prove to > you that their code is correct. > > I am going to gently suggest that it is at least as likely, if not > more likely, that the failure is in DEPT or your understanding of what No doubt. I already think so. But it doesn't mean that I have to keep quiet without discussing to improve Dept. I will keep improving Dept in a reasonable way. > how kernel wait channels and locking works. After all, why would it > be that we haven't found these problems via our other QA practices? Let's talk more once you understand how Dept works at least 10%. Or I think we cannot talk in a productive way.
Re: Report 2 in ext4 and journal based on v5.17-rc1
On Fri, Mar 04, 2022 at 10:26:23PM -0500, Theodore Ts'o wrote: > On Fri, Mar 04, 2022 at 09:42:37AM +0900, Byungchul Park wrote: > > > > All contexts waiting for any of the events in the circular dependency > > chain will be definitely stuck if there is a circular dependency as I > > explained. So we need another wakeup source to break the circle. In > > ext4 code, you might have the wakeup source for breaking the circle. > > > > What I agreed with is: > > > >The case that 1) the circular dependency is inevitable 2) there are > >another wakeup source for breaking the circle and 3) the duration > >in sleep is short enough, should be acceptable. > > > > Sounds good? > > These dependencies are part of every single ext4 metadata update, > and if there were any unnecessary sleeps, this would be a major > performance gap, and this is a very well studied part of ext4. > > There are some places where we sleep, sure. In some case > start_this_handle() needs to wait for a commit to complete, and the > commit thread might need to sleep for I/O to complete. But the moment > the thing that we're waiting for is complete, we wake up all of the > processes on the wait queue. But in the case where we wait for I/O > complete, that wakeup is coming from the device driver, when it > receives the I/O completion interrupt from the hard drive. Is > that considered an "external source"? Maybe DEPT doesn't recognize > that this is certain to happen just as day follows the night? (Well, > maybe the I/O completion interrupt might not happen if the disk drive > bursts into flames --- but then, you've got bigger problems. :-) Almost all you've been blaming at Dept are totally nonsense. Based on what you're saying, I'm convinced that you don't understand how Dept works even 1%. You don't even try to understand it before blame. You don't have to understand and support it. But I can't respond to you if you keep saying silly things that way. 
> In any case, if DEPT is going to report these "circular dependencies > as bugs that MUST be fixed", it's going to be pure noise and I will > ignore all DEPT reports, and will push back on having Lockdep replaced Dept is going to be improved so that what you are concerning about won't be reported. > by DEPT --- because Lockdep give us actionable reports, and if DEPT Right. Dept should give actionable reports, too. > can't tell the difference between a valid programming pattern and a > bug, then it's worse than useless. Needless to say.
Re: [PATCH] drm/msm/a6xx: Fix missing ARRAY_SIZE() check
On Sat, 5 Mar 2022 at 00:57, Rob Clark wrote: > > On Fri, Mar 4, 2022 at 1:47 PM Dmitry Baryshkov > wrote: > > > > On Fri, 4 Mar 2022 at 23:23, Rob Clark wrote: > > > > > > From: Rob Clark > > > > > > Fixes: f6d62d091cfd ("drm/msm/a6xx: add support for Adreno 660 GPU") > > > Signed-off-by: Rob Clark > > > > Reviewed-by: Dmitry Baryshkov > > However see the comment below. > > > > > --- > > > drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 1 + > > > 1 file changed, 1 insertion(+) > > > > > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > > > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > > > index 02b47977b5c3..6406d8c3411a 100644 > > > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > > > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > > > @@ -687,6 +687,7 @@ static void a6xx_set_cp_protect(struct msm_gpu *gpu) > > > > > > BUILD_BUG_ON(ARRAY_SIZE(a6xx_protect) > 32); > > > BUILD_BUG_ON(ARRAY_SIZE(a650_protect) > 48); > > > + BUILD_BUG_ON(ARRAY_SIZE(a660_protect) > 48); > > > > The magic number 32 and 48 are repeated through this code. I'd suggest > > to define them and use defined names. > > It can come up as a separate commit. > > > > Or perhaps instead: IMO this is much better. 
Reviewed-by: Dmitry Baryshkov > > diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > index 6406d8c3411a..58c371930fb4 100644 > --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c > @@ -683,20 +683,23 @@ static void a6xx_set_cp_protect(struct msm_gpu *gpu) > { > struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu); > const u32 *regs = a6xx_protect; > - unsigned i, count = ARRAY_SIZE(a6xx_protect), count_max = 32; > - > - BUILD_BUG_ON(ARRAY_SIZE(a6xx_protect) > 32); > - BUILD_BUG_ON(ARRAY_SIZE(a650_protect) > 48); > - BUILD_BUG_ON(ARRAY_SIZE(a660_protect) > 48); > + unsigned i, count, count_max; > > if (adreno_is_a650(adreno_gpu)) { > regs = a650_protect; > count = ARRAY_SIZE(a650_protect); > count_max = 48; > + BUILD_BUG_ON(ARRAY_SIZE(a650_protect) > 48); > } else if (adreno_is_a660_family(adreno_gpu)) { > regs = a660_protect; > count = ARRAY_SIZE(a660_protect); > count_max = 48; > + BUILD_BUG_ON(ARRAY_SIZE(a660_protect) > 48); > + } else { > + regs = a6xx_protect; > + count = ARRAY_SIZE(a6xx_protect); > + count_max = 32; > + BUILD_BUG_ON(ARRAY_SIZE(a6xx_protect) > 32); > } > > /* > > > that moves each of the two uses of constant together.. adding three > #defines each used only twice seems a bit silly, IMHO > > BR, > -R -- With best wishes Dmitry
Re: [PATCH] drm/nouveau/bios: Rename prom_init() and friends functions
Le 05/03/2022 à 08:38, Christophe Leroy a écrit : Le 04/03/2022 à 21:24, Lyude Paul a écrit : This mostly looks good to me. Just one question (and one comment down below that needs addressing). Is this with ppc32? (I ask because ppc64le doesn't seem to hit this compilation error). That's with PPC64, see http://kisskb.ellerman.id.au/kisskb/branch/chleroy/head/252ba609bea83234d2e35841c19ae84c67b43ec7/ But that's not (yet) with the mainline tree. That's work I'm doing to cleanup our asm/asm-protoypes.h header. Since commit 4efca4ed05cb ("kbuild: modversions for EXPORT_SYMBOL() for asm") that file is dedicated to prototypes of functions defined in assembly. Therefore I'm trying to dispatch C functions prototypes in other headers. I wanted to move prom_init() prototype into asm/prom.h and then I hit the problem. In the beginning I was thinking about just changing the name of the function in powerpc, but as I see that M68K, MIPS and SPARC also have a prom_init() function, I thought it would be better to change the name in shadowrom.c to avoid any future conflict like the one I got while reworking the headers. @@ -57,8 +57,8 @@ prom_init(struct nvkm_bios *bios, const char *name) const struct nvbios_source nvbios_rom = { .name = "PROM", - .init = prom_init, - .fini = prom_fini, - .read = prom_read, + .init = nvbios_rom_init, + .fini = nvbios_rom_fini, + .read = nvbios_rom_read, Seeing as the source name is prom, I think using the naming convention nvbios_prom_* would be better then nvbios_rom_*. Yes I wasn't sure about the best naming as the file name is shadowrom.c and not shadowprom.c. I will send v2 using nvbios_prom_* as a name. While preparing v2 I remembered that in fact, I called the functions nvbios_rom_* because the name of the nvbios_source struct is nvbios_rom, so for me it made sense to use the name of the struct as a prefix for the functions. So I'm OK to change it to nvbios_prom_* but it looks less logical to me. 
Please confirm you still prefer nvbios_prom as prefix to the function names. Christophe