Re: [Nouveau] [PATCH v8 8/8] nouveau/svm: Implement atomic SVM access

2021-05-20 Thread Ben Skeggs
On Wed, 7 Apr 2021 at 18:43, Alistair Popple  wrote:
>
> Some NVIDIA GPUs do not support direct atomic access to system memory
> via PCIe. Instead this must be emulated by granting the GPU exclusive
> access to the memory. This is achieved by replacing CPU page table
> entries with special swap entries that fault on userspace access.
>
> The driver then grants the GPU permission to update the page undergoing
> atomic access via the GPU page tables. When CPU access to the page is
> required a CPU fault is raised which calls into the device driver via
> MMU notifiers to revoke the atomic access. The original page table
> entries are then restored allowing CPU access to proceed.
>
> Signed-off-by: Alistair Popple 
The Nouveau bits at least look good to me.

For patches 7/8:
Reviewed-by: Ben Skeggs 

>
> ---
>
> v7:
> * Removed magic values for fault access levels
> * Improved readability of fault comparison code
>
> v4:
> * Check that page table entries haven't changed before mapping on the
>   device
> ---
>  drivers/gpu/drm/nouveau/include/nvif/if000c.h |   1 +
>  drivers/gpu/drm/nouveau/nouveau_svm.c | 126 --
>  drivers/gpu/drm/nouveau/nvkm/subdev/mmu/vmm.h |   1 +
>  .../drm/nouveau/nvkm/subdev/mmu/vmmgp100.c|   6 +
>  4 files changed, 123 insertions(+), 11 deletions(-)
>
> diff --git a/drivers/gpu/drm/nouveau/include/nvif/if000c.h 
> b/drivers/gpu/drm/nouveau/include/nvif/if000c.h
> index d6dd40f21eed..9c7ff56831c5 100644
> --- a/drivers/gpu/drm/nouveau/include/nvif/if000c.h
> +++ b/drivers/gpu/drm/nouveau/include/nvif/if000c.h
> @@ -77,6 +77,7 @@ struct nvif_vmm_pfnmap_v0 {
>  #define NVIF_VMM_PFNMAP_V0_APER   
> 0x00f0ULL
>  #define NVIF_VMM_PFNMAP_V0_HOST   
> 0xULL
>  #define NVIF_VMM_PFNMAP_V0_VRAM   
> 0x0010ULL
> +#define NVIF_VMM_PFNMAP_V0_A 
> 0x0004ULL
>  #define NVIF_VMM_PFNMAP_V0_W  
> 0x0002ULL
>  #define NVIF_VMM_PFNMAP_V0_V  
> 0x0001ULL
>  #define NVIF_VMM_PFNMAP_V0_NONE   
> 0xULL
> diff --git a/drivers/gpu/drm/nouveau/nouveau_svm.c 
> b/drivers/gpu/drm/nouveau/nouveau_svm.c
> index a195e48c9aee..81526d65b4e2 100644
> --- a/drivers/gpu/drm/nouveau/nouveau_svm.c
> +++ b/drivers/gpu/drm/nouveau/nouveau_svm.c
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>
>  struct nouveau_svm {
> struct nouveau_drm *drm;
> @@ -67,6 +68,11 @@ struct nouveau_svm {
> } buffer[1];
>  };
>
> +#define FAULT_ACCESS_READ 0
> +#define FAULT_ACCESS_WRITE 1
> +#define FAULT_ACCESS_ATOMIC 2
> +#define FAULT_ACCESS_PREFETCH 3
> +
>  #define SVM_DBG(s,f,a...) NV_DEBUG((s)->drm, "svm: "f"\n", ##a)
>  #define SVM_ERR(s,f,a...) NV_WARN((s)->drm, "svm: "f"\n", ##a)
>
> @@ -411,6 +417,24 @@ nouveau_svm_fault_cancel_fault(struct nouveau_svm *svm,
>   fault->client);
>  }
>
> +static int
> +nouveau_svm_fault_priority(u8 fault)
> +{
> +   switch (fault) {
> +   case FAULT_ACCESS_PREFETCH:
> +   return 0;
> +   case FAULT_ACCESS_READ:
> +   return 1;
> +   case FAULT_ACCESS_WRITE:
> +   return 2;
> +   case FAULT_ACCESS_ATOMIC:
> +   return 3;
> +   default:
> +   WARN_ON_ONCE(1);
> +   return -1;
> +   }
> +}
> +
>  static int
>  nouveau_svm_fault_cmp(const void *a, const void *b)
>  {
> @@ -421,9 +445,8 @@ nouveau_svm_fault_cmp(const void *a, const void *b)
> return ret;
> if ((ret = (s64)fa->addr - fb->addr))
> return ret;
> -   /*XXX: atomic? */
> -   return (fa->access == 0 || fa->access == 3) -
> -  (fb->access == 0 || fb->access == 3);
> +   return nouveau_svm_fault_priority(fa->access) -
> +   nouveau_svm_fault_priority(fb->access);
>  }
>
>  static void
> @@ -487,6 +510,10 @@ static bool nouveau_svm_range_invalidate(struct 
> mmu_interval_notifier *mni,
> struct svm_notifier *sn =
> container_of(mni, struct svm_notifier, notifier);
>
> +   if (range->event == MMU_NOTIFY_EXCLUSIVE &&
> +   range->owner == sn->svmm->vmm->cli->drm->dev)
> +   return true;
> +
> /*
>  * serializes the update to mni->invalidate_seq done by caller and
>  * prevents invalidation of the PTE from progressing while HW is being
> @@ -555,6 +582,71 @@ static void nouveau_hmm_convert_pfn(struct nouveau_drm 
> *drm,
> args->p.phys[0] |= NVIF_VMM_PFNMAP_V0_W;
>  }
>
> +static int nouveau_atomic_range_fault(struct nouveau_svmm *svmm,
> +  struct nouveau_drm *drm,
> +  struct nouveau_pfnmap_args *args, u32 size,
> +  struct svm_notifier 

Re: [Nouveau] [PATCH v8 3/8] mm/rmap: Split try_to_munlock from try_to_unmap

2021-05-20 Thread Alistair Popple
On Friday, 21 May 2021 6:24:27 AM AEST Liam Howlett wrote:

[...]
 
> > > > diff --git a/mm/rmap.c b/mm/rmap.c
> > > > index 977e70803ed8..f09d522725b9 100644
> > > > --- a/mm/rmap.c
> > > > +++ b/mm/rmap.c
> > > > @@ -1405,10 +1405,6 @@ static bool try_to_unmap_one(struct page *page,
> > > > struct vm_area_struct *vma,>
> > > > 
> > > >   struct mmu_notifier_range range;
> > > >   enum ttu_flags flags = (enum ttu_flags)(long)arg;
> > > > 
> > > > - /* munlock has nothing to gain from examining un-locked vmas */
> > > > - if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
> > > > - return true;
> > > > -
> > > > 
> > > >   if (IS_ENABLED(CONFIG_MIGRATION) && (flags & TTU_MIGRATION) &&
> > > >   
> > > >   is_zone_device_page(page) && !is_device_private_page(page))
> > > >   
> > > >   return true;
> > > > 
> > > > @@ -1469,8 +1465,6 @@ static bool try_to_unmap_one(struct page *page,
> > > > struct vm_area_struct *vma,>
> > > > 
> > > >   page_vma_mapped_walk_done();
> > > >   break;
> > > >   
> > > >   }
> > > > 
> > > > - if (flags & TTU_MUNLOCK)
> > > > - continue;
> > > > 
> > > >   }
> > > >   
> > > >   /* Unexpected PMD-mapped THP? */
> > > > 
> > > > @@ -1784,8 +1778,39 @@ bool try_to_unmap(struct page *page, enum
> > > > ttu_flags
> > > > flags)>
> > > > 
> > > >   return !page_mapcount(page) ? true : false;
> > > >  
> > > >  }
> > > 
> > > Please add a comment here, especially around locking.
> 
> Did you miss this comment?  I think the name confusion alone means this
> needs some documentation.  It's also worth mentioning arg is unused.

Ack. Was meant to come back to that after discussing some of the locking 
questions below. The other side effect of splitting this code out is it leaves 
space for more specific documentation which is only a good thing. I will try 
and summarise some of the discussion below into a comment here.

> > > > +static bool page_mlock_one(struct page *page, struct vm_area_struct
> > > > *vma,
> > > > +  unsigned long address, void *arg)
> > > > +{
> > > > + struct page_vma_mapped_walk pvmw = {
> > > > + .page = page,
> > > > + .vma = vma,
> > > > + .address = address,
> > > > + };
> > > > +
> > > > + /* munlock has nothing to gain from examining un-locked vmas */
> > > > + if (!(vma->vm_flags & VM_LOCKED))
> > > > + return true;
> > > 
> > > The logic here doesn't make sense.  You called page_mlock_one() on a VMA
> > > that isn't locked and it returns true?  Is this a check to see if the
> > > VMA has zero mlock'ed pages?
> > 
> > I'm pretty sure the logic is correct. This is used for an rmap_walk, so we
> > return true to continue the page table scan to see if other VMAs have the
> > page locked.
> 
> yes, sorry.  The logic is correct but doesn't read as though it does.
> I cannot see what is going on easily and there are no comments stating
> what is happening.

Thanks for confirming. The documentation in Documentation/vm/unevictable-
lru.rst is helpful for higher level context but I will put some comments here 
around the logic.

> > > > +
> > > > + while (page_vma_mapped_walk()) {
> > > > + /* PTE-mapped THP are never mlocked */
> > > > + if (!PageTransCompound(page)) {
> > > > + /*
> > > > +  * Holding pte lock, we do *not* need
> > > > +  * mmap_lock here
> > > > +  */
> > > 
> > > Are you sure?  I think you at least need to hold the mmap lock for
> > > reading to ensure there's no race here?  mlock_vma_page() eludes to such
> > > a scenario when lazy mlocking.
> > 
> > Not really. I don't claim to be an mlock expert but as this is a clean-up
> > for try_to_unmap() the intent was to not change existing behaviour.
> > 
> > However presenting the function in this simplified form did raise this and
> > some other questions during previous reviews - see
> > https://lore.kernel.org/
> > dri-devel/20210331115746.ga1463...@nvidia.com/ for the previous
> > discussion.
> 
> From what I can see, at least the following paths have mmap_lock held
> for writing:
> 
> munlock_vma_pages_range() from __do_munmap()
> munlock_vma_pages_range() from remap_file_pages()
> 
> > To answer the questions around locking though I did do some git sha1
> > mining. The best explanation seemed to be contained in
> > https://git.kernel.org/pub/scm/
> > linux/kernel/git/torvalds/linux.git/commit/?
> > id=b87537d9e2feb30f6a962f27eb32768682698d3b from Hugh (whom I've added
> > again here in case he can help answer some of these).
> 
> Thanks for the pointer.  That race doesn't make the lock unnecessary.
> It is the exception to the rule because the 

Re: [Nouveau] [PATCH v8 3/8] mm/rmap: Split try_to_munlock from try_to_unmap

2021-05-20 Thread Liam Howlett
* Alistair Popple  [210519 08:38]:
> On Wednesday, 19 May 2021 6:04:51 AM AEST Liam Howlett wrote:
> > External email: Use caution opening links or attachments
> > 
> > * Alistair Popple  [210407 04:43]:
> > > The behaviour of try_to_unmap_one() is difficult to follow because it
> > > performs different operations based on a fairly large set of flags used
> > > in different combinations.
> > > 
> > > TTU_MUNLOCK is one such flag. However it is exclusively used by
> > > try_to_munlock() which specifies no other flags. Therefore rather than
> > > overload try_to_unmap_one() with unrelated behaviour split this out into
> > > it's own function and remove the flag.
> > > 
> > > Signed-off-by: Alistair Popple 
> > > Reviewed-by: Ralph Campbell 
> > > Reviewed-by: Christoph Hellwig 
> > > 
> > > ---
> > > 
> > > v8:
> > > * Renamed try_to_munlock to page_mlock to better reflect what the
> > > 
> > >   function actually does.
> > > 
> > > * Removed the TODO from the documentation that this patch addresses.
> > > 
> > > v7:
> > > * Added Christoph's Reviewed-by
> > > 
> > > v4:
> > > * Removed redundant check for VM_LOCKED
> > > ---
> > > 
> > >  Documentation/vm/unevictable-lru.rst | 33 ---
> > >  include/linux/rmap.h |  3 +-
> > >  mm/mlock.c   | 10 +++---
> > >  mm/rmap.c| 48 +---
> > >  4 files changed, 55 insertions(+), 39 deletions(-)
> > > 
> > > diff --git a/Documentation/vm/unevictable-lru.rst
> > > b/Documentation/vm/unevictable-lru.rst index 0e1490524f53..eae3af17f2d9
> > > 100644
> > > --- a/Documentation/vm/unevictable-lru.rst
> > > +++ b/Documentation/vm/unevictable-lru.rst
> > > @@ -389,14 +389,14 @@ mlocked, munlock_vma_page() updates that zone
> > > statistics for the number of> 
> > >  mlocked pages.  Note, however, that at this point we haven't checked
> > >  whether the page is mapped by other VM_LOCKED VMAs.
> > > 
> > > -We can't call try_to_munlock(), the function that walks the reverse map
> > > to
> > > +We can't call page_mlock(), the function that walks the reverse map to
> > > 
> > >  check for other VM_LOCKED VMAs, without first isolating the page from the
> > >  LRU.> 
> > > -try_to_munlock() is a variant of try_to_unmap() and thus requires that
> > > the page +page_mlock() is a variant of try_to_unmap() and thus requires
> > > that the page> 
> > >  not be on an LRU list [more on these below].  However, the call to
> > > 
> > > -isolate_lru_page() could fail, in which case we couldn't
> > > try_to_munlock().  So, +isolate_lru_page() could fail, in which case we
> > > can't call page_mlock().  So,> 
> > >  we go ahead and clear PG_mlocked up front, as this might be the only
> > >  chance we> 
> > > -have.  If we can successfully isolate the page, we go ahead and
> > > -try_to_munlock(), which will restore the PG_mlocked flag and update the
> > > zone +have.  If we can successfully isolate the page, we go ahead and
> > > call +page_mlock(), which will restore the PG_mlocked flag and update the
> > > zone> 
> > >  page statistics if it finds another VMA holding the page mlocked.  If we
> > >  fail to isolate the page, we'll have left a potentially mlocked page on
> > >  the LRU. This is fine, because we'll catch it later if and if vmscan
> > >  tries to reclaim> 
> > > @@ -545,31 +545,24 @@ munlock or munmap system calls, mm teardown
> > > (munlock_vma_pages_all), reclaim,> 
> > >  holepunching, and truncation of file pages and their anonymous COWed
> > >  pages.
> > > 
> > > -try_to_munlock() Reverse Map Scan
> > > +page_mlock() Reverse Map Scan
> > > 
> > >  -
> > > 
> > > -.. warning::
> > > -   [!] TODO/FIXME: a better name might be page_mlocked() - analogous to
> > > the -   page_referenced() reverse map walker.
> > > -
> > > 
> > >  When munlock_vma_page() [see section :ref:`munlock()/munlockall() System
> > >  Call Handling ` above] tries to munlock a
> > >  page, it needs to determine whether or not the page is mapped by any
> > >  VM_LOCKED VMA without actually attempting to unmap all PTEs from the
> > >  page.  For this purpose, the unevictable/mlock infrastructure
> > > 
> > > -introduced a variant of try_to_unmap() called try_to_munlock().
> > > +introduced a variant of try_to_unmap() called page_mlock().
> > > 
> > > -try_to_munlock() calls the same functions as try_to_unmap() for anonymous
> > > and -mapped file and KSM pages with a flag argument specifying unlock
> > > versus unmap -processing.  Again, these functions walk the respective
> > > reverse maps looking -for VM_LOCKED VMAs.  When such a VMA is found, as
> > > in the try_to_unmap() case, -the functions mlock the page via
> > > mlock_vma_page() and return SWAP_MLOCK.  This -undoes the pre-clearing of
> > > the page's PG_mlocked done by munlock_vma_page. +page_mlock() walks the
> > > respective reverse maps looking for VM_LOCKED VMAs. When +such a VMA is
> > > found the page 

Re: [Nouveau] [Intel-gfx] [PATCH 0/7] Per client engine busyness

2021-05-20 Thread arabek
> Well if it becomes a problem fixing the debugfs "clients" file and
> making it sysfs shouldn't be much of a problem later on.

Why not to try using something in terms of perf / opensnoop or bpf
to do the work. Should be optimal enough.

ie.
http://www.brendangregg.com/blog/2014-07-25/opensnoop-for-linux.html
https://man7.org/linux/man-pages/man2/bpf.2.html
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [Intel-gfx] [PATCH 0/7] Per client engine busyness

2021-05-20 Thread Christian König




Am 20.05.21 um 16:11 schrieb Daniel Vetter:

On Wed, May 19, 2021 at 11:17:24PM +, Nieto, David M wrote:

[AMD Official Use Only]

Parsing over 550 processes for fdinfo is taking between 40-100ms single
threaded in a 2GHz skylake IBRS within a VM using simple string
comparisons and DIRent parsing. And that is pretty much the worst case
scenario with some more optimized implementations.

I think this is plenty ok, and if it's not you could probably make this
massively faster with io_uring for all the fs operations and whack a
parser-generator on top for real parsing speed.


Well if it becomes a problem fixing the debugfs "clients" file and 
making it sysfs shouldn't be much of a problem later on.


Christian.



So imo we shouldn't worry about algorithmic inefficiency of the fdinfo
approach at all, and focuse more on trying to reasonably (but not too
much, this is still drm render stuff after all) standardize how it works
and how we'll extend it all. I think there's tons of good suggestions in
this thread on this topic already.

/me out
-Daniel


David

From: Daniel Vetter 
Sent: Wednesday, May 19, 2021 11:23 AM
To: Tvrtko Ursulin 
Cc: Daniel Stone ; jhubb...@nvidia.com ; nouveau@lists.freedesktop.org 
; Intel Graphics Development ; Maling list - DRI 
developers ; Simon Ser ; Koenig, Christian 
; arit...@nvidia.com ; Nieto, David M 
Subject: Re: [Intel-gfx] [PATCH 0/7] Per client engine busyness

On Wed, May 19, 2021 at 6:16 PM Tvrtko Ursulin
 wrote:


On 18/05/2021 10:40, Tvrtko Ursulin wrote:

On 18/05/2021 10:16, Daniel Stone wrote:

Hi,

On Tue, 18 May 2021 at 10:09, Tvrtko Ursulin
 wrote:

I was just wondering if stat(2) and a chrdev major check would be a
solid criteria to more efficiently (compared to parsing the text
content) detect drm files while walking procfs.

Maybe I'm missing something, but is the per-PID walk actually a
measurable performance issue rather than just a bit unpleasant?

Per pid and per each open fd.

As said in the other thread what bothers me a bit in this scheme is that
the cost of obtaining GPU usage scales based on non-GPU criteria.

For use case of a top-like tool which shows all processes this is a
smaller additional cost, but then for a gpu-top like tool it is somewhat
higher.

To further expand, not only cost would scale per pid multiplies per open
fd, but to detect which of the fds are DRM I see these three options:

1) Open and parse fdinfo.
2) Name based matching ie /dev/dri/.. something.
3) Stat the symlink target and check for DRM major.

stat with symlink following should be plenty fast.


All sound quite sub-optimal to me.

Name based matching is probably the least evil on system resource usage
(Keeping the dentry cache too hot? Too many syscalls?), even though
fundamentally I don't think it is the right approach.

What happens with dup(2) is another question.

We need benchmark numbers showing that on anything remotely realistic
it's an actual problem. Until we've demonstrated it's a real problem
we don't need to solve it.

E.g. top with any sorting enabled also parses way more than it
displays on every update. It seems to be doing Just Fine (tm).


Does anyone have any feedback on the /proc//gpu idea at all?

When we know we have a problem to solve we can take a look at solutions.
-Daniel
--
Daniel Vetter
Software Engineer, Intel Corporation
https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fblog.ffwll.ch%2Fdata=04%7C01%7CChristian.Koenig%40amd.com%7Ced2eccaa081d4cd336d408d91b991ee0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637571166744508313%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=ZihrnanU70nJAM6bHYCjRnURDDCIdwGI85imjGd%2FNgs%3Dreserved=0


___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [Intel-gfx] [PATCH 0/7] Per client engine busyness

2021-05-20 Thread Daniel Vetter
On Wed, May 19, 2021 at 11:17:24PM +, Nieto, David M wrote:
> [AMD Official Use Only]
> 
> Parsing over 550 processes for fdinfo is taking between 40-100ms single
> threaded in a 2GHz skylake IBRS within a VM using simple string
> comparisons and DIRent parsing. And that is pretty much the worst case
> scenario with some more optimized implementations.

I think this is plenty ok, and if it's not you could probably make this
massively faster with io_uring for all the fs operations and whack a
parser-generator on top for real parsing speed.

So imo we shouldn't worry about algorithmic inefficiency of the fdinfo
approach at all, and focuse more on trying to reasonably (but not too
much, this is still drm render stuff after all) standardize how it works
and how we'll extend it all. I think there's tons of good suggestions in
this thread on this topic already.

/me out
-Daniel

> 
> David
> 
> From: Daniel Vetter 
> Sent: Wednesday, May 19, 2021 11:23 AM
> To: Tvrtko Ursulin 
> Cc: Daniel Stone ; jhubb...@nvidia.com 
> ; nouveau@lists.freedesktop.org 
> ; Intel Graphics Development 
> ; Maling list - DRI developers 
> ; Simon Ser ; Koenig, 
> Christian ; arit...@nvidia.com 
> ; Nieto, David M 
> Subject: Re: [Intel-gfx] [PATCH 0/7] Per client engine busyness
> 
> On Wed, May 19, 2021 at 6:16 PM Tvrtko Ursulin
>  wrote:
> >
> >
> > On 18/05/2021 10:40, Tvrtko Ursulin wrote:
> > >
> > > On 18/05/2021 10:16, Daniel Stone wrote:
> > >> Hi,
> > >>
> > >> On Tue, 18 May 2021 at 10:09, Tvrtko Ursulin
> > >>  wrote:
> > >>> I was just wondering if stat(2) and a chrdev major check would be a
> > >>> solid criteria to more efficiently (compared to parsing the text
> > >>> content) detect drm files while walking procfs.
> > >>
> > >> Maybe I'm missing something, but is the per-PID walk actually a
> > >> measurable performance issue rather than just a bit unpleasant?
> > >
> > > Per pid and per each open fd.
> > >
> > > As said in the other thread what bothers me a bit in this scheme is that
> > > the cost of obtaining GPU usage scales based on non-GPU criteria.
> > >
> > > For use case of a top-like tool which shows all processes this is a
> > > smaller additional cost, but then for a gpu-top like tool it is somewhat
> > > higher.
> >
> > To further expand, not only cost would scale per pid multiplies per open
> > fd, but to detect which of the fds are DRM I see these three options:
> >
> > 1) Open and parse fdinfo.
> > 2) Name based matching ie /dev/dri/.. something.
> > 3) Stat the symlink target and check for DRM major.
> 
> stat with symlink following should be plenty fast.
> 
> > All sound quite sub-optimal to me.
> >
> > Name based matching is probably the least evil on system resource usage
> > (Keeping the dentry cache too hot? Too many syscalls?), even though
> > fundamentally I don't think it is the right approach.
> >
> > What happens with dup(2) is another question.
> 
> We need benchmark numbers showing that on anything remotely realistic
> it's an actual problem. Until we've demonstrated it's a real problem
> we don't need to solve it.
> 
> E.g. top with any sorting enabled also parses way more than it
> displays on every update. It seems to be doing Just Fine (tm).
> 
> > Does anyone have any feedback on the /proc//gpu idea at all?
> 
> When we know we have a problem to solve we can take a look at solutions.
> -Daniel
> --
> Daniel Vetter
> Software Engineer, Intel Corporation
> https://nam11.safelinks.protection.outlook.com/?url=http%3A%2F%2Fblog.ffwll.ch%2Fdata=04%7C01%7CDavid.Nieto%40amd.com%7Cf6aea97532cf41f916de08d91af32cc1%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637570453997158377%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=4CFrY9qWbJREcIcSzeO9KIn2P%2Fw6k%2BYdNlh6rdS%2BEh4%3Dreserved=0

-- 
Daniel Vetter
Software Engineer, Intel Corporation
http://blog.ffwll.ch
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] Requests For Proposals for hosting XDC 2022 are now open

2021-05-20 Thread Samuel Iglesias Gonsálvez
Hello everyone!

The X.org board is soliciting proposals to host XDC in 2022. Since
XDC 2021 is being held in Europe this year (although virtually), we've
decided to host in North America. However, the board is open to other
locations, especially if there's an interesting co-location with
another conference.

Of course though, due to the ongoing COVID-19 pandemic it's not yet
clear whether or not it will be possible to host XDC 2022 in person,
although it seems very likely. Because of this, we would like to
make it clear that sponsors should prepare for both the possibility
of an in person conference, and the possibility of a virtual
conference. We will work with organizers on coming up with a
deadline for deciding whether or not we'll be going virtual, likely
sometime around July 2022.

If you're considering hosting XDC, we've assembled a wiki page with
what's generally expected and needed:

https://www.x.org/wiki/Events/RFP/

When submitting your proposal, please make sure to include at least the
key information about the potential location in question, possible
dates along with estimated costs. Proposals can be submitted to board
at foundation.x.org until the deadline of *September 1st, 2021*. 

Additionally, a quick early heads-up to the board if you're
considering hosting would be appreciated, in case we need to adjust the
schedule a bit. Also, earlier is better since there generally will be a
bit of Q&A with organizers.

And if you just have some questions about what organizing XDC entails,
please feel free to chat with previous organizers, or someone from the
board.

Thanks,

Sam



signature.asc
Description: This is a digitally signed message part
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


[Nouveau] XDC 2021: Registration & Call for Proposals now open!

2021-05-20 Thread Szwichtenberg, Radoslaw
Hello!

Registration & Call for Proposals are now open for XDC 2021, which will
take place on September 15-17, 2021. This year we will repeat as
virtual event.

https://indico.freedesktop.org/event/1/

As usual, the conference is free of charge and open to the general
public. If you plan on attending, please make sure to register as early
as possible!

In order to register as attendee, you will therefore need to register
via the XDC website. As XDC moved to a new Indico infrastructure, if
you previously registered on the XDC website, you need to create a new
account again.

https://indico.freedesktop.org/event/1/registrations/1/

In addition to registration, the CfP is now open for talks, workshops
and demos at XDC 2021. While any serious proposal will be gratefully
considered, topics of interest to X.Org and freedesktop.org developers
are encouraged. The program focus is on new development, ongoing
challenges and anything else that will spark discussions among
attendees in the hallway track.

We are open to talks across all layers of the graphics stack, from the
kernel to desktop environments / graphical applications and about how
to make things better for the developers who build them. Head to the
CfP page to learn more:

https://indico.freedesktop.org/event/1/abstracts/

The deadline for submissions is Sunday, 4 July 2021.

Last year we modified our Reimbursement Policy to accept speaker
expenses for X.Org virtual events like XDC 2021. Check it out here:

https://www.x.org/wiki/XorgFoundation/Policies/Reimbursement/

If you have any questions, please send me an email to
radoslaw.szwichtenb...@intel.com,  
adding on CC the X.org board (board
at foundation.x.org).

And don't forget, you can follow us on Twitter for all the latest
updates and to stay connected:


https://twitter.com/XOrgDevConf

Best,

Radek

P.S: a DNS redirection (xdc2021.x.org) is work in progress. Please use
the mentioned links for the moment.


Radosław Szwichtenberg
-
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173, 80-298 Gdansk
KRS 101882 - NIP 957-07-52-316

___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau


Re: [Nouveau] [Intel-gfx] [PATCH 0/7] Per client engine busyness

2021-05-20 Thread Tvrtko Ursulin



On 19/05/2021 19:23, Daniel Vetter wrote:

On Wed, May 19, 2021 at 6:16 PM Tvrtko Ursulin
 wrote:



On 18/05/2021 10:40, Tvrtko Ursulin wrote:


On 18/05/2021 10:16, Daniel Stone wrote:

Hi,

On Tue, 18 May 2021 at 10:09, Tvrtko Ursulin
 wrote:

I was just wondering if stat(2) and a chrdev major check would be a
solid criteria to more efficiently (compared to parsing the text
content) detect drm files while walking procfs.


Maybe I'm missing something, but is the per-PID walk actually a
measurable performance issue rather than just a bit unpleasant?


Per pid and per each open fd.

As said in the other thread what bothers me a bit in this scheme is that
the cost of obtaining GPU usage scales based on non-GPU criteria.

For use case of a top-like tool which shows all processes this is a
smaller additional cost, but then for a gpu-top like tool it is somewhat
higher.


To further expand, not only cost would scale per pid multiplies per open
fd, but to detect which of the fds are DRM I see these three options:

1) Open and parse fdinfo.
2) Name based matching ie /dev/dri/.. something.
3) Stat the symlink target and check for DRM major.


stat with symlink following should be plenty fast.


Maybe. I don't think my point about keeping the dentry cache needlessly 
hot is getting through at all. On my lightly loaded desktop:


 $ sudo lsof | wc -l
 599551

 $ sudo lsof | grep "/dev/dri/" | wc -l
 1965

It's going to look up ~600k pointless dentries in every iteration. Just 
to find a handful of DRM ones. Hard to say if that is better or worse 
than just parsing fdinfo text for all files. Will see.



All sound quite sub-optimal to me.

Name based matching is probably the least evil on system resource usage
(Keeping the dentry cache too hot? Too many syscalls?), even though
fundamentally I don't think it is the right approach.

What happens with dup(2) is another question.


We need benchmark numbers showing that on anything remotely realistic
it's an actual problem. Until we've demonstrated it's a real problem
we don't need to solve it.


Point about dup(2) is whether it is possible to distinguish the 
duplicated fds in fdinfo. If a DRM client dupes, and we found two 
fdinfos each saying client is using 20% GPU, we don't want to add it up 
to 40%.



E.g. top with any sorting enabled also parses way more than it
displays on every update. It seems to be doing Just Fine (tm).


Ha, perceptions differ. I see it using 4-5% while building the kernel on 
a Xeon server which I find quite a lot. :)



Does anyone have any feedback on the /proc//gpu idea at all?


When we know we have a problem to solve we can take a look at solutions.


Yes I don't think the problem would be to add a better solution later, 
so happy to try the fdinfo first. I am simply pointing out a fundamental 
design inefficiency. Even if machines are getting faster and faster I 
don't think that should be an excuse to waste more and more under the 
hood, when a more efficient solution can be designed from the start.


Regards,

Tvrtko
___
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau