Re: unlock mmap(2) for anonymous mappings

2022-02-10 Thread Mark Kettenis
> Date: Thu, 10 Feb 2022 10:19:20 +0000
> From: Klemens Nanni 
> 
> On Mon, Feb 07, 2022 at 02:12:52PM +0100, Mark Kettenis wrote:
> > > Date: Mon,  7 Feb 2022 12:11:42 +0000
> > > From: Klemens Nanni 
> > > 
> > > On Mon, Feb 07, 2022 at 12:41:27PM +0100, Mark Kettenis wrote:
> 
> > > > So there is the existing UVM_MAP_REQ_WRITE().  Compared to
> > > > vm_map_assert_wrlock() it checks the reference count of the map.  I
> > > > think that's questionable, so these should probably be replaced by
> > > > vm_map_assert_wrlock().
> > > 
> > > Yes, I'd replace the old macro with the new functions in a separate diff.
> > 
> > Fair enough.  This has been tested extensively and it doesn't make a
> > lot of sense to start all over again.
> 
> Here's the replacement diff.
> 
> OK?

If this survives a make build, I think this should be safe.

ok kettenis@

> Index: uvm_map.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> retrieving revision 1.282
> diff -u -p -r1.282 uvm_map.c
> --- uvm_map.c 21 Dec 2021 22:21:32 -0000  1.282
> +++ uvm_map.c 10 Feb 2022 09:23:21 -0000
> @@ -326,16 +326,6 @@ vaddr_t uvm_maxkaddr;
>  /*
>   * Locking predicate.
>   */
> -#define UVM_MAP_REQ_WRITE(_map)					\
> - do {\
> - if ((_map)->ref_count > 0) {\
> - if (((_map)->flags & VM_MAP_INTRSAFE) == 0) \
> - rw_assert_wrlock(&(_map)->lock);\
> - else\
> - MUTEX_ASSERT_LOCKED(&(_map)->mtx);  \
> - }   \
> - } while (0)
> -
>  #define vm_map_modflags(map, set, clear)				\
>   do {\
>   mtx_enter(&(map)->flags_lock);  \
> @@ -407,7 +397,7 @@ uvm_mapent_free_insert(struct vm_map *ma
>   KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
>   KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
>  
> - UVM_MAP_REQ_WRITE(map);
> + vm_map_assert_wrlock(map);
>  
>   /* Actual insert: forward to uaddr pointer. */
>   if (uaddr != NULL) {
> @@ -433,7 +423,7 @@ uvm_mapent_free_remove(struct vm_map *ma
>  
>   KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
>   KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
> - UVM_MAP_REQ_WRITE(map);
> + vm_map_assert_wrlock(map);
>  
>   if (uaddr != NULL) {
>   fun = uaddr->uaddr_functions;
> @@ -460,7 +450,7 @@ uvm_mapent_addr_insert(struct vm_map *ma
>   TRACEPOINT(uvm, map_insert,
>   entry->start, entry->end, entry->protection, NULL);
>  
> - UVM_MAP_REQ_WRITE(map);
> + vm_map_assert_wrlock(map);
>   res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
>   if (res != NULL) {
>   panic("uvm_mapent_addr_insert: map %p entry %p "
> @@ -483,7 +473,7 @@ uvm_mapent_addr_remove(struct vm_map *ma
>   TRACEPOINT(uvm, map_remove,
>   entry->start, entry->end, entry->protection, NULL);
>  
> - UVM_MAP_REQ_WRITE(map);
> + vm_map_assert_wrlock(map);
>   res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
>   if (res != entry)
>   panic("uvm_mapent_addr_remove");
> 
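For context, the semantic difference (a sketch based on the macro above and
the assert functions posted earlier in the thread, not the verbatim committed
code): UVM_MAP_REQ_WRITE() skipped the check entirely when ref_count was 0,
whereas the replacement asserts the lock unconditionally:

	void
	vm_map_assert_wrlock(struct vm_map *map)
	{
		/* No ref_count guard: the write lock is required, period. */
		if ((map->flags & VM_MAP_INTRSAFE) == 0)
			rw_assert_wrlock(&map->lock);
		else
			MUTEX_ASSERT_LOCKED(&map->mtx);
	}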



Re: unlock mmap(2) for anonymous mappings

2022-02-10 Thread Klemens Nanni
On Mon, Feb 07, 2022 at 02:12:52PM +0100, Mark Kettenis wrote:
> > Date: Mon,  7 Feb 2022 12:11:42 +0000
> > From: Klemens Nanni 
> > 
> > On Mon, Feb 07, 2022 at 12:41:27PM +0100, Mark Kettenis wrote:

> > > So there is the existing UVM_MAP_REQ_WRITE().  Compared to
> > > vm_map_assert_wrlock() it checks the reference count of the map.  I
> > > think that's questionable, so these should probably be replaced by
> > > vm_map_assert_wrlock().
> > 
> > Yes, I'd replace the old macro with the new functions in a separate diff.
> 
> Fair enough.  This has been tested extensively and it doesn't make a
> lot of sense to start all over again.

Here's the replacement diff.

OK?


Index: uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.282
diff -u -p -r1.282 uvm_map.c
--- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
+++ uvm_map.c   10 Feb 2022 09:23:21 -0000
@@ -326,16 +326,6 @@ vaddr_t uvm_maxkaddr;
 /*
  * Locking predicate.
  */
-#define UVM_MAP_REQ_WRITE(_map)					\
-   do {\
-   if ((_map)->ref_count > 0) {\
-   if (((_map)->flags & VM_MAP_INTRSAFE) == 0) \
-   rw_assert_wrlock(&(_map)->lock);\
-   else\
-   MUTEX_ASSERT_LOCKED(&(_map)->mtx);  \
-   }   \
-   } while (0)
-
 #define vm_map_modflags(map, set, clear)				\
do {\
mtx_enter(&(map)->flags_lock);  \
@@ -407,7 +397,7 @@ uvm_mapent_free_insert(struct vm_map *ma
KDASSERT((entry->fspace & (vaddr_t)PAGE_MASK) == 0);
KASSERT((entry->etype & UVM_ET_FREEMAPPED) == 0);
 
-   UVM_MAP_REQ_WRITE(map);
+   vm_map_assert_wrlock(map);
 
/* Actual insert: forward to uaddr pointer. */
if (uaddr != NULL) {
@@ -433,7 +423,7 @@ uvm_mapent_free_remove(struct vm_map *ma
 
KASSERT((entry->etype & UVM_ET_FREEMAPPED) != 0 || uaddr == NULL);
KASSERT(uvm_map_uaddr_e(map, entry) == uaddr);
-   UVM_MAP_REQ_WRITE(map);
+   vm_map_assert_wrlock(map);
 
if (uaddr != NULL) {
fun = uaddr->uaddr_functions;
@@ -460,7 +450,7 @@ uvm_mapent_addr_insert(struct vm_map *ma
TRACEPOINT(uvm, map_insert,
entry->start, entry->end, entry->protection, NULL);
 
-   UVM_MAP_REQ_WRITE(map);
+   vm_map_assert_wrlock(map);
	res = RBT_INSERT(uvm_map_addr, &map->addr, entry);
if (res != NULL) {
panic("uvm_mapent_addr_insert: map %p entry %p "
@@ -483,7 +473,7 @@ uvm_mapent_addr_remove(struct vm_map *ma
TRACEPOINT(uvm, map_remove,
entry->start, entry->end, entry->protection, NULL);
 
-   UVM_MAP_REQ_WRITE(map);
+   vm_map_assert_wrlock(map);
	res = RBT_REMOVE(uvm_map_addr, &map->addr, entry);
if (res != entry)
panic("uvm_mapent_addr_remove");



Re: unlock mmap(2) for anonymous mappings

2022-02-07 Thread Mark Kettenis
> Date: Mon,  7 Feb 2022 12:11:42 +0000
> From: Klemens Nanni 
> 
> On Mon, Feb 07, 2022 at 12:41:27PM +0100, Mark Kettenis wrote:
> > > Date: Mon,  7 Feb 2022 11:12:50 +0000
> > > From: Klemens Nanni 
> > > 
> > > On Thu, Jan 27, 2022 at 11:11:48AM +0000, Klemens Nanni wrote:
> > > > On Tue, Jan 25, 2022 at 07:54:52AM +0000, Klemens Nanni wrote:
> > > > > > Could we use a vm_map_assert_locked() or something similar that reflects
> > > > > > the exclusive or shared (read) lock comments?  I don't trust comments.
> > > > > > It's too easy to miss a lock in a code path.
> > > > > 
> > > > > This survives desktop usage, running regress and building kernels on
> > > > > amd64.
> > > > > 
> > > > > I'll throw it at sparc64 soon.
> > > > > 
> > > > > > 
> > > > > > > So annotate functions using `size' wrt. the required lock.
> > > > 
> > > > Here's a better diff that asserts for read, write or any type of vm_map
> > > > lock.
> > > > 
> > > > vm_map_lock() and vm_map_lock_read() are used for exclusive and shared
> > > > access respectively;  unless I missed something, no code path is purely
> > > > protected by vm_map_lock_read() alone, i.e. functions called with a read
> > > > lock held by the caller are also called with a write lock elsewhere.
> > > > 
> > > > That means my new vm_map_assert_locked_read() is currently unused, but
> > > > vm_map_assert_locked_any() and vm_map_assert_locked_write() are now used
> > > > to validate existing function comments as well as a few new places.
> > > > 
> > > > amd64 and sparc64 are happy with this, both daily drivers as well as
> > > > build/regress machines.
> > > 
> > > amd64 package bulk (thanks naddy), amd64 regress (thanks bluhm) and
> > > sparc64 base builds/regress are happy with the diff below, the
> > > "uvm_unmap_kill_entry(): unwire with map lock held" locking diff on
> > > tech@ and all three of mmap, munmap, mprotect unlocked.
> > > 
> > > > 
> > > > I'd appreciate more tests and reports about running with this diff and
> > > > mmap(2) unlocked.
> > > 
> > > Here's that tested diff which a) follows rwlock(9)'s naming pattern for
> > > the assert functions and b) syncs lock related comments with NetBSD.
> > > 
> > > It only adds (and updates) comments;  no locking or logic changes.
> > > 
> > > I'd like to get these asserts committed soon (after the required
> > > uvm_unmap_kill_entry() vm_map lock diff) before moving forward with
> > > other (un)locking diffs.
> > 
> > So there is the existing UVM_MAP_REQ_WRITE().  Compared to
> > vm_map_assert_wrlock() it checks the reference count of the map.  I
> > think that's questionable, so these should probably be replaced by
> > vm_map_assert_wrlock().
> 
> Yes, I'd replace the old macro with the new functions in a separate diff.

Fair enough.  This has been tested extensively and it doesn't make a
lot of sense to start all over again.

> 
> > Also...
> > 
> > > Index: uvm_map.c
> > > ===
> > > RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> > > retrieving revision 1.282
> > > diff -u -p -r1.282 uvm_map.c
> > > --- uvm_map.c 21 Dec 2021 22:21:32 -0000  1.282
> > > +++ uvm_map.c 3 Feb 2022 15:46:44 -0000
> > > @@ -498,6 +498,7 @@ uvm_mapent_addr_remove(struct vm_map *ma
> > >  void
> > >  uvm_map_reference(struct vm_map *map)
> > >  {
> > > + vm_map_assert_unlocked(map);
> > 
> > I don't understand this one.  Grabbing a reference should be safe at
> > any time as long as you already hold a reference.
> 
> That's a left-over from when I added asserts rather mechanically based
> on function comments.
> 
> There's no problem with holding a lock while grabbing a reference, so
> I've removed this hunk, thanks.

so ok kettenis@

> Index: uvm_map.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> retrieving revision 1.282
> diff -u -p -r1.282 uvm_map.c
> --- uvm_map.c 21 Dec 2021 22:21:32 -0000  1.282
> +++ uvm_map.c 7 Feb 2022 12:08:41 -0000
> @@ -805,6 +805,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
>   * Fills in *start_ptr and *end_ptr to be the first and last entry describing
>   * the space.
>   * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
> + *
> + * map must be at least read-locked.
>   */
>  int
>  uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
> @@ -815,6 +817,8 @@ uvm_map_isavail(struct vm_map *map, stru
>   struct uvm_map_addr *atree;
>   struct vm_map_entry *i, *i_end;
>  
> + vm_map_assert_anylock(map);
> +
>   if (addr + sz < addr)
>   return 0;
>  
> @@ -892,6 +896,8 @@ uvm_map_isavail(struct vm_map *map, stru
>  /*
>   * Invoke each address selector until an address is found.
>   * Will not invoke uaddr_exe.
> + *
> + * => caller must at least have read-locked map
>   */
>  int
>  uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
> @@ -901,6 +907,8 @@ uvm_map_findspace(struct vm_map *map, st

Re: unlock mmap(2) for anonymous mappings

2022-02-07 Thread Klemens Nanni
On Mon, Feb 07, 2022 at 12:41:27PM +0100, Mark Kettenis wrote:
> > Date: Mon,  7 Feb 2022 11:12:50 +0000
> > From: Klemens Nanni 
> > 
> > On Thu, Jan 27, 2022 at 11:11:48AM +0000, Klemens Nanni wrote:
> > > On Tue, Jan 25, 2022 at 07:54:52AM +0000, Klemens Nanni wrote:
> > > > > Could we use a vm_map_assert_locked() or something similar that reflects
> > > > > the exclusive or shared (read) lock comments?  I don't trust comments.
> > > > > It's too easy to miss a lock in a code path.
> > > > 
> > > > This survives desktop usage, running regress and building kernels on
> > > > amd64.
> > > > 
> > > > I'll throw it at sparc64 soon.
> > > > 
> > > > > 
> > > > > > So annotate functions using `size' wrt. the required lock.
> > > 
> > > Here's a better diff that asserts for read, write or any type of vm_map
> > > lock.
> > > 
> > > vm_map_lock() and vm_map_lock_read() are used for exclusive and shared
> > > access respectively;  unless I missed something, no code path is purely
> > > protected by vm_map_lock_read() alone, i.e. functions called with a read
> > > lock held by the caller are also called with a write lock elsewhere.
> > > 
> > > That means my new vm_map_assert_locked_read() is currently unused, but
> > > vm_map_assert_locked_any() and vm_map_assert_locked_write() are now used
> > > to validate existing function comments as well as a few new places.
> > > 
> > > amd64 and sparc64 are happy with this, both daily drivers as well as
> > > build/regress machines.
> > 
> > amd64 package bulk (thanks naddy), amd64 regress (thanks bluhm) and
> > sparc64 base builds/regress are happy with the diff below, the
> > "uvm_unmap_kill_entry(): unwire with map lock held" locking diff on
> > tech@ and all three of mmap, munmap, mprotect unlocked.
> > 
> > > 
> > > I'd appreciate more tests and reports about running with this diff and
> > > mmap(2) unlocked.
> > 
> > Here's that tested diff which a) follows rwlock(9)'s naming pattern for
> > the assert functions and b) syncs lock related comments with NetBSD.
> > 
> > It only adds (and updates) comments;  no locking or logic changes.
> > 
> > I'd like to get these asserts committed soon (after the required
> > uvm_unmap_kill_entry() vm_map lock diff) before moving forward with
> > other (un)locking diffs.
> 
> So there is the existing UVM_MAP_REQ_WRITE().  Compared to
> vm_map_assert_wrlock() it checks the reference count of the map.  I
> think that's questionable, so these should probably be replaced by
> vm_map_assert_wrlock().

Yes, I'd replace the old macro with the new functions in a separate diff.

> Also...
> 
> > Index: uvm_map.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> > retrieving revision 1.282
> > diff -u -p -r1.282 uvm_map.c
> > --- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
> > +++ uvm_map.c   3 Feb 2022 15:46:44 -0000
> > @@ -498,6 +498,7 @@ uvm_mapent_addr_remove(struct vm_map *ma
> >  void
> >  uvm_map_reference(struct vm_map *map)
> >  {
> > +   vm_map_assert_unlocked(map);
> 
> I don't understand this one.  Grabbing a reference should be safe at
> any time as long as you already hold a reference.

That's a left-over from when I added asserts rather mechanically based
on function comments.

There's no problem with holding a lock while grabbing a reference, so
I've removed this hunk, thanks.


Index: uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.282
diff -u -p -r1.282 uvm_map.c
--- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
+++ uvm_map.c   7 Feb 2022 12:08:41 -0000
@@ -805,6 +805,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
  * the space.
  * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
+ *
+ * map must be at least read-locked.
  */
 int
 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
@@ -815,6 +817,8 @@ uvm_map_isavail(struct vm_map *map, stru
struct uvm_map_addr *atree;
struct vm_map_entry *i, *i_end;
 
+   vm_map_assert_anylock(map);
+
if (addr + sz < addr)
return 0;
 
@@ -892,6 +896,8 @@ uvm_map_isavail(struct vm_map *map, stru
 /*
  * Invoke each address selector until an address is found.
  * Will not invoke uaddr_exe.
+ *
+ * => caller must at least have read-locked map
  */
 int
 uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
@@ -901,6 +907,8 @@ uvm_map_findspace(struct vm_map *map, st
struct uvm_addr_state *uaddr;
int i;
 
+   vm_map_assert_anylock(map);
+
/*
 * Allocation for sz bytes at any address,
 * using the addr selectors in order.
@@ -1160,7 +1168,7 @@ unlock:
  * uvm_map: establish a valid mapping in map
  *
  * => *addr and sz must be a multiple of PAGE_SIZE.
- * => map must be unlocked.
> + * => map must be unlocked (we will lock it)

Re: unlock mmap(2) for anonymous mappings

2022-02-07 Thread Mark Kettenis
> Date: Mon,  7 Feb 2022 11:12:50 +0000
> From: Klemens Nanni 
> 
> On Thu, Jan 27, 2022 at 11:11:48AM +0000, Klemens Nanni wrote:
> > On Tue, Jan 25, 2022 at 07:54:52AM +0000, Klemens Nanni wrote:
> > > > Could we use a vm_map_assert_locked() or something similar that reflects
> > > > the exclusive or shared (read) lock comments?  I don't trust comments.
> > > > It's too easy to miss a lock in a code path.
> > > 
> > > This survives desktop usage, running regress and building kernels on
> > > amd64.
> > > 
> > > I'll throw it at sparc64 soon.
> > > 
> > > > 
> > > > > So annotate functions using `size' wrt. the required lock.
> > 
> > Here's a better diff that asserts for read, write or any type of vm_map
> > lock.
> > 
> > vm_map_lock() and vm_map_lock_read() are used for exclusive and shared
> > access respectively;  unless I missed something, no code path is purely
> > protected by vm_map_lock_read() alone, i.e. functions called with a read
> > lock held by the caller are also called with a write lock elsewhere.
> > 
> > That means my new vm_map_assert_locked_read() is currently unused, but
> > vm_map_assert_locked_any() and vm_map_assert_locked_write() are now used
> > to validate existing function comments as well as a few new places.
> > 
> > amd64 and sparc64 are happy with this, both daily drivers as well as
> > build/regress machines.
> 
> amd64 package bulk (thanks naddy), amd64 regress (thanks bluhm) and
> sparc64 base builds/regress are happy with the diff below, the
> "uvm_unmap_kill_entry(): unwire with map lock held" locking diff on
> tech@ and all three of mmap, munmap, mprotect unlocked.
> 
> > 
> > I'd appreciate more tests and reports about running with this diff and
> > mmap(2) unlocked.
> 
> Here's that tested diff which a) follows rwlock(9)'s naming pattern for
> the assert functions and b) syncs lock related comments with NetBSD.
> 
> It only adds (and updates) comments;  no locking or logic changes.
> 
> I'd like to get these asserts committed soon (after the required
> uvm_unmap_kill_entry() vm_map lock diff) before moving forward with
> other (un)locking diffs.

So there is the existing UVM_MAP_REQ_WRITE().  Compared to
vm_map_assert_wrlock() it checks the reference count of the map.  I
think that's questionable, so these should probably be replaced by
vm_map_assert_wrlock().

Also...

> Index: uvm_map.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> retrieving revision 1.282
> diff -u -p -r1.282 uvm_map.c
> --- uvm_map.c 21 Dec 2021 22:21:32 -0000  1.282
> +++ uvm_map.c 3 Feb 2022 15:46:44 -0000
> @@ -498,6 +498,7 @@ uvm_mapent_addr_remove(struct vm_map *ma
>  void
>  uvm_map_reference(struct vm_map *map)
>  {
> + vm_map_assert_unlocked(map);

I don't understand this one.  Grabbing a reference should be safe at
any time as long as you already hold a reference.

>   atomic_inc_int(&map->ref_count);
>  }
>  
> @@ -805,6 +806,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
>   * Fills in *start_ptr and *end_ptr to be the first and last entry describing
>   * the space.
>   * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
> + *
> + * map must be at least read-locked.
>   */
>  int
>  uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
> @@ -815,6 +818,8 @@ uvm_map_isavail(struct vm_map *map, stru
>   struct uvm_map_addr *atree;
>   struct vm_map_entry *i, *i_end;
>  
> + vm_map_assert_anylock(map);
> +
>   if (addr + sz < addr)
>   return 0;
>  
> @@ -892,6 +897,8 @@ uvm_map_isavail(struct vm_map *map, stru
>  /*
>   * Invoke each address selector until an address is found.
>   * Will not invoke uaddr_exe.
> + *
> + * => caller must at least have read-locked map
>   */
>  int
>  uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
> @@ -901,6 +908,8 @@ uvm_map_findspace(struct vm_map *map, st
>   struct uvm_addr_state *uaddr;
>   int i;
>  
> + vm_map_assert_anylock(map);
> +
>   /*
>* Allocation for sz bytes at any address,
>* using the addr selectors in order.
> @@ -1160,7 +1169,7 @@ unlock:
>   * uvm_map: establish a valid mapping in map
>   *
>   * => *addr and sz must be a multiple of PAGE_SIZE.
> - * => map must be unlocked.
> + * => map must be unlocked (we will lock it)
>   * =>  value meanings (4 cases):
>   *   [1]   == uoffset is a hint for PMAP_PREFER
>   *   [2]== don't PMAP_PREFER
> @@ -1821,6 +1830,8 @@ boolean_t
>  uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
>  struct vm_map_entry **entry)
>  {
> + vm_map_assert_anylock(map);
> +
> + *entry = uvm_map_entrybyaddr(&map->addr, address);
>   return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
>   (*entry)->start <= address && (*entry)->end > address;
> @@ -2206,6 +2217,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
>   * If remove_holes, then remove ET_HOLE entries as well.
>   * If markfree, entry will be properly marked free, otherwise, no replacement
>   * entry will be put in the tree (corrupting the tree).

Re: unlock mmap(2) for anonymous mappings

2022-02-07 Thread Klemens Nanni
On Thu, Jan 27, 2022 at 11:11:48AM +0000, Klemens Nanni wrote:
> On Tue, Jan 25, 2022 at 07:54:52AM +0000, Klemens Nanni wrote:
> > > Could we use a vm_map_assert_locked() or something similar that reflects
> > > the exclusive or shared (read) lock comments?  I don't trust comments.
> > > It's too easy to miss a lock in a code path.
> > 
> > This survives desktop usage, running regress and building kernels on
> > amd64.
> > 
> > I'll throw it at sparc64 soon.
> > 
> > > 
> > > > So annotate functions using `size' wrt. the required lock.
> 
> Here's a better diff that asserts for read, write or any type of vm_map
> lock.
> 
> vm_map_lock() and vm_map_lock_read() are used for exclusive and shared
> access respectively;  unless I missed something, no code path is purely
> protected by vm_map_lock_read() alone, i.e. functions called with a read
> lock held by the caller are also called with a write lock elsewhere.
> 
> That means my new vm_map_assert_locked_read() is currently unused, but
> vm_map_assert_locked_any() and vm_map_assert_locked_write() are now used
> to validate existing function comments as well as a few new places.
> 
> amd64 and sparc64 are happy with this, both daily drivers as well as
> build/regress machines.

amd64 package bulk (thanks naddy), amd64 regress (thanks bluhm) and
sparc64 base builds/regress are happy with the diff below, the
"uvm_unmap_kill_entry(): unwire with map lock held" locking diff on
tech@ and all three of mmap, munmap, mprotect unlocked.

> 
> I'd appreciate more tests and reports about running with this diff and
> mmap(2) unlocked.

Here's that tested diff which a) follows rwlock(9)'s naming pattern for
the assert functions and b) syncs lock related comments with NetBSD.

It only adds (and updates) comments;  no locking or logic changes.

I'd like to get these asserts committed soon (after the required
uvm_unmap_kill_entry() vm_map lock diff) before moving forward with
other (un)locking diffs.
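For reference, rwlock(9)'s asserts are rw_assert_wrlock(), rw_assert_rdlock()
and rw_assert_anylock(), so the vm_map wrappers become vm_map_assert_wrlock()
and friends.  A sketch of the resulting interface, assuming the file/line
(_ln) wrapper convention the other vm_map lock functions use (not the
verbatim header change):

	void	vm_map_assert_anylock_ln(struct vm_map *, char *, int);
	void	vm_map_assert_rdlock_ln(struct vm_map *, char *, int);
	void	vm_map_assert_wrlock_ln(struct vm_map *, char *, int);

	#define vm_map_assert_anylock(map)	\
		vm_map_assert_anylock_ln(map, __FILE__, __LINE__)
	#define vm_map_assert_rdlock(map)	\
		vm_map_assert_rdlock_ln(map, __FILE__, __LINE__)
	#define vm_map_assert_wrlock(map)	\
		vm_map_assert_wrlock_ln(map, __FILE__, __LINE__)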


Index: uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.282
diff -u -p -r1.282 uvm_map.c
--- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
+++ uvm_map.c   3 Feb 2022 15:46:44 -0000
@@ -498,6 +498,7 @@ uvm_mapent_addr_remove(struct vm_map *ma
 void
 uvm_map_reference(struct vm_map *map)
 {
+   vm_map_assert_unlocked(map);
	atomic_inc_int(&map->ref_count);
 }
 
@@ -805,6 +806,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
  * the space.
  * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
+ *
+ * map must be at least read-locked.
  */
 int
 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
@@ -815,6 +818,8 @@ uvm_map_isavail(struct vm_map *map, stru
struct uvm_map_addr *atree;
struct vm_map_entry *i, *i_end;
 
+   vm_map_assert_anylock(map);
+
if (addr + sz < addr)
return 0;
 
@@ -892,6 +897,8 @@ uvm_map_isavail(struct vm_map *map, stru
 /*
  * Invoke each address selector until an address is found.
  * Will not invoke uaddr_exe.
+ *
+ * => caller must at least have read-locked map
  */
 int
 uvm_map_findspace(struct vm_map *map, struct vm_map_entry**first,
@@ -901,6 +908,8 @@ uvm_map_findspace(struct vm_map *map, st
struct uvm_addr_state *uaddr;
int i;
 
+   vm_map_assert_anylock(map);
+
/*
 * Allocation for sz bytes at any address,
 * using the addr selectors in order.
@@ -1160,7 +1169,7 @@ unlock:
  * uvm_map: establish a valid mapping in map
  *
  * => *addr and sz must be a multiple of PAGE_SIZE.
- * => map must be unlocked.
+ * => map must be unlocked (we will lock it)
  * =>  value meanings (4 cases):
  * [1]   == uoffset is a hint for PMAP_PREFER
  * [2]== don't PMAP_PREFER
@@ -1821,6 +1830,8 @@ boolean_t
 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 struct vm_map_entry **entry)
 {
+   vm_map_assert_anylock(map);
+
	*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
(*entry)->start <= address && (*entry)->end > address;
@@ -2206,6 +2217,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
  * If remove_holes, then remove ET_HOLE entries as well.
  * If markfree, entry will be properly marked free, otherwise, no replacement
  * entry will be put in the tree (corrupting the tree).
+ *
+ * => map must be locked by caller
  */
 void
 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
@@ -2214,6 +2227,8 @@ uvm_unmap_remove(struct vm_map *map, vad
 {
struct vm_map_entry *prev_hint, *next, *entry;
 
+   vm_map_assert_wrlock(map);
+
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
@@ -2304,6 +2319,8 @@ uvm_map_pageable_pgon(struct vm_map *map
 {
struct vm_map_entry *iter;

Re: unlock mmap(2) for anonymous mappings

2022-01-27 Thread Klemens Nanni
On Tue, Jan 25, 2022 at 07:54:52AM +0000, Klemens Nanni wrote:
> > Could we use a vm_map_assert_locked() or something similar that reflects
> > the exclusive or shared (read) lock comments?  I don't trust comments.
> > It's too easy to miss a lock in a code path.
> 
> This survives desktop usage, running regress and building kernels on
> amd64.
> 
> I'll throw it at sparc64 soon.
> 
> > 
> > > So annotate functions using `size' wrt. the required lock.

Here's a better diff that asserts for read, write or any type of vm_map
lock.

vm_map_lock() and vm_map_lock_read() are used for exclusive and shared
access respectively;  unless I missed something, no code path is purely
protected by vm_map_lock_read() alone, i.e. functions called with a read
lock held by the caller are also called with a write lock elsewhere.

That means my new vm_map_assert_locked_read() is currently unused, but
vm_map_assert_locked_any() and vm_map_assert_locked_write() are now used
to validate existing function comments as well as a few new places.

amd64 and sparc64 are happy with this, both daily drivers as well as
build/regress machines.

I'd appreciate more tests and reports about running with this diff and
mmap(2) unlocked.
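As a usage sketch (a hypothetical caller, assuming the uvm_unmap_remove()
signature and deadq helpers as in this tree; not code from the diff): the
caller takes the exclusive map lock and the callee's assert verifies it at
runtime:

	void
	example_unmap(struct vm_map *map, vaddr_t start, vaddr_t end)
	{
		struct uvm_map_deadq dead;

		TAILQ_INIT(&dead);
		vm_map_lock(map);	/* exclusive: satisfies the write assert */
		uvm_unmap_remove(map, start, end, &dead, FALSE, TRUE);
		vm_map_unlock(map);
		uvm_unmap_detach(&dead, 0);	/* free dead entries unlocked */
	}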

Index: uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.282
diff -u -p -r1.282 uvm_map.c
--- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
+++ uvm_map.c   27 Jan 2022 10:45:47 -0000
@@ -805,6 +805,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
  * the space.
  * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
+ *
+ * map must be at least read-locked.
  */
 int
 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
@@ -815,6 +817,8 @@ uvm_map_isavail(struct vm_map *map, stru
struct uvm_map_addr *atree;
struct vm_map_entry *i, *i_end;
 
+   vm_map_assert_locked_any(map);
+
if (addr + sz < addr)
return 0;
 
@@ -1821,6 +1825,8 @@ boolean_t
 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 struct vm_map_entry **entry)
 {
+   vm_map_assert_locked_any(map);
+
	*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
(*entry)->start <= address && (*entry)->end > address;
@@ -2206,6 +2212,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
  * If remove_holes, then remove ET_HOLE entries as well.
  * If markfree, entry will be properly marked free, otherwise, no replacement
  * entry will be put in the tree (corrupting the tree).
+ *
+ * map must be locked.
  */
 void
 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
@@ -2214,6 +2222,8 @@ uvm_unmap_remove(struct vm_map *map, vad
 {
struct vm_map_entry *prev_hint, *next, *entry;
 
+   vm_map_assert_locked_write(map);
+
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
@@ -2976,12 +2986,17 @@ uvm_tree_sanity(struct vm_map *map, char
UVM_ASSERT(map, addr == vm_map_max(map), file, line);
 }
 
+/*
+ * map must be at least read-locked.
+ */
 void
 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
 {
struct vm_map_entry *iter;
vsize_t size;
 
+   vm_map_assert_locked_any(map);
+
size = 0;
	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
if (!UVM_ET_ISHOLE(iter))
@@ -4268,7 +4283,7 @@ uvm_map_submap(struct vm_map *map, vaddr
  * uvm_map_checkprot: check protection in map
  *
  * => must allow specific protection in a fully allocated region.
- * => map mut be read or write locked by caller.
+ * => map must be read or write locked by caller.
  */
 boolean_t
 uvm_map_checkprot(struct vm_map *map, vaddr_t start, vaddr_t end,
@@ -4276,6 +4291,8 @@ uvm_map_checkprot(struct vm_map *map, va
 {
struct vm_map_entry *entry;
 
+   vm_map_assert_locked_any(map);
+
if (start < map->min_offset || end > map->max_offset || start > end)
return FALSE;
if (start == end)
@@ -5557,6 +5577,46 @@ vm_map_unbusy_ln(struct vm_map *map, cha
	mtx_leave(&map->flags_lock);
	if (oflags & VM_MAP_WANTLOCK)
		wakeup(&map->flags);
+}
+
+void
+vm_map_assert_locked_any_ln(struct vm_map *map, char *file, int line)
+{
+   LPRINTF(("map assert read or write locked: %p (at %s %d)\n", map, file, 
line));
+   if ((map->flags & VM_MAP_INTRSAFE) == 0)
+   rw_assert_anylock(>lock);
+   else
+   MUTEX_ASSERT_LOCKED(>mtx);
+}
+
+void
+vm_map_assert_locked_read_ln(struct vm_map *map, char *file, int line)
+{
+   LPRINTF(("map assert read locked: %p (at %s %d)\n", map, file, line));
+   if ((map->flags & VM_MAP_INTRSAFE) == 0)
+   rw_assert_rdlock(&map->lock);
+   else
+   MUTEX_ASSERT_LOCKED(&map->mtx);
+}
+
+void
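+vm_map_assert_locked_write_ln(struct vm_map *map, char *file, int line)
+{
+   /* (Reconstruction: the archive truncates the message here.  This
+    * follows the pattern of the two assert functions above and is not
+    * the verbatim diff.) */
+   LPRINTF(("map assert write locked: %p (at %s %d)\n", map, file, line));
+   if ((map->flags & VM_MAP_INTRSAFE) == 0)
+   rw_assert_wrlock(&map->lock);
+   else
+   MUTEX_ASSERT_LOCKED(&map->mtx);
+}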

Re: unlock mmap(2) for anonymous mappings

2022-01-24 Thread Klemens Nanni
On Mon, Jan 24, 2022 at 12:08:33PM -0300, Martin Pieuchot wrote:
> On 24/01/22(Mon) 12:06, Klemens Nanni wrote:
> > On Sun, Jan 16, 2022 at 09:22:50AM -0300, Martin Pieuchot wrote:
> > > IMHO this approach of let's try if it works now and revert if it isn't
> > > doesn't help us make progress.  I'd be more confident seeing diffs that
> > > assert for the right lock in the functions called by uvm_mapanon() and
> > > documentation about which lock is protecting which field of the data
> > > structures.
> > 
> > I picked `vm_map's `size' as the first underdocumented member.
> > All accesses to it are protected by vm_map_lock(), either through the
> > function itself or implicitly by already requiring the calling function
> > to lock the map.
> 
> > Could we use a vm_map_assert_locked() or something similar that reflects
> the exclusive or shared (read) lock comments?  I don't trust comments.
> It's too easy to miss a lock in a code path.

This survives desktop usage, running regress and building kernels on
amd64.

I'll throw it at sparc64 soon.

> 
> > So annotate functions using `size' wrt. the required lock.

Index: uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.282
diff -u -p -r1.282 uvm_map.c
--- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
+++ uvm_map.c   25 Jan 2022 07:37:32 -0000
@@ -805,6 +805,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
  * the space.
  * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
+ *
+ * map must be at least read-locked.
  */
 int
 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
@@ -815,6 +817,8 @@ uvm_map_isavail(struct vm_map *map, stru
struct uvm_map_addr *atree;
struct vm_map_entry *i, *i_end;
 
+   vm_map_assert_locked(map);
+
if (addr + sz < addr)
return 0;
 
@@ -986,6 +990,8 @@ uvm_mapanon(struct vm_map *map, vaddr_t 
vaddr_t  pmap_align, pmap_offset;
vaddr_t  hint;
 
+   vm_map_assert_unlocked(map);
+
KASSERT((map->flags & VM_MAP_ISVMSPACE) == VM_MAP_ISVMSPACE);
KASSERT(map != kernel_map);
KASSERT((map->flags & UVM_FLAG_HOLE) == 0);
@@ -1189,6 +1195,8 @@ uvm_map(struct vm_map *map, vaddr_t *add
vaddr_t  pmap_align, pmap_offset;
vaddr_t  hint;
 
+   vm_map_assert_unlocked(map);
+
if ((map->flags & VM_MAP_INTRSAFE) == 0)
splassert(IPL_NONE);
else
@@ -1821,6 +1829,8 @@ boolean_t
 uvm_map_lookup_entry(struct vm_map *map, vaddr_t address,
 struct vm_map_entry **entry)
 {
+   vm_map_assert_locked(map);
+
	*entry = uvm_map_entrybyaddr(&map->addr, address);
return *entry != NULL && !UVM_ET_ISHOLE(*entry) &&
(*entry)->start <= address && (*entry)->end > address;
@@ -2206,6 +2216,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
  * If remove_holes, then remove ET_HOLE entries as well.
  * If markfree, entry will be properly marked free, otherwise, no replacement
  * entry will be put in the tree (corrupting the tree).
+ *
+ * map must be locked.
  */
 void
 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
@@ -2214,6 +2226,8 @@ uvm_unmap_remove(struct vm_map *map, vad
 {
struct vm_map_entry *prev_hint, *next, *entry;
 
+   vm_map_assert_locked(map);
+
start = MAX(start, map->min_offset);
end = MIN(end, map->max_offset);
if (start >= end)
@@ -2976,13 +2990,17 @@ uvm_tree_sanity(struct vm_map *map, char
UVM_ASSERT(map, addr == vm_map_max(map), file, line);
 }
 
+/*
+ * map must be at least read-locked.
+ */
 void
 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
 {
struct vm_map_entry *iter;
-   vsize_t size;
+   vsize_t size = 0;
+
+   vm_map_assert_locked(map);
 
-   size = 0;
	RBT_FOREACH(iter, uvm_map_addr, &map->addr) {
if (!UVM_ET_ISHOLE(iter))
size += iter->end - iter->start;
@@ -3320,6 +3338,8 @@ uvm_map_protect(struct vm_map *map, vadd
vsize_t dused;
int error;
 
+   vm_map_assert_unlocked(map);
+
if (start > end)
return EINVAL;
start = MAX(start, map->min_offset);
@@ -4096,6 +4116,7 @@ uvmspace_fork(struct process *pr)
struct vm_map_entry *old_entry, *new_entry;
struct uvm_map_deadq dead;
 
+   vm_map_assert_unlocked(old_map);
vm_map_lock(old_map);
 
vm2 = uvmspace_alloc(old_map->min_offset, old_map->max_offset,
@@ -4236,6 +4257,8 @@ uvm_map_submap(struct vm_map *map, vaddr
struct vm_map_entry *entry;
int result;
 
+   vm_map_assert_unlocked(map);
+
if (start > map->max_offset || end > map->max_offset ||
start < map->min_offset || end < map->min_offset)
return 

Re: unlock mmap(2) for anonymous mappings

2022-01-24 Thread Martin Pieuchot
On 24/01/22(Mon) 12:06, Klemens Nanni wrote:
> On Sun, Jan 16, 2022 at 09:22:50AM -0300, Martin Pieuchot wrote:
> > IMHO this approach of let's try if it works now and revert if it isn't
> > doesn't help us make progress.  I'd be more confident seeing diffs that
> > assert for the right lock in the functions called by uvm_mapanon() and
> > documentation about which lock is protecting which field of the data
> > structures.
> 
> I picked `vm_map's `size' as the first underdocumented member.
> All accesses to it are protected by vm_map_lock(), either through the
> function itself or implicitly by already requiring the calling function
> to lock the map.

Could we use a vm_map_assert_locked() or something similar that reflects
the exclusive or shared (read) lock comments?  I don't trust comments.
It's too easy to miss a lock in a code path.

> So annotate functions using `size' wrt. the required lock.
> 
> Index: uvm_map.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.c,v
> retrieving revision 1.282
> diff -u -p -r1.282 uvm_map.c
> --- uvm_map.c 21 Dec 2021 22:21:32 -0000  1.282
> +++ uvm_map.c 21 Jan 2022 13:03:05 -0000
> @@ -805,6 +805,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
>   * Fills in *start_ptr and *end_ptr to be the first and last entry describing
>   * the space.
>   * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
> + *
> + * map must be at least read-locked.
>   */
>  int
>  uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
> @@ -2206,6 +2208,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
>   * If remove_holes, then remove ET_HOLE entries as well.
>   * If markfree, entry will be properly marked free, otherwise, no replacement
>   * entry will be put in the tree (corrupting the tree).
> + *
> + * map must be locked.
>   */
>  void
>  uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
> @@ -2976,6 +2980,9 @@ uvm_tree_sanity(struct vm_map *map, char
>   UVM_ASSERT(map, addr == vm_map_max(map), file, line);
>  }
>  
> +/*
> + * map must be at least read-locked.
> + */
>  void
>  uvm_tree_size_chk(struct vm_map *map, char *file, int line)
>  {
> Index: uvm_map.h
> ===
> RCS file: /cvs/src/sys/uvm/uvm_map.h,v
> retrieving revision 1.71
> diff -u -p -r1.71 uvm_map.h
> --- uvm_map.h 15 Dec 2021 12:53:53 -0000  1.71
> +++ uvm_map.h 21 Jan 2022 12:51:26 -0000
> @@ -272,7 +272,7 @@ struct vm_map {
>  
>   struct uvm_map_addr addr;   /* [v] Entry tree, by addr */
>  
> - vsize_t size;   /* virtual size */
> + vsize_t size;   /* [v] virtual size */
>   int ref_count;  /* [a] Reference count */
>   int flags;  /* flags */
>   struct mutex flags_lock; /* flags lock */
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-24 Thread Klemens Nanni
On Sun, Jan 16, 2022 at 09:22:50AM -0300, Martin Pieuchot wrote:
> IMHO this approach of let's try if it works now and revert if it isn't
> doesn't help us make progress.  I'd be more confident seeing diffs that
> assert for the right lock in the functions called by uvm_mapanon() and
> documentation about which lock is protecting which field of the data
> structures.

I picked `vm_map's `size' as the first underdocumented member.
All accesses to it are protected by vm_map_lock(), either through the
function itself or implicitly by already requiring the calling function
to lock the map.

So annotate functions using `size' wrt. the required lock.
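A minimal sketch of that discipline (illustrative only, not a hunk from the
diff below): every path that changes `size' runs with the map exclusively
locked, so readers holding at least a read lock see a consistent value:

	vm_map_lock(map);			/* exclusive lock */
	/* ... insert an entry ... */
	map->size += entry->end - entry->start;	/* `size' access is serialized */
	vm_map_unlock(map);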

Index: uvm_map.c
===
RCS file: /cvs/src/sys/uvm/uvm_map.c,v
retrieving revision 1.282
diff -u -p -r1.282 uvm_map.c
--- uvm_map.c   21 Dec 2021 22:21:32 -0000  1.282
+++ uvm_map.c   21 Jan 2022 13:03:05 -0000
@@ -805,6 +805,8 @@ uvm_map_sel_limits(vaddr_t *min, vaddr_t
  * Fills in *start_ptr and *end_ptr to be the first and last entry describing
  * the space.
  * If called with prefilled *start_ptr and *end_ptr, they are to be correct.
+ *
+ * map must be at least read-locked.
  */
 int
 uvm_map_isavail(struct vm_map *map, struct uvm_addr_state *uaddr,
@@ -2206,6 +2208,8 @@ uvm_unmap_kill_entry(struct vm_map *map,
  * If remove_holes, then remove ET_HOLE entries as well.
  * If markfree, entry will be properly marked free, otherwise, no replacement
  * entry will be put in the tree (corrupting the tree).
+ *
+ * map must be locked.
  */
 void
 uvm_unmap_remove(struct vm_map *map, vaddr_t start, vaddr_t end,
@@ -2976,6 +2980,9 @@ uvm_tree_sanity(struct vm_map *map, char
UVM_ASSERT(map, addr == vm_map_max(map), file, line);
 }
 
+/*
+ * map must be at least read-locked.
+ */
 void
 uvm_tree_size_chk(struct vm_map *map, char *file, int line)
 {
Index: uvm_map.h
===
RCS file: /cvs/src/sys/uvm/uvm_map.h,v
retrieving revision 1.71
diff -u -p -r1.71 uvm_map.h
--- uvm_map.h   15 Dec 2021 12:53:53 -0000  1.71
+++ uvm_map.h   21 Jan 2022 12:51:26 -0000
@@ -272,7 +272,7 @@ struct vm_map {
 
struct uvm_map_addr addr;   /* [v] Entry tree, by addr */
 
-   vsize_t size;   /* virtual size */
+   vsize_t size;   /* [v] virtual size */
int ref_count;  /* [a] Reference count */
int flags;  /* flags */
	struct mutex flags_lock; /* flags lock */



Re: unlock mmap(2) for anonymous mappings

2022-01-16 Thread Martin Pieuchot
On 14/01/22(Fri) 23:01, Mark Kettenis wrote:
> > Date: Tue, 11 Jan 2022 23:13:20 +0000
> > From: Klemens Nanni 
> > 
> > On Tue, Jan 11, 2022 at 09:54:44AM -0700, Theo de Raadt wrote:
> > > > Now this is clearly a "slow" path.  I don't think there is any reason
> > > > not to put all the code in that if (uvm_wxabort) block under the
> > > > kernel lock.  For now I think making access to ps_wxcounter atomic is
> > > > just too fine grained.
> > > 
> > > Right.  Lock the whole block.
> > 
> > Thanks everyone, here's the combined diff for that.
> 
> I think mpi@ should be involved in the actual unlocking of mmap(2),
> munmap(2) and mprotect(2).  But the changes to uvm_mmap.c are ok
> kettenis@ and can be committed now.

It isn't clear to me what changed since the last time this has been
tried.  Why is it safe now?  What are the locking assumptions?  

IMHO this approach of let's try if it works now and revert if it isn't
doesn't help us make progress.  I'd be more confident seeing diffs that
assert for the right lock in the functions called by uvm_mapanon() and
documentation about which lock is protecting which field of the data
structures.

NetBSD has done much of this and the code bases do not diverge much so
it can be useful to look there as well.



Re: unlock mmap(2) for anonymous mappings

2022-01-14 Thread Mark Kettenis
> Date: Tue, 11 Jan 2022 23:13:20 +0000
> From: Klemens Nanni 
> 
> On Tue, Jan 11, 2022 at 09:54:44AM -0700, Theo de Raadt wrote:
> > > Now this is clearly a "slow" path.  I don't think there is any reason
> > > not to put all the code in that if (uvm_wxabort) block under the
> > > kernel lock.  For now I think making access to ps_wxcounter atomic is
> > > just too fine grained.
> > 
> > Right.  Lock the whole block.
> 
> Thanks everyone, here's the combined diff for that.

I think mpi@ should be involved in the actual unlocking of mmap(2),
munmap(2) and mprotect(2).  But the changes to uvm_mmap.c are ok
kettenis@ and can be committed now.

> Index: kern/syscalls.master
> ===
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.222
> diff -u -p -r1.222 syscalls.master
> --- kern/syscalls.master  11 Jan 2022 08:09:14 -0000  1.222
> +++ kern/syscalls.master  11 Jan 2022 23:10:50 -0000
> @@ -126,7 +126,7 @@
>   struct sigaction *osa); }
>  47   STD NOLOCK  { gid_t sys_getgid(void); }
>  48   STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
> -49   STD { void *sys_mmap(void *addr, size_t len, int prot, \
> +49   STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
>   int flags, int fd, off_t pos); }
>  50   STD { int sys_setlogin(const char *namebuf); }
>  #ifdef ACCOUNTING
> Index: uvm/uvm_mmap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 uvm_mmap.c
> --- uvm/uvm_mmap.c 5 Jan 2022 17:53:44 -0000   1.168
> +++ uvm/uvm_mmap.c 11 Jan 2022 23:02:13 -0000
> @@ -183,12 +183,14 @@ uvm_wxcheck(struct proc *p, char *call)
>   return 0;
>  
>   if (uvm_wxabort) {
> + KERNEL_LOCK();
>   /* Report W^X failures */
>   if (pr->ps_wxcounter++ == 0)
>   log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
>   pr->ps_comm, pr->ps_pid, call);
>   /* Send uncatchable SIGABRT for coredump */
>   sigexit(p, SIGABRT);
> + KERNEL_UNLOCK();
>   }
>  
>   return ENOTSUP;
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-13 Thread Christian Weisgerber
Klemens Nanni:

> > > Now this is clearly a "slow" path.  I don't think there is any reason
> > > not to put all the code in that if (uvm_wxabort) block under the
> > > kernel lock.  For now I think making access to ps_wxcounter atomic is
> > > just too fine grained.
> > 
> > Right.  Lock the whole block.
> 
> Thanks everyone, here's the combined diff for that.
-snip-

FWIW, I ran an amd64 package bulk build with this.  No problems.

-- 
Christian "naddy" Weisgerber  na...@mips.inka.de



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Theo de Raadt
I am saying I want better study on all architectures, so that I don't
hit problems (first) in the snapshots cluster.

Theo de Raadt  wrote:

> I am blocking this.
> 
> I don't see any messaging which suggests this has been run on all
> architectures.  It might still hit some MD code which is wrong.
> 
> Klemens Nanni  wrote:
> 
> > On Tue, Jan 11, 2022 at 09:54:44AM -0700, Theo de Raadt wrote:
> > > > Now this is clearly a "slow" path.  I don't think there is any reason
> > > > not to put all the code in that if (uvm_wxabort) block under the
> > > > kernel lock.  For now I think making access to ps_wxcounter atomic is
> > > > just too fine grained.
> > > 
> > > Right.  Lock the whole block.
> > 
> > Thanks everyone, here's the combined diff for that.
> > 
> > Index: kern/syscalls.master
> > ===
> > RCS file: /cvs/src/sys/kern/syscalls.master,v
> > retrieving revision 1.222
> > diff -u -p -r1.222 syscalls.master
> > --- kern/syscalls.master 11 Jan 2022 08:09:14 -0000  1.222
> > +++ kern/syscalls.master 11 Jan 2022 23:10:50 -0000
> > @@ -126,7 +126,7 @@
> > struct sigaction *osa); }
> >  47 STD NOLOCK  { gid_t sys_getgid(void); }
> >  48 STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
> > -49 STD { void *sys_mmap(void *addr, size_t len, int prot, \
> > +49 STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
> > int flags, int fd, off_t pos); }
> >  50 STD { int sys_setlogin(const char *namebuf); }
> >  #ifdef ACCOUNTING
> > Index: uvm/uvm_mmap.c
> > ===
> > RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> > retrieving revision 1.168
> > diff -u -p -r1.168 uvm_mmap.c
> > --- uvm/uvm_mmap.c  5 Jan 2022 17:53:44 -0000   1.168
> > +++ uvm/uvm_mmap.c  11 Jan 2022 23:02:13 -0000
> > @@ -183,12 +183,14 @@ uvm_wxcheck(struct proc *p, char *call)
> > return 0;
> >  
> > if (uvm_wxabort) {
> > +   KERNEL_LOCK();
> > /* Report W^X failures */
> > if (pr->ps_wxcounter++ == 0)
> > log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
> > pr->ps_comm, pr->ps_pid, call);
> > /* Send uncatchable SIGABRT for coredump */
> > sigexit(p, SIGABRT);
> > +   KERNEL_UNLOCK();
> > }
> >  
> > return ENOTSUP;
> > 
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Theo de Raadt
I am blocking this.

I don't see any messaging which suggests this has been run on all
architectures.  It might still hit some MD code which is wrong.

Klemens Nanni  wrote:

> On Tue, Jan 11, 2022 at 09:54:44AM -0700, Theo de Raadt wrote:
> > > Now this is clearly a "slow" path.  I don't think there is any reason
> > > not to put all the code in that if (uvm_wxabort) block under the
> > > kernel lock.  For now I think making access to ps_wxcounter atomic is
> > > just too fine grained.
> > 
> > Right.  Lock the whole block.
> 
> Thanks everyone, here's the combined diff for that.
> 
> Index: kern/syscalls.master
> ===
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.222
> diff -u -p -r1.222 syscalls.master
> --- kern/syscalls.master  11 Jan 2022 08:09:14 -0000  1.222
> +++ kern/syscalls.master  11 Jan 2022 23:10:50 -0000
> @@ -126,7 +126,7 @@
>   struct sigaction *osa); }
>  47   STD NOLOCK  { gid_t sys_getgid(void); }
>  48   STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
> -49   STD { void *sys_mmap(void *addr, size_t len, int prot, \
> +49   STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
>   int flags, int fd, off_t pos); }
>  50   STD { int sys_setlogin(const char *namebuf); }
>  #ifdef ACCOUNTING
> Index: uvm/uvm_mmap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 uvm_mmap.c
> --- uvm/uvm_mmap.c 5 Jan 2022 17:53:44 -0000   1.168
> +++ uvm/uvm_mmap.c 11 Jan 2022 23:02:13 -0000
> @@ -183,12 +183,14 @@ uvm_wxcheck(struct proc *p, char *call)
>   return 0;
>  
>   if (uvm_wxabort) {
> + KERNEL_LOCK();
>   /* Report W^X failures */
>   if (pr->ps_wxcounter++ == 0)
>   log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
>   pr->ps_comm, pr->ps_pid, call);
>   /* Send uncatchable SIGABRT for coredump */
>   sigexit(p, SIGABRT);
> + KERNEL_UNLOCK();
>   }
>  
>   return ENOTSUP;
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Klemens Nanni
On Wed, Jan 12, 2022 at 02:32:31AM +0300, Vitaliy Makkoveev wrote:
> Does it make sense to document that kernel lock protects
> `ps_wxcounter’? We have such [K] in other structure definitions.
> 
> + u_int64_t ps_wxcounter; /* [K] number of W^X violations */

Yes, that would be in order.  I'll make sure to follow up with a comment
once/if progress is made -- until then I avoid changing the header so as
to avoid rebuilds due to a mere comment.

> The rest of diff looks good to me.

So far, this diff runs fine on amd64, sparc64, arm64 doing releases,
regress, ports builds and daily usage for me.

Others have reported no regression on their octeon and i386 boxes with
varying workloads, as well as additional builds on the architectures I
already tested.



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Vitaliy Makkoveev
Does it make sense to document that kernel lock protects
`ps_wxcounter’? We have such [K] in other structure definitions.

+   u_int64_t ps_wxcounter; /* [K] number of W^X violations */

The rest of diff looks good to me.

> On 12 Jan 2022, at 02:13, Klemens Nanni  wrote:
> 
> On Tue, Jan 11, 2022 at 09:54:44AM -0700, Theo de Raadt wrote:
>>> Now this is clearly a "slow" path.  I don't think there is any reason
>>> not to put all the code in that if (uvm_wxabort) block under the
>>> kernel lock.  For now I think making access to ps_wxcounter atomic is
>>> just too fine grained.
>> 
>> Right.  Lock the whole block.
> 
> Thanks everyone, here's the combined diff for that.
> 
> Index: kern/syscalls.master
> ===
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.222
> diff -u -p -r1.222 syscalls.master
> --- kern/syscalls.master  11 Jan 2022 08:09:14 -0000  1.222
> +++ kern/syscalls.master  11 Jan 2022 23:10:50 -0000
> @@ -126,7 +126,7 @@
>   struct sigaction *osa); }
> 47 STD NOLOCK  { gid_t sys_getgid(void); }
> 48 STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
> -49   STD { void *sys_mmap(void *addr, size_t len, int prot, \
> +49   STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
>   int flags, int fd, off_t pos); }
> 50 STD { int sys_setlogin(const char *namebuf); }
> #ifdef ACCOUNTING
> Index: uvm/uvm_mmap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 uvm_mmap.c
> --- uvm/uvm_mmap.c 5 Jan 2022 17:53:44 -0000   1.168
> +++ uvm/uvm_mmap.c 11 Jan 2022 23:02:13 -0000
> @@ -183,12 +183,14 @@ uvm_wxcheck(struct proc *p, char *call)
>   return 0;
> 
>   if (uvm_wxabort) {
> + KERNEL_LOCK();
>   /* Report W^X failures */
>   if (pr->ps_wxcounter++ == 0)
>   log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
>   pr->ps_comm, pr->ps_pid, call);
>   /* Send uncatchable SIGABRT for coredump */
>   sigexit(p, SIGABRT);
> + KERNEL_UNLOCK();
>   }
> 
>   return ENOTSUP;
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Klemens Nanni
On Tue, Jan 11, 2022 at 09:54:44AM -0700, Theo de Raadt wrote:
> > Now this is clearly a "slow" path.  I don't think there is any reason
> > not to put all the code in that if (uvm_wxabort) block under the
> > kernel lock.  For now I think making access to ps_wxcounter atomic is
> > just too fine grained.
> 
> Right.  Lock the whole block.

Thanks everyone, here's the combined diff for that.

Index: kern/syscalls.master
===
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.222
diff -u -p -r1.222 syscalls.master
--- kern/syscalls.master 11 Jan 2022 08:09:14 -0000  1.222
+++ kern/syscalls.master 11 Jan 2022 23:10:50 -0000
@@ -126,7 +126,7 @@
struct sigaction *osa); }
 47 STD NOLOCK  { gid_t sys_getgid(void); }
 48 STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
-49 STD { void *sys_mmap(void *addr, size_t len, int prot, \
+49 STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
int flags, int fd, off_t pos); }
 50 STD { int sys_setlogin(const char *namebuf); }
 #ifdef ACCOUNTING
Index: uvm/uvm_mmap.c
===
RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
retrieving revision 1.168
diff -u -p -r1.168 uvm_mmap.c
--- uvm/uvm_mmap.c  5 Jan 2022 17:53:44 -0000   1.168
+++ uvm/uvm_mmap.c  11 Jan 2022 23:02:13 -0000
@@ -183,12 +183,14 @@ uvm_wxcheck(struct proc *p, char *call)
return 0;
 
if (uvm_wxabort) {
+   KERNEL_LOCK();
/* Report W^X failures */
if (pr->ps_wxcounter++ == 0)
log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
pr->ps_comm, pr->ps_pid, call);
/* Send uncatchable SIGABRT for coredump */
sigexit(p, SIGABRT);
+   KERNEL_UNLOCK();
}
 
return ENOTSUP;



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Theo de Raadt
> Now this is clearly a "slow" path.  I don't think there is any reason
> not to put all the code in that if (uvm_wxabort) block under the
> kernel lock.  For now I think making access to ps_wxcounter atomic is
> just too fine grained.

Right.  Lock the whole block.



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Mark Kettenis
> Date: Tue, 11 Jan 2022 08:15:13 +0000
> From: Klemens Nanni 
> 
> On Mon, Jan 10, 2022 at 12:06:44PM +0000, Klemens Nanni wrote:
> > On Fri, Dec 31, 2021 at 07:54:53PM +0300, Vitaliy Makkoveev wrote:
> > > The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.
> > 
> > Right, I did not pay enough attention to W^X handling.
> 
> New diff, either alone or on top of the mmap unlocking.
> 
> > I'm not entirely sure about the sigexit() path.
> 
> We can't dump core without the kernel lock, assertions do make sure...
> 
> > There's `ps_wxcounter' as u_int64_t which needs a lock or atomic
> > operations.
> 
> This could look like this.  `ps_wxcounter' is not used anywhere else in
> the tree;  it has been like this since import in 2016.
> 
> > The kernel lock could be pushed into uvm_wxabort() but there it'd still
> > be grabbed for every mmap(2) call.
> 
> This means we're only grabbing the kernel lock if `kern.wxabort=1' is
> set *and* there's a W^X violation -- much better.

That diff is not right: u_int64_t is long long on 32-bit architectures
so that cast you're doing isn't right.  I also don't think it is safe
to call log(9) without holding the kernel lock yet.

Now this is clearly a "slow" path.  I don't think there is any reason
not to put all the code in that if (uvm_wxabort) block under the
kernel lock.  For now I think making access to ps_wxcounter atomic is
just too fine grained.
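A minimal userland illustration of the first point (hypothetical, assuming an
ILP32 target where unsigned long is 4 bytes while u_int64_t is 8): an atomic
increment through the cast would update only half of the counter:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t wxcounter = 0;	/* stand-in for ps_wxcounter */

		/* On ILP32 this prints "long: 4, counter: 8": an atomic op
		 * through an (unsigned long *) cast touches only the low
		 * 4 bytes of the 8-byte counter. */
		printf("long: %zu, counter: %zu\n",
		    sizeof(unsigned long), sizeof(wxcounter));
		return 0;
	}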

> Index: sys/proc.h
> ===
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.323
> diff -u -p -r1.323 proc.h
> --- sys/proc.h 10 Dec 2021 05:34:42 -0000  1.323
> +++ sys/proc.h 9 Jan 2022 13:42:45 -0000
> @@ -197,7 +197,7 @@ struct process {
>   time_t  ps_nextxcpu;/* when to send next SIGXCPU, */
>   /* in seconds of process runtime */
>  
> - u_int64_t ps_wxcounter;
> + u_int64_t ps_wxcounter; /* [a] number of W^X violations */
>  
>   struct unveil *ps_uvpaths;  /* unveil vnodes and names */
>   ssize_t ps_uvvcount;/* count of unveil vnodes held */
> Index: uvm/uvm_mmap.c
> ===
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 uvm_mmap.c
> --- uvm/uvm_mmap.c 5 Jan 2022 17:53:44 -0000   1.168
> +++ uvm/uvm_mmap.c 10 Jan 2022 15:48:03 -0000
> @@ -184,11 +184,13 @@ uvm_wxcheck(struct proc *p, char *call)
>  
>   if (uvm_wxabort) {
>   /* Report W^X failures */
> - if (pr->ps_wxcounter++ == 0)
> + if (atomic_inc_long_nv((unsigned long *)&pr->ps_wxcounter) == 1)
>   log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
>   pr->ps_comm, pr->ps_pid, call);
>   /* Send uncatchable SIGABRT for coredump */
> + KERNEL_LOCK();
>   sigexit(p, SIGABRT);
> + KERNEL_UNLOCK();
>   }
>  
>   return ENOTSUP;
> 
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Robert Nagy
On 11/01/22 09:57 +0100, Claudio Jeker wrote:
> > On Tue, Jan 11, 2022 at 08:15:13AM +0000, Klemens Nanni wrote:
> > > On Mon, Jan 10, 2022 at 12:06:44PM +0000, Klemens Nanni wrote:
> > > On Fri, Dec 31, 2021 at 07:54:53PM +0300, Vitaliy Makkoveev wrote:
> > > > The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.
> > > 
> > > Right, I did not pay enough attention to W^X handling.
> > 
> > New diff, either alone or on top of the mmap unlocking.
> > 
> > > I'm not entirely sure about the sigexit() path.
> > 
> > We can't dump core without the kernel lock, assertions do make sure...
> > 
> > > There's `ps_wxcounter' as u_int64_t which needs a lock or atomic
> > > operations.
> > 
> > This could look like this.  `ps_wxcounter' is not used anywhere else in
> > the tree;  it has been like this since import in 2016.
> > 
> > > The kernel lock could be pushed into uvm_wxabort() but there it'd still
> > > be grabbed for every mmap(2) call.
> > 
> > This means we're only grabbing the kernel lock if `kern.wxabort=1' is
> > set *and* there's a W^X violation -- much better.
> > 
> > 
> > Index: sys/proc.h
> > ===
> > RCS file: /cvs/src/sys/sys/proc.h,v
> > retrieving revision 1.323
> > diff -u -p -r1.323 proc.h
> > --- sys/proc.h  10 Dec 2021 05:34:42 -0000  1.323
> > +++ sys/proc.h  9 Jan 2022 13:42:45 -0000
> > @@ -197,7 +197,7 @@ struct process {
> > time_t  ps_nextxcpu;/* when to send next SIGXCPU, */
> > /* in seconds of process runtime */
> >  
> > -   u_int64_t ps_wxcounter;
> > +   u_int64_t ps_wxcounter; /* [a] number of W^X violations */
> 
> This can't be right. We don't have 64bit atomic instructions on all archs.
> Either this needs to be unsigned long or unsigned int.
> Since this is only used to limit the number of reports per process I would
> suggest to move this to an unsigned int. You can't have that many threads
> to overflow that count.
> Or another option is to move the ps_wxcounter into the KERNEL_LOCK block
> which is also trivial and will fix this for as long as sigexit requires
> the KERNEL_LOCK.

I think moving the KERNEL_LOCK block is good enough.



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Claudio Jeker
On Tue, Jan 11, 2022 at 08:15:13AM +, Klemens Nanni wrote:
> On Mon, Jan 10, 2022 at 12:06:44PM +, Klemens Nanni wrote:
> > On Fri, Dec 31, 2021 at 07:54:53PM +0300, Vitaliy Makkoveev wrote:
> > > The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.
> > 
> > Right, I did not pay enough attention to W^X handling.
> 
> New diff, either alone or on top of the mmap unlocking.
> 
> > I'm not entirely sure about the sigexit() path.
> 
> We can't dump core without the kernel lock, assertions do make sure...
> 
> > There's `ps_wxcounter' as u_int64_t which needs a lock or atomic
> > operations.
> 
> That could look like the diff below.  `ps_wxcounter' is not used anywhere else in
> the tree;  it has been like this since import in 2016.
> 
> > The kernel lock could be pushed into uvm_wxabort() but there it'd still
> > be grabbed for every mmap(2) call.
> 
> This means we're only grabbing the kernel lock if `kern.wxabort=1' is
> set *and* there's a W^X violation -- much better.
> 
> 
> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.323
> diff -u -p -r1.323 proc.h
> --- sys/proc.h	10 Dec 2021 05:34:42 -0000	1.323
> +++ sys/proc.h	9 Jan 2022 13:42:45 -0000
> @@ -197,7 +197,7 @@ struct process {
>   time_t  ps_nextxcpu;/* when to send next SIGXCPU, */
>   /* in seconds of process runtime */
>  
> - u_int64_t ps_wxcounter;
> + u_int64_t ps_wxcounter; /* [a] number of W^X violations */

This can't be right. We don't have 64bit atomic instructions on all archs.
Either this needs to be unsigned long or unsigned int.
Since this is only used to limit the number of reports per process I would
suggest moving this to an unsigned int. You can't have that many threads
to overflow that count.
Or another option is to move the ps_wxcounter into the KERNEL_LOCK block
which is also trivial and will fix this for as long as sigexit requires
the KERNEL_LOCK.
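
The unsigned int variant would be a one-liner with the existing 32-bit
primitive from sys/atomic.h (sketch only, not compile-tested):

	/* sys/proc.h */
	unsigned int ps_wxcounter;	/* [a] number of W^X violations */

	/* uvm/uvm_mmap.c, uvm_wxcheck() */
	if (atomic_inc_int_nv(&pr->ps_wxcounter) == 1)
		log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
		    pr->ps_comm, pr->ps_pid, call);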

>   struct unveil *ps_uvpaths;  /* unveil vnodes and names */
>   ssize_t ps_uvvcount;/* count of unveil vnodes held */
> Index: uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 uvm_mmap.c
> --- uvm/uvm_mmap.c	5 Jan 2022 17:53:44 -0000	1.168
> +++ uvm/uvm_mmap.c	10 Jan 2022 15:48:03 -0000
> @@ -184,11 +184,13 @@ uvm_wxcheck(struct proc *p, char *call)
>  
>   if (uvm_wxabort) {
>   /* Report W^X failures */
> - if (pr->ps_wxcounter++ == 0)
> + if (atomic_inc_long_nv((unsigned long *)&pr->ps_wxcounter) == 1)
>   log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
>   pr->ps_comm, pr->ps_pid, call);
>   /* Send uncatchable SIGABRT for coredump */
> + KERNEL_LOCK();
>   sigexit(p, SIGABRT);
> + KERNEL_UNLOCK();
>   }
>  
>   return ENOTSUP;
> 

-- 
:wq Claudio



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Vitaliy Makkoveev
On Tue, Jan 11, 2022 at 08:15:13AM +, Klemens Nanni wrote:
> On Mon, Jan 10, 2022 at 12:06:44PM +, Klemens Nanni wrote:
> > On Fri, Dec 31, 2021 at 07:54:53PM +0300, Vitaliy Makkoveev wrote:
> > > The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.
> > 
> > Right, I did not pay enough attention to W^X handling.
> 
> New diff, either alone or on top of the mmap unlocking.
> 
> > I'm not entirely sure about the sigexit() path.
> 
> We can't dump core without the kernel lock, assertions do make sure...
> 
> > There's `ps_wxcounter' as u_int64_t which needs a lock or atomic
> > operations.
> 
> That could look like the diff below.  `ps_wxcounter' is not used anywhere else in
> the tree;  it has been like this since import in 2016.
> 
> > The kernel lock could be pushed into uvm_wxabort() but there it'd still
> > be grabbed for every mmap(2) call.
> 
> This means we're only grabbing the kernel lock if `kern.wxabort=1' is
> set *and* there's a W^X violation -- much better.
> 
> 

This is true, sigexit() should be serialized with the kernel lock. And this
path will be followed only with non-null kern.wxabort.

> Index: sys/proc.h
> ===================================================================
> RCS file: /cvs/src/sys/sys/proc.h,v
> retrieving revision 1.323
> diff -u -p -r1.323 proc.h
> --- sys/proc.h	10 Dec 2021 05:34:42 -0000	1.323
> +++ sys/proc.h	9 Jan 2022 13:42:45 -0000
> @@ -197,7 +197,7 @@ struct process {
>   time_t  ps_nextxcpu;/* when to send next SIGXCPU, */
>   /* in seconds of process runtime */
>  
> - u_int64_t ps_wxcounter;
> + u_int64_t ps_wxcounter; /* [a] number of W^X violations */
>  
>   struct unveil *ps_uvpaths;  /* unveil vnodes and names */
>   ssize_t ps_uvvcount;/* count of unveil vnodes held */
> Index: uvm/uvm_mmap.c
> ===================================================================
> RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
> retrieving revision 1.168
> diff -u -p -r1.168 uvm_mmap.c
> --- uvm/uvm_mmap.c	5 Jan 2022 17:53:44 -0000	1.168
> +++ uvm/uvm_mmap.c	10 Jan 2022 15:48:03 -0000
> @@ -184,11 +184,13 @@ uvm_wxcheck(struct proc *p, char *call)
>  
>   if (uvm_wxabort) {
>   /* Report W^X failures */
> - if (pr->ps_wxcounter++ == 0)
> + if (atomic_inc_long_nv((unsigned long *)&pr->ps_wxcounter) == 1)
>   log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
>   pr->ps_comm, pr->ps_pid, call);

`ps_wxcounter' is a u_int64_t variable. I doubt this cast to unsigned long
would work as you expect on 32-bit architectures.

>   /* Send uncatchable SIGABRT for coredump */
> + KERNEL_LOCK();
>   sigexit(p, SIGABRT);
> + KERNEL_UNLOCK();
>   }
>  
>   return ENOTSUP;
> 



Re: unlock mmap(2) for anonymous mappings

2022-01-11 Thread Klemens Nanni
On Mon, Jan 10, 2022 at 12:06:44PM +, Klemens Nanni wrote:
> On Fri, Dec 31, 2021 at 07:54:53PM +0300, Vitaliy Makkoveev wrote:
> > The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.
> 
> Right, I did not pay enough attention to W^X handling.

New diff, either alone or on top of the mmap unlocking.

> I'm not entirely sure about the sigexit() path.

We can't dump core without the kernel lock, assertions do make sure...
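
(Hypothetically, think of an assertion like this somewhere down the
coredump path; the placement is illustrative, not the in-tree code:)

	int
	coredump(struct proc *p)
	{
		KERNEL_ASSERT_LOCKED();	/* catches unlocked callers */
		/* ... write out the core file, body elided ... */
		return (0);
	}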

> There's `ps_wxcounter' as u_int64_t which needs a lock or atomic
> operations.

That could look like the diff below.  `ps_wxcounter' is not used anywhere else in
the tree;  it has been like this since import in 2016.

> The kernel lock could be pushed into uvm_wxabort() but there it'd still
> be grabbed for every mmap(2) call.

This means we're only grabbing the kernel lock if `kern.wxabort=1' is
set *and* there's a W^X violation -- much better.


Index: sys/proc.h
===================================================================
RCS file: /cvs/src/sys/sys/proc.h,v
retrieving revision 1.323
diff -u -p -r1.323 proc.h
--- sys/proc.h	10 Dec 2021 05:34:42 -0000	1.323
+++ sys/proc.h	9 Jan 2022 13:42:45 -0000
@@ -197,7 +197,7 @@ struct process {
time_t  ps_nextxcpu;/* when to send next SIGXCPU, */
/* in seconds of process runtime */
 
-   u_int64_t ps_wxcounter;
+   u_int64_t ps_wxcounter; /* [a] number of W^X violations */
 
struct unveil *ps_uvpaths;  /* unveil vnodes and names */
ssize_t ps_uvvcount;/* count of unveil vnodes held */
Index: uvm/uvm_mmap.c
===================================================================
RCS file: /cvs/src/sys/uvm/uvm_mmap.c,v
retrieving revision 1.168
diff -u -p -r1.168 uvm_mmap.c
--- uvm/uvm_mmap.c	5 Jan 2022 17:53:44 -0000	1.168
+++ uvm/uvm_mmap.c	10 Jan 2022 15:48:03 -0000
@@ -184,11 +184,13 @@ uvm_wxcheck(struct proc *p, char *call)
 
if (uvm_wxabort) {
/* Report W^X failures */
-   if (pr->ps_wxcounter++ == 0)
+   if (atomic_inc_long_nv((unsigned long *)&pr->ps_wxcounter) == 1)
log(LOG_NOTICE, "%s(%d): %s W^X violation\n",
pr->ps_comm, pr->ps_pid, call);
/* Send uncatchable SIGABRT for coredump */
+   KERNEL_LOCK();
sigexit(p, SIGABRT);
+   KERNEL_UNLOCK();
}
 
return ENOTSUP;



Re: unlock mmap(2) for anonymous mappings

2022-01-10 Thread Klemens Nanni
On Fri, Dec 31, 2021 at 09:54:23AM -0700, Theo de Raadt wrote:
> >Now that mpi has unlocked uvm's fault handler, we can unlock the mmap
> >syscall to handle MAP_ANON without the big lock.
> ...
> >So here's a first small step.  I've been running with this for months
> >on a few amd64, arm64 and sparc64 boxes without problems
> 
> So, 3 architectures have been tested.
> 
> I read your mail as a request for OKs to commit before the other architectures
> have been tested.
> 
> No way.  You first find people to help you test the others.  Or you ask for
> the whole community to help ensure the list is complete.  But you are begging
> for the wrong people to respond to you when you include this on the list of
> questions:
> 
> > Feedback?  Objections?  OK?
>   ^^^

Duly noted.  I use(d) this as a standard line for diffs, but you're
right in that it does not convey the necessary demand for testing in
such crucial cases.

I'll make sure to only ask for an OK next time after enough feedback and
testing.



Re: unlock mmap(2) for anonymous mappings

2022-01-10 Thread Klemens Nanni
On Fri, Dec 31, 2021 at 07:54:53PM +0300, Vitaliy Makkoveev wrote:
> The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.

Right, I did not pay enough attention to W^X handling.

I'm not entirely sure about the sigexit() path.

There's `ps_wxcounter' as u_int64_t which needs a lock or atomic
operations.

The kernel lock could be pushed into uvm_wxabort() but there it'd still
be grabbed for every mmap(2) call.

> 
> > On 31 Dec 2021, at 12:14, Klemens Nanni  wrote:
> > 
> > Now that mpi has unlocked uvm's fault handler, we can unlock the mmap
> > syscall to handle MAP_ANON without the big lock.
> > 
> > sys_mmap() still protects the !MAP_ANON case, i.e. file based mappings,
> > with the KERNEL_LOCK() itself, which is why unlocking the syscall will
> > only change locking behaviour for anonymous mappings.
> > 
> > A previous attempt to unlock file based mappings was reverted, see the following
> > from https://marc.info/?l=openbsd-tech&m=160155434212888&w=2 :
> > 
> > commit 38802bc07455f2a4f8cdde272850a5eab5dfa6c8
> > from: mpi 
> > date: Wed Oct  7 12:26:20 2020 UTC
> > 
> > Do not release the KERNEL_LOCK() when mmap(2)ing files.
> > 
> > Previous attempt to unlock amap & anon exposed a race in vnode reference
> > counting.  So be conservative with the code paths that we're not fully
> > moving out of the KERNEL_LOCK() to allow us to concentrate on one area
> > at a time.
> > ...
> > 
> > 
> > So here's a first small step.  I've been running with this for months
> > on a few amd64, arm64 and sparc64 boxes without problems;   they've been
> > daily drivers and/or have been building releases and ports.
> > 
> > Feedback?  Objections?  OK?
> > 
> > 
> > Index: sys/kern/syscalls.master
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/syscalls.master,v
> > retrieving revision 1.221
> > diff -u -p -r1.221 syscalls.master
> > --- sys/kern/syscalls.master	23 Dec 2021 18:50:31 -0000	1.221
> > +++ sys/kern/syscalls.master	31 Dec 2021 09:14:00 -0000
> > @@ -126,7 +126,7 @@
> > struct sigaction *osa); }
> > 47  STD NOLOCK  { gid_t sys_getgid(void); }
> > 48  STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
> > -49 STD { void *sys_mmap(void *addr, size_t len, int prot, \
> > +49 STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
> > int flags, int fd, off_t pos); }
> > 50  STD { int sys_setlogin(const char *namebuf); }
> > #ifdef ACCOUNTING
> > 
> 



Re: unlock mmap(2) for anonymous mappings

2021-12-31 Thread Vitaliy Makkoveev
The uvm_wxabort path within uvm_wxcheck() looks not MP-safe.

> On 31 Dec 2021, at 12:14, Klemens Nanni  wrote:
> 
> Now that mpi has unlocked uvm's fault handler, we can unlock the mmap
> syscall to handle MAP_ANON without the big lock.
> 
> sys_mmap() still protects the !MAP_ANON case, i.e. file based mappings,
> with the KERNEL_LOCK() itself, which is why unlocking the syscall will
> only change locking behaviour for anonymous mappings.
> 
> A previous attempt to unlock file based mappings was reverted, see the following
> from https://marc.info/?l=openbsd-tech&m=160155434212888&w=2 :
> 
>   commit 38802bc07455f2a4f8cdde272850a5eab5dfa6c8
>   from: mpi 
>   date: Wed Oct  7 12:26:20 2020 UTC
> 
>   Do not release the KERNEL_LOCK() when mmap(2)ing files.
> 
>   Previous attempt to unlock amap & anon exposed a race in vnode reference
>   counting.  So be conservative with the code paths that we're not fully
>   moving out of the KERNEL_LOCK() to allow us to concentrate on one area
>   at a time.
>   ...
> 
> 
> So here's a first small step.  I've been running with this for months
> on a few amd64, arm64 and sparc64 boxes without problems;   they've been
> daily drivers and/or have been building releases and ports.
> 
> Feedback?  Objections?  OK?
> 
> 
> Index: sys/kern/syscalls.master
> ===================================================================
> RCS file: /cvs/src/sys/kern/syscalls.master,v
> retrieving revision 1.221
> diff -u -p -r1.221 syscalls.master
> --- sys/kern/syscalls.master	23 Dec 2021 18:50:31 -0000	1.221
> +++ sys/kern/syscalls.master	31 Dec 2021 09:14:00 -0000
> @@ -126,7 +126,7 @@
>   struct sigaction *osa); }
> 47  STD NOLOCK  { gid_t sys_getgid(void); }
> 48  STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
> -49   STD { void *sys_mmap(void *addr, size_t len, int prot, \
> +49   STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
>   int flags, int fd, off_t pos); }
> 50  STD { int sys_setlogin(const char *namebuf); }
> #ifdef ACCOUNTING
> 



Re: unlock mmap(2) for anonymous mappings

2021-12-31 Thread Theo de Raadt
>Now that mpi has unlocked uvm's fault handler, we can unlock the mmap
>syscall to handle MAP_ANON without the big lock.
...
>So here's a first small step.  I've been running with this for months
>on a few amd64, arm64 and sparc64 boxes without problems

So, 3 architectures have been tested.

I read your mail as a request for OKs to commit before the other architectures
have been tested.

No way.  You first find people to help you test the others.  Or you ask for
the whole community to help ensure the list is complete.  But you are begging
for the wrong people to respond to you when you include this on the list of
questions:

> Feedback?  Objections?  OK?
  ^^^



unlock mmap(2) for anonymous mappings

2021-12-31 Thread Klemens Nanni
Now that mpi has unlocked uvm's fault handler, we can unlock the mmap
syscall to handle MAP_ANON without the big lock.

sys_mmap() still protects the !MAP_ANON case, i.e. file based mappings,
with the KERNEL_LOCK() itself, which is why unlocking the syscall will
only change locking behaviour for anonymous mappings.
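
Schematically (a rough sketch of the existing pattern in sys_mmap(),
arguments elided; uvm_mmapanon()/uvm_mmapfile() are the helpers it calls):

	if (flags & MAP_ANON) {
		/* anonymous mapping: no big lock needed anymore */
		error = uvm_mmapanon(/* ... */);
	} else {
		/* file-backed mapping: keep it under the big lock */
		KERNEL_LOCK();
		error = uvm_mmapfile(/* ... */);
		KERNEL_UNLOCK();
	}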

A previous attempt to unlock file based mappings was reverted, see the following
from https://marc.info/?l=openbsd-tech&m=160155434212888&w=2 :

commit 38802bc07455f2a4f8cdde272850a5eab5dfa6c8
from: mpi 
date: Wed Oct  7 12:26:20 2020 UTC

Do not release the KERNEL_LOCK() when mmap(2)ing files.

Previous attempt to unlock amap & anon exposed a race in vnode reference
counting.  So be conservative with the code paths that we're not fully
moving out of the KERNEL_LOCK() to allow us to concentrate on one area
at a time.
...


So here's a first small step.  I've been running with this for months
on a few amd64, arm64 and sparc64 boxes without problems;   they've been
daily drivers and/or have been building releases and ports.

Feedback?  Objections?  OK?


Index: sys/kern/syscalls.master
===================================================================
RCS file: /cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.221
diff -u -p -r1.221 syscalls.master
--- sys/kern/syscalls.master	23 Dec 2021 18:50:31 -0000	1.221
+++ sys/kern/syscalls.master	31 Dec 2021 09:14:00 -0000
@@ -126,7 +126,7 @@
struct sigaction *osa); }
 47 STD NOLOCK  { gid_t sys_getgid(void); }
 48 STD NOLOCK  { int sys_sigprocmask(int how, sigset_t mask); }
-49 STD { void *sys_mmap(void *addr, size_t len, int prot, \
+49 STD NOLOCK  { void *sys_mmap(void *addr, size_t len, int prot, \
int flags, int fd, off_t pos); }
 50 STD { int sys_setlogin(const char *namebuf); }
 #ifdef ACCOUNTING