Re: fill_file(): use solock_shared() to protect socket data

2023-05-16 Thread Vitaliy Makkoveev
On Thu, Apr 27, 2023 at 02:54:38PM +0200, Claudio Jeker wrote:
> On Thu, Apr 27, 2023 at 01:55:33PM +0300, Vitaliy Makkoveev wrote:
> > Now only the direct netlock is used for inet socket protection. The
> > unlocked access to all other sockets is safe, but we could lose
> > consistency for a little while. Since solock() is used for socket
> > protection, make the locking path common and use it. Make it shared
> > because this is read-only access to socket data. For the same reason,
> > use the shared netlock while performing the inpcb tables foreach
> > walkthrough.
> 
> This is still wrong. fill_file() is not allowed to sleep, so you can not
> use an rwlock in here. fill_file() is called inside an allprocess
> LIST_FOREACH loop, and sleeping there will blow up.
>  
> Maybe it is enough to grab a shared NETLOCK around the LIST_FOREACH() to
> ensure that we don't sleep inside.
> 
> Please fix this properly. Right now running fstat is like playing Russian
> roulette.
> 

Except for __pmap_asid_alloc() in arch/sh/sh/pmap.c on uniprocessor
machines, proc_stop_sweep() is the only place where `ps_list' can't be
protected by the `allprocess_lock' rwlock(9). Since the refactoring is
not obvious, for now I propose to use two locks for `ps_list', as we do
for some other data structures like `ps_threads' or `if_list'. In the
code paths where a context switch is unwanted, read-only access to
`ps_list' will be protected by the kernel lock, and by the
`allprocess_lock' rwlock(9) everywhere else. Modifying the `ps_list'
list will require both locks to be held. Not the best solution, but it
fixes sysctl_file().
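
As an illustration only (not part of the diff, and all names here are
stand-ins), a minimal userland analogue of the proposed two-lock scheme:
readers take either lock depending on whether they may sleep, while
writers must take both, so neither kind of reader can observe the list
mid-modification.

#include <pthread.h>
#include <stdio.h>

/* Stand-ins: klock for the kernel lock, alock for allprocess_lock. */
static pthread_mutex_t klock = PTHREAD_MUTEX_INITIALIZER;
static pthread_rwlock_t alock = PTHREAD_RWLOCK_INITIALIZER;
static int list_generation;	/* stand-in for the protected list */

/* Read path that must not sleep: the "kernel lock" alone suffices. */
static int
read_nosleep(void)
{
	int gen;

	pthread_mutex_lock(&klock);
	gen = list_generation;
	pthread_mutex_unlock(&klock);
	return gen;
}

/* Read path that may sleep: take the rwlock shared instead. */
static int
read_maysleep(void)
{
	int gen;

	pthread_rwlock_rdlock(&alock);
	gen = list_generation;
	pthread_rwlock_unlock(&alock);
	return gen;
}

/* Modification requires both locks, excluding both kinds of readers. */
static void
modify(void)
{
	pthread_rwlock_wrlock(&alock);
	pthread_mutex_lock(&klock);
	list_generation++;	/* the actual list surgery goes here */
	pthread_mutex_unlock(&klock);
	pthread_rwlock_unlock(&alock);
}

int
main(void)
{
	modify();
	printf("%d %d\n", read_nosleep(), read_maysleep());
	return 0;
}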

Index: sys/kern/kern_exit.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.211
diff -u -p -r1.211 kern_exit.c
--- sys/kern/kern_exit.c	25 Apr 2023 18:14:06 -0000	1.211
+++ sys/kern/kern_exit.c	16 May 2023 11:14:49 -0000
@@ -223,6 +223,8 @@ exit1(struct proc *p, int xexit, int xsi
 
p->p_fd = NULL; /* zap the thread's copy */
 
+   rw_enter_write(&allprocess_lock);
+
 /*
 * Remove proc from pidhash chain and allproc so looking
 * it up won't work.  We will put the proc on the
@@ -295,6 +297,8 @@ exit1(struct proc *p, int xexit, int xsi
process_clear_orphan(qr);
}
}
+
+   rw_exit_write(&allprocess_lock);
 
/* add thread's accumulated rusage into the process's total */
ruadd(rup, &p->p_ru);
Index: sys/kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.247
diff -u -p -r1.247 kern_fork.c
--- sys/kern/kern_fork.c	25 Apr 2023 18:14:06 -0000	1.247
+++ sys/kern/kern_fork.c	16 May 2023 11:14:49 -0000
@@ -62,6 +62,11 @@
 #include 
 #include 
 
+/*
+ * Locks used to protect struct members in this file:
+ * P   allprocess_lock
+ */
+
int	nprocesses = 1;	/* process 0 */
int	nthreads = 1;	/* proc 0 */
 struct forkstat forkstat;
@@ -372,6 +377,8 @@ fork1(struct proc *curp, int flags, void
return (ENOMEM);
}
 
+   rw_enter_write(&allprocess_lock);
+
/*
 * From now on, we're committed to the fork and cannot fail.
 */
@@ -468,19 +475,28 @@ fork1(struct proc *curp, int flags, void
knote_locked(&pr->ps_klist, NOTE_FORK | pr->ps_pid);
 
/*
-* Update stats now that we know the fork was successful.
+* Return child pid to parent process
 */
-   uvmexp.forks++;
-   if (flags & FORK_PPWAIT)
-   uvmexp.forks_ppwait++;
-   if (flags & FORK_SHAREVM)
-   uvmexp.forks_sharevm++;
+   if (retval != NULL)
+   *retval = pr->ps_pid;
 
/*
 * Pass a pointer to the new process to the caller.
+* XXX but we don't return `rnewprocp' to sys_fork()
+* or sys_vfork().
 */
if (rnewprocp != NULL)
*rnewprocp = p;
+   rw_exit_write(&allprocess_lock);
+
+   /*
+* Update stats now that we know the fork was successful.
+*/
+   uvmexp.forks++;
+   if (flags & FORK_PPWAIT)
+   uvmexp.forks_ppwait++;
+   if (flags & FORK_SHAREVM)
+   uvmexp.forks_sharevm++;
 
/*
 * Preserve synchronization semantics of vfork.  If waiting for
@@ -499,11 +515,6 @@ fork1(struct proc *curp, int flags, void
if ((flags & FORK_PTRACE) && (curpr->ps_flags & PS_TRACED))
psignal(curp, SIGTRAP);
 
-   /*
-* Return child pid to parent process
-*/
-   if (retval != NULL)
-   *retval = pr->ps_pid;
return (0);
 }
 
@@ -610,7 +621,7 @@ alloctid(void)
 /*
  * Checks for current use of a pid, either as a pid or pgid.
  */
-pid_t oldpids[128];
+pid_t oldpids[128];	/* [P] */
 int
 ispidtaken(pid_t pid)
 {
Index: sys/kern/kern_ktrace.c

Re: fill_file(): use solock_shared() to protect socket data

2023-04-27 Thread Vitaliy Makkoveev
On Thu, Apr 27, 2023 at 02:54:38PM +0200, Claudio Jeker wrote:
> On Thu, Apr 27, 2023 at 01:55:33PM +0300, Vitaliy Makkoveev wrote:
> > Now only the direct netlock is used for inet socket protection. The
> > unlocked access to all other sockets is safe, but we could lose
> > consistency for a little while. Since solock() is used for socket
> > protection, make the locking path common and use it. Make it shared
> > because this is read-only access to socket data. For the same reason,
> > use the shared netlock while performing the inpcb tables foreach
> > walkthrough.
> 
> This is still wrong. fill_file() is not allowed to sleep, so you can not
> use an rwlock in here. fill_file() is called inside an allprocess
> LIST_FOREACH loop, and sleeping there will blow up.
>  
> Maybe it is enough to grab a shared NETLOCK around the LIST_FOREACH() to
> ensure that we don't sleep inside.
> 
> Please fix this properly. Right now running fstat is like playing Russian
> roulette.
> 

This time fill_file() already has a sleep point, introduced by the
direct netlock invocation, so this diff doesn't make it worse.

Taking the netlock around the allprocess foreach loops is not the
proper fix, because it leaves non-inet sockets unlocked.

However, these allprocess foreach loops are not the only place with a
similar problem. Some time ago I wrote a diff to introduce the
`allprocess_lock' rwlock for `allprocess' list protection.
Unfortunately, that diff was abandoned after some discussion about the
proc_stop_sweep() software interrupt handler. I have reworked it into a
kernel thread so that it can take the new rwlock. Since
proc_stop_sweep() only schedules the parents of stopped processes to
run, I consider the extra delay insignificant.
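
Purely to show the shape of that rework, without reproducing the kernel
diff (the names and structure below are illustrative assumptions, in a
userland analogue), the soft-interrupt "schedule" becomes a flag plus a
wakeup, and the handler body moves into a thread that is free to block
on the rwlock:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static bool work_pending;
static pthread_rwlock_t allprocess_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Former soft-interrupt body, now free to sleep on the rwlock. */
static void *
sweep_thread(void *arg)
{
	(void)arg;
	for (;;) {
		pthread_mutex_lock(&mtx);
		while (!work_pending)
			pthread_cond_wait(&cv, &mtx);
		work_pending = false;
		pthread_mutex_unlock(&mtx);

		pthread_rwlock_rdlock(&allprocess_lock);
		printf("sweep: wake parents of stopped processes\n");
		pthread_rwlock_unlock(&allprocess_lock);
	}
	return NULL;
}

/* Former softintr_schedule() call site. */
static void
sweep_schedule(void)
{
	pthread_mutex_lock(&mtx);
	work_pending = true;
	pthread_cond_signal(&cv);
	pthread_mutex_unlock(&mtx);
}

int
main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, sweep_thread, NULL);
	sweep_schedule();
	sleep(1);	/* give the worker a chance to run once */
	return 0;
}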

This is the latest iteration of the `allprocess' diff. I propose to
commit it first, to fix sysctl_file() the right way.


Index: sys/arch/sh/sh/pmap.c
===================================================================
RCS file: /cvs/src/sys/arch/sh/sh/pmap.c,v
retrieving revision 1.30
diff -u -p -r1.30 pmap.c
--- sys/arch/sh/sh/pmap.c	1 Jan 2023 19:49:17 -0000	1.30
+++ sys/arch/sh/sh/pmap.c	9 Jan 2023 09:06:42 -0000
@@ -1071,6 +1071,11 @@ __pmap_asid_alloc(void)
 * so it's far from LRU but rather almost pessimal once you have
 * too many processes.
 */
+   /*
+* XXX Unlocked access to `allprocess' list is safe. This is
+* uniprocessor machine, we don't modify the list and we have
+* no context switch within.
+*/
LIST_FOREACH(pr, &allprocess, ps_list) {
pmap_t pmap = pr->ps_vmspace->vm_map.pmap;
 
Index: sys/kern/kern_exit.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.210
diff -u -p -r1.210 kern_exit.c
--- sys/kern/kern_exit.c	29 Dec 2022 01:36:36 -0000	1.210
+++ sys/kern/kern_exit.c	9 Jan 2023 09:06:42 -0000
@@ -222,6 +222,8 @@ exit1(struct proc *p, int xexit, int xsi
 
p->p_fd = NULL; /* zap the thread's copy */
 
+   rw_enter_write(&allprocess_lock);
+
 /*
 * Remove proc from pidhash chain and allproc so looking
 * it up won't work.  We will put the proc on the
@@ -294,6 +296,8 @@ exit1(struct proc *p, int xexit, int xsi
process_clear_orphan(qr);
}
}
+
+   rw_exit_write(&allprocess_lock);
 
/* add thread's accumulated rusage into the process's total */
ruadd(rup, &p->p_ru);
Index: sys/kern/kern_fork.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.245
diff -u -p -r1.245 kern_fork.c
--- sys/kern/kern_fork.c	7 Jan 2023 05:24:58 -0000	1.245
+++ sys/kern/kern_fork.c	9 Jan 2023 09:06:42 -0000
@@ -62,6 +62,11 @@
 #include 
 #include 
 
+/*
+ * Locks used to protect struct members in this file:
+ * P   allprocess_lock
+ */
+
int	nprocesses = 1;	/* process 0 */
int	nthreads = 1;	/* proc 0 */
 struct forkstat forkstat;
@@ -372,6 +377,8 @@ fork1(struct proc *curp, int flags, void
return (ENOMEM);
}
 
+   rw_enter_write(&allprocess_lock);
+
/*
 * From now on, we're committed to the fork and cannot fail.
 */
@@ -468,19 +475,28 @@ fork1(struct proc *curp, int flags, void
KNOTE(&pr->ps_klist, NOTE_FORK | pr->ps_pid);
 
/*
-* Update stats now that we know the fork was successful.
+* Return child pid to parent process
 */
-   uvmexp.forks++;
-   if (flags & FORK_PPWAIT)
-   uvmexp.forks_ppwait++;
-   if (flags & FORK_SHAREVM)
-   uvmexp.forks_sharevm++;
+   if (retval != NULL)
+   *retval = pr->ps_pid;
 
/*
 * Pass a pointer to the new process to the caller.
+* XXX but we don't return `rnewprocp' to sys_fork()
+* or sys_vfork().
 */
if (rnewprocp != NULL)
*rnewprocp = p;

Re: fill_file(): use solock_shared() to protect socket data

2023-04-27 Thread Alexander Bluhm
On Thu, Apr 27, 2023 at 02:54:38PM +0200, Claudio Jeker wrote:
> On Thu, Apr 27, 2023 at 01:55:33PM +0300, Vitaliy Makkoveev wrote:
> > Now only the direct netlock is used for inet socket protection. The
> > unlocked access to all other sockets is safe, but we could lose
> > consistency for a little while. Since solock() is used for socket
> > protection, make the locking path common and use it. Make it shared
> > because this is read-only access to socket data. For the same reason,
> > use the shared netlock while performing the inpcb tables foreach
> > walkthrough.
> 
> This is still wrong. fill_file() is not allowed to sleep, so you can not
> use an rwlock in here. fill_file() is called inside an allprocess
> LIST_FOREACH loop, and sleeping there will blow up.

Yes.

> Maybe it is enough to grab a shared NETLOCK around the LIST_FOREACH() to
> ensure that we don't sleep inside.

I have tried to fix this for years.  It is not easy.  Taking the
net lock too early results in a lock ordering problem.  Down in
fill_file(), fdplock() is called:

#define fdplock(fdp) NET_ASSERT_UNLOCKED(); rw_enter_write(&(fdp)->fd_lock);

This guarantees the lock ordering fdlock before netlock.  You probably
run into NFS problems if you want to relax that.
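
As a sketch only (userland, with illustrative stand-ins for the kernel
macros), this is how such an assertion pins the order down: any path
that took the net lock first and then called fdplock() would trip the
assert instead of leaving a latent lock-order reversal.

#include <assert.h>
#include <pthread.h>

static pthread_rwlock_t netlock = PTHREAD_RWLOCK_INITIALIZER;
static pthread_rwlock_t fd_lock = PTHREAD_RWLOCK_INITIALIZER;
static __thread int netlock_held;	/* per-thread, like curproc state */

#define NET_LOCK()	do {						\
	pthread_rwlock_wrlock(&netlock);				\
	netlock_held = 1;						\
} while (0)
#define NET_UNLOCK()	do {						\
	netlock_held = 0;						\
	pthread_rwlock_unlock(&netlock);				\
} while (0)
#define NET_ASSERT_UNLOCKED()	assert(netlock_held == 0)

/* Mirrors the fdplock() idea: assert the order before taking fd_lock. */
#define fdplock()	do {						\
	NET_ASSERT_UNLOCKED();						\
	pthread_rwlock_wrlock(&fd_lock);				\
} while (0)
#define fdpunlock()	pthread_rwlock_unlock(&fd_lock)

int
main(void)
{
	fdplock();		/* correct order: fd_lock, then netlock */
	NET_LOCK();
	NET_UNLOCK();
	fdpunlock();

	NET_LOCK();
	/* fdplock(); would abort here: net lock was taken first. */
	NET_UNLOCK();
	return 0;
}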

> Please fix this properly. Right now running fstat is like playing Russian
> roulette.

Shared netlock is possibly Russian roulette with fewer bullets
compared to exclusive netlock.  With the pcb mutex and pru_lock the
situation gets better for network sockets.  For non-network sockets
this diff does rw_enter_write(&so->so_lock).  Do we need that?  It
means more sleeping points and more opportunities to crash.

bluhm

> > Index: sys/kern/kern_sysctl.c
> > ===================================================================
> > RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> > retrieving revision 1.411
> > diff -u -p -r1.411 kern_sysctl.c
> > --- sys/kern/kern_sysctl.c	22 Jan 2023 12:05:44 -0000	1.411
> > +++ sys/kern/kern_sysctl.c	27 Apr 2023 10:18:01 -0000
> > @@ -1173,14 +1173,11 @@ fill_file(struct kinfo_file *kf, struct 
> >  
> > if (so == NULL) {
> > so = (struct socket *)fp->f_data;
> > +   solock_shared(so);
> > +   locked = 1;
> > +   } else {
> > /* if so is passed as parameter it is already locked */
> > -   switch (so->so_proto->pr_domain->dom_family) {
> > -   case AF_INET:
> > -   case AF_INET6:
> > -   NET_LOCK();
> > -   locked = 1;
> > -   break;
> > -   }
> > +   soassertlocked(so);
> > }
> >  
> > kf->so_type = so->so_type;
> > @@ -1203,14 +1200,13 @@ fill_file(struct kinfo_file *kf, struct 
> > kf->so_splicelen = -1;
> > if (so->so_pcb == NULL) {
> > if (locked)
> > -   NET_UNLOCK();
> > +   sounlock_shared(so);
> > break;
> > }
> > switch (kf->so_family) {
> > case AF_INET: {
> > struct inpcb *inpcb = so->so_pcb;
> >  
> > -   NET_ASSERT_LOCKED();
> > if (show_pointers)
> > kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb);
> > kf->inp_lport = inpcb->inp_lport;
> > @@ -1232,7 +1228,6 @@ fill_file(struct kinfo_file *kf, struct 
> > case AF_INET6: {
> > struct inpcb *inpcb = so->so_pcb;
> >  
> > -   NET_ASSERT_LOCKED();
> > if (show_pointers)
> > kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb);
> > kf->inp_lport = inpcb->inp_lport;
> > @@ -1279,7 +1274,7 @@ fill_file(struct kinfo_file *kf, struct 
> > }
> > }
> > if (locked)
> > -   NET_UNLOCK();
> > +   sounlock_shared(so);
> > break;
> > }
> >  
> > @@ -1375,7 +1370,7 @@ sysctl_file(int *name, u_int namelen, ch
> > if (arg == DTYPE_SOCKET) {
> > struct inpcb *inp;
> >  
> > -   NET_LOCK();
> > +   NET_LOCK_SHARED();
> > mtx_enter(&tcbtable.inpt_mtx);
> > TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
> > FILLSO(inp->inp_socket);
> > @@ -1395,7 +1390,7 @@ sysctl_file(int *name, u_int namelen, ch
> > FILLSO(inp->inp_socket);
> > mtx_leave(&rawin6pcbtable.inpt_mtx);
> >  #endif
> > -   NET_UNLOCK();
> > +   NET_UNLOCK_SHARED();
> > }
> > fp = NULL;
> > while ((fp = fd_iterfile(fp, p)) != NULL) {
> > 
> 
> -- 
> :wq Claudio



Re: fill_file(): use solock_shared() to protect socket data

2023-04-27 Thread Claudio Jeker
On Thu, Apr 27, 2023 at 01:55:33PM +0300, Vitaliy Makkoveev wrote:
> Now only the direct netlock is used for inet socket protection. The
> unlocked access to all other sockets is safe, but we could lose
> consistency for a little while. Since solock() is used for socket
> protection, make the locking path common and use it. Make it shared
> because this is read-only access to socket data. For the same reason,
> use the shared netlock while performing the inpcb tables foreach
> walkthrough.

This is still wrong. fill_file() is not allowed to sleep, so you can not
use an rwlock in here. fill_file() is called inside an allprocess
LIST_FOREACH loop, and sleeping there will blow up.
 
Maybe it is enough to grab a shared NETLOCK around the LIST_FOREACH() to
ensure that we don't sleep inside.

Please fix this properly. Right now running fstat is like playing Russian
roulette.

> Index: sys/kern/kern_sysctl.c
> ===================================================================
> RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
> retrieving revision 1.411
> diff -u -p -r1.411 kern_sysctl.c
> --- sys/kern/kern_sysctl.c	22 Jan 2023 12:05:44 -0000	1.411
> +++ sys/kern/kern_sysctl.c	27 Apr 2023 10:18:01 -0000
> @@ -1173,14 +1173,11 @@ fill_file(struct kinfo_file *kf, struct 
>  
>   if (so == NULL) {
>   so = (struct socket *)fp->f_data;
> + solock_shared(so);
> + locked = 1;
> + } else {
>   /* if so is passed as parameter it is already locked */
> - switch (so->so_proto->pr_domain->dom_family) {
> - case AF_INET:
> - case AF_INET6:
> - NET_LOCK();
> - locked = 1;
> - break;
> - }
> + soassertlocked(so);
>   }
>  
>   kf->so_type = so->so_type;
> @@ -1203,14 +1200,13 @@ fill_file(struct kinfo_file *kf, struct 
>   kf->so_splicelen = -1;
>   if (so->so_pcb == NULL) {
>   if (locked)
> - NET_UNLOCK();
> + sounlock_shared(so);
>   break;
>   }
>   switch (kf->so_family) {
>   case AF_INET: {
>   struct inpcb *inpcb = so->so_pcb;
>  
> - NET_ASSERT_LOCKED();
>   if (show_pointers)
>   kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb);
>   kf->inp_lport = inpcb->inp_lport;
> @@ -1232,7 +1228,6 @@ fill_file(struct kinfo_file *kf, struct 
>   case AF_INET6: {
>   struct inpcb *inpcb = so->so_pcb;
>  
> - NET_ASSERT_LOCKED();
>   if (show_pointers)
>   kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb);
>   kf->inp_lport = inpcb->inp_lport;
> @@ -1279,7 +1274,7 @@ fill_file(struct kinfo_file *kf, struct 
>   }
>   }
>   if (locked)
> - NET_UNLOCK();
> + sounlock_shared(so);
>   break;
>   }
>  
> @@ -1375,7 +1370,7 @@ sysctl_file(int *name, u_int namelen, ch
>   if (arg == DTYPE_SOCKET) {
>   struct inpcb *inp;
>  
> - NET_LOCK();
> + NET_LOCK_SHARED();
>   mtx_enter(&tcbtable.inpt_mtx);
>   TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
>   FILLSO(inp->inp_socket);
> @@ -1395,7 +1390,7 @@ sysctl_file(int *name, u_int namelen, ch
>   FILLSO(inp->inp_socket);
>   mtx_leave(&rawin6pcbtable.inpt_mtx);
>  #endif
> - NET_UNLOCK();
> + NET_UNLOCK_SHARED();
>   }
>   fp = NULL;
>   while ((fp = fd_iterfile(fp, p)) != NULL) {
> 

-- 
:wq Claudio



fill_file(): use solock_shared() to protect socket data

2023-04-27 Thread Vitaliy Makkoveev
Now only the direct netlock is used for inet socket protection. The
unlocked access to all other sockets is safe, but we could lose
consistency for a little while. Since solock() is used for socket
protection, make the locking path common and use it. Make it shared
because this is read-only access to socket data. For the same reason,
use the shared netlock while performing the inpcb tables foreach
walkthrough.

Index: sys/kern/kern_sysctl.c
===================================================================
RCS file: /cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.411
diff -u -p -r1.411 kern_sysctl.c
--- sys/kern/kern_sysctl.c	22 Jan 2023 12:05:44 -0000	1.411
+++ sys/kern/kern_sysctl.c	27 Apr 2023 10:18:01 -0000
@@ -1173,14 +1173,11 @@ fill_file(struct kinfo_file *kf, struct 
 
if (so == NULL) {
so = (struct socket *)fp->f_data;
+   solock_shared(so);
+   locked = 1;
+   } else {
/* if so is passed as parameter it is already locked */
-   switch (so->so_proto->pr_domain->dom_family) {
-   case AF_INET:
-   case AF_INET6:
-   NET_LOCK();
-   locked = 1;
-   break;
-   }
+   soassertlocked(so);
}
 
kf->so_type = so->so_type;
@@ -1203,14 +1200,13 @@ fill_file(struct kinfo_file *kf, struct 
kf->so_splicelen = -1;
if (so->so_pcb == NULL) {
if (locked)
-   NET_UNLOCK();
+   sounlock_shared(so);
break;
}
switch (kf->so_family) {
case AF_INET: {
struct inpcb *inpcb = so->so_pcb;
 
-   NET_ASSERT_LOCKED();
if (show_pointers)
kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb);
kf->inp_lport = inpcb->inp_lport;
@@ -1232,7 +1228,6 @@ fill_file(struct kinfo_file *kf, struct 
case AF_INET6: {
struct inpcb *inpcb = so->so_pcb;
 
-   NET_ASSERT_LOCKED();
if (show_pointers)
kf->inp_ppcb = PTRTOINT64(inpcb->inp_ppcb);
kf->inp_lport = inpcb->inp_lport;
@@ -1279,7 +1274,7 @@ fill_file(struct kinfo_file *kf, struct 
}
}
if (locked)
-   NET_UNLOCK();
+   sounlock_shared(so);
break;
}
 
@@ -1375,7 +1370,7 @@ sysctl_file(int *name, u_int namelen, ch
if (arg == DTYPE_SOCKET) {
struct inpcb *inp;
 
-   NET_LOCK();
+   NET_LOCK_SHARED();
mtx_enter(&tcbtable.inpt_mtx);
TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue)
FILLSO(inp->inp_socket);
@@ -1395,7 +1390,7 @@ sysctl_file(int *name, u_int namelen, ch
FILLSO(inp->inp_socket);
mtx_leave(&rawin6pcbtable.inpt_mtx);
 #endif
-   NET_UNLOCK();
+   NET_UNLOCK_SHARED();
}
fp = NULL;
while ((fp = fd_iterfile(fp, p)) != NULL) {