Re: fix races in if_clone_create()

2020-07-13 Thread Vitaliy Makkoveev
On Mon, Jul 13, 2020 at 12:52:15PM +0300, Vitaliy Makkoveev wrote:
> On Mon, Jul 13, 2020 at 09:53:44AM +0200, Martin Pieuchot wrote:
> > On 06/07/20(Mon) 15:44, Vitaliy Makkoveev wrote:
> > > > On 6 Jul 2020, at 12:17, Martin Pieuchot  wrote:
> > > > Assertions and documentation are more important than preventing races
> > > > because they allow to build awareness and elegant solutions instead of
> > > > hacking diffs until stuff work without knowing why.
> > > > 
> > > > There are two cases where `ifp' are inserted into `ifnet':
> > > > 1. by autoconf during boot or hotplug
> > > > 2. by cloning ioctl
> > > > 
> > > > In the second case it is always about pseudo-devices.  So the assertion
> > > > should be conditional like:
> > > > 
> > > > if (ISSET(ifp->if_xflags, IFXF_CLONED))
> > > > rw_assert_wrlock(_lock);
> > > > 
> > > > In other words this fixes serializes insertions/removal on the global
> > > > list `ifnet', the KERNEL_LOCK() being still required for reading it.
> > > > 
> > > > Is there any other data structure which ends up being protected by this
> > > > approach and could be documented?
> > > 
> > > We should be sure there is no multiple `ifnet’s in `if_list’ with the same
> > > `if_xname’.
> > 
> > That's a symptom of a bug.  Checking for a symptom won't prevent another
> > type of corruption, maybe next time it will be a corrupted pointer?
> 
> Absolutely no. You don't break the list do you understand this?
> 
> > 
> > > And the assertion you proposed looks not obvious here.
> > 
> > Why, is it because of the if() check?  That's required unless we change
> > put all if_attach() functions under the lock which would require changing
> > all driver in-tree.  However since drivers for physical devices are being
> > attached without having multiple CPUs running there's no possible race.
> 
> Because we should keep `if_list’ be linked by objects with unique
> `if_xname’. The modificatios of this list is not the problem. Problem is
> inconsistency caused by not unique `if_xname'. You are not fixing
> problem, just simptom.
> 
> > 
> > > Assertion like below looks more reasonable but introduces performance
> > > impact.
> > 
> > We should first aim for correctness then performance.  In this case,
> > performance is not even an issue because interfaces are not created
> > often compared to the rate of processing packets.
> >
> 
> Well, so let's do "KASSERT(ifunit(ifp->if_xname) == NULL);" here.

Updated diff below.
Now we have assertion in if_attach{,head}().

Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.612
diff -u -p -r1.612 if.c
--- sys/net/if.c10 Jul 2020 13:23:34 -  1.612
+++ sys/net/if.c13 Jul 2020 10:50:44 -
@@ -155,6 +155,8 @@ int if_getgrouplist(caddr_t);
 void   if_linkstate(struct ifnet *);
 void   if_linkstate_task(void *);
 
+intif_clone_lock(struct if_clone *);
+void   if_clone_unlock(struct if_clone *);
 intif_clone_list(struct if_clonereq *);
 struct if_clone*if_clone_lookup(const char *, int *);
 
@@ -524,6 +526,7 @@ if_attachhead(struct ifnet *ifp)
 {
if_attach_common(ifp);
NET_LOCK();
+   KASSERT(ifunit(ifp->if_xname) == NULL);
TAILQ_INSERT_HEAD(, ifp, if_list);
if_attachsetup(ifp);
NET_UNLOCK();
@@ -534,6 +537,7 @@ if_attach(struct ifnet *ifp)
 {
if_attach_common(ifp);
NET_LOCK();
+   KASSERT(ifunit(ifp->if_xname) == NULL);
TAILQ_INSERT_TAIL(, ifp, if_list);
if_attachsetup(ifp);
NET_UNLOCK();
@@ -1244,27 +1248,35 @@ if_clone_create(const char *name, int rd
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int unit, ret;
+   int unit, error;
 
ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
 
-   if (ifunit(name) != NULL)
-   return (EEXIST);
+   error = if_clone_lock(ifc);
+   if (error != 0)
+   return (error);
+
+   if (ifunit(name) != NULL) {
+   error = (EEXIST);
+   goto unlock;
+   }
 
-   ret = (*ifc->ifc_create)(ifc, unit);
+   error = (*ifc->ifc_create)(ifc, unit);
 
-   if (ret != 0 || (ifp = ifunit(name)) == NULL)
-   return (ret);
+   if (error != 0 || (ifp = ifunit(name)) == NULL)
+   goto unlock;
 
NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
if (rdomain != 0)
if_setrdomain(ifp, rdomain);
NET_UNLOCK();
+unlock:
+   if_clone_unlock(ifc);
 
-   return (ret);
+   return (error);
 }
 
 /*
@@ -1275,18 +1287,26 @@ if_clone_destroy(const char *name)
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int ret;
+   int error;
 
ifc = if_clone_lookup(name, NULL);
if (ifc == NULL)
return (EINVAL);
 
+   error = 

Re: fix races in if_clone_create()

2020-07-13 Thread Vitaliy Makkoveev
On Mon, Jul 13, 2020 at 09:53:44AM +0200, Martin Pieuchot wrote:
> On 06/07/20(Mon) 15:44, Vitaliy Makkoveev wrote:
> > > On 6 Jul 2020, at 12:17, Martin Pieuchot  wrote:
> > > Assertions and documentation are more important than preventing races
> > > because they allow to build awareness and elegant solutions instead of
> > > hacking diffs until stuff work without knowing why.
> > > 
> > > There are two cases where `ifp' are inserted into `ifnet':
> > > 1. by autoconf during boot or hotplug
> > > 2. by cloning ioctl
> > > 
> > > In the second case it is always about pseudo-devices.  So the assertion
> > > should be conditional like:
> > > 
> > >   if (ISSET(ifp->if_xflags, IFXF_CLONED))
> > >   rw_assert_wrlock(_lock);
> > > 
> > > In other words this fixes serializes insertions/removal on the global
> > > list `ifnet', the KERNEL_LOCK() being still required for reading it.
> > > 
> > > Is there any other data structure which ends up being protected by this
> > > approach and could be documented?
> > 
> > We should be sure there is no multiple `ifnet’s in `if_list’ with the same
> > `if_xname’.
> 
> That's a symptom of a bug.  Checking for a symptom won't prevent another
> type of corruption, maybe next time it will be a corrupted pointer?

Absolutely no. You don't break the list do you understand this?

> 
> > And the assertion you proposed looks not obvious here.
> 
> Why, is it because of the if() check?  That's required unless we change
> put all if_attach() functions under the lock which would require changing
> all driver in-tree.  However since drivers for physical devices are being
> attached without having multiple CPUs running there's no possible race.

Because we should keep `if_list’ be linked by objects with unique
`if_xname’. The modificatios of this list is not the problem. Problem is
inconsistency caused by not unique `if_xname'. You are not fixing
problem, just simptom.

> 
> > Assertion like below looks more reasonable but introduces performance
> > impact.
> 
> We should first aim for correctness then performance.  In this case,
> performance is not even an issue because interfaces are not created
> often compared to the rate of processing packets.
>

Well, so let's do "KASSERT(ifunit(ifp->if_xname) == NULL);" here.



Re: fix races in if_clone_create()

2020-07-13 Thread Martin Pieuchot
On 06/07/20(Mon) 15:44, Vitaliy Makkoveev wrote:
> > On 6 Jul 2020, at 12:17, Martin Pieuchot  wrote:
> > Assertions and documentation are more important than preventing races
> > because they allow to build awareness and elegant solutions instead of
> > hacking diffs until stuff work without knowing why.
> > 
> > There are two cases where `ifp' are inserted into `ifnet':
> > 1. by autoconf during boot or hotplug
> > 2. by cloning ioctl
> > 
> > In the second case it is always about pseudo-devices.  So the assertion
> > should be conditional like:
> > 
> > if (ISSET(ifp->if_xflags, IFXF_CLONED))
> > rw_assert_wrlock(_lock);
> > 
> > In other words this fixes serializes insertions/removal on the global
> > list `ifnet', the KERNEL_LOCK() being still required for reading it.
> > 
> > Is there any other data structure which ends up being protected by this
> > approach and could be documented?
> 
> We should be sure there is no multiple `ifnet’s in `if_list’ with the same
> `if_xname’.

That's a symptom of a bug.  Checking for a symptom won't prevent another
type of corruption, maybe next time it will be a corrupted pointer?

> And the assertion you proposed looks not obvious here.

Why, is it because of the if() check?  That's required unless we change
put all if_attach() functions under the lock which would require changing
all driver in-tree.  However since drivers for physical devices are being
attached without having multiple CPUs running there's no possible race.

> Assertion like below looks more reasonable but introduces performance
> impact.

We should first aim for correctness then performance.  In this case,
performance is not even an issue because interfaces are not created
often compared to the rate of processing packets.



Re: fix races in if_clone_create()

2020-07-06 Thread Vitaliy Makkoveev



> On 6 Jul 2020, at 12:17, Martin Pieuchot  wrote:
> 
> On 01/07/20(Wed) 00:02, Vitaliy Makkoveev wrote:
>> On Tue, Jun 30, 2020 at 03:48:22PM +0300, Vitaliy Makkoveev wrote:
>>> On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
 On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> [...] 
> I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> takes couple of minutes to take panic on 4 cores. Also some screenshots
> attached.
 
 Setting kern.pool_debug=2 makes the race reproducible in seconds.
>> 
>> Unfortunately you will catch splassert() caused by kern/sched_bsd.c:304.
>> malloc() will call yield() while we are holding NET_LOCK(). I attached
>> screenshot with splassertion to this mail.
> 
> With kern.splassert < 3 it is fine. 
> 
 Could you turn this test into something committable in regress/?  We can
 link it to the build once a fix is committed.
 
>>> 
>>> We have 3 races with cloned interfaces:
>>> 1. if_clone_create() vs if_clone_create()
>>> 2. if_clone_destroy() vs if_clone_destroy()
>>> 3. if_clone_destroy() vs the rest of stack
>>> 
>>> It makes sences to commit unified test to regress/, so I suggest to wait
>>> a little.
>> 
>> The another solution.
>> 
>> Diff below introduces per-`ifc' serialization for if_clone_create() and
>> if_clone_destroy(). There is no index bitmap anymore.
> 
> I like the simplification.  More comments below:
> 
>> +/*
>> + * Lock a clone network interface.
>> + */
>> +int
>> +if_clone_lock(struct if_clone *ifc)
>> +{
>> +int error;
>> +
>> +rw_enter_write(>ifc_lock);
>> +
>> +while (ifc->ifc_flags & IFC_CREATE_LOCKED) {
>> +ifc->ifc_flags |= IFC_CREATE_LOCKWAIT;
>> +error = rwsleep_nsec(>ifc_flags, >ifc_lock,
>> +PWAIT|PCATCH, "ifclk", INFSLP);
>> +if(error != 0) {
>> +ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
>> +rw_exit_write(>ifc_lock);
>> +return error;
>> +}
>> +}
>> +ifc->ifc_flags |= IFC_CREATE_LOCKED;
>> +ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
>> +
>> +rw_exit_write(>ifc_lock);
>> +
>> +return 0;
>> +}
> 
> This is like re-implementing a rwlock but loosing the debugging ability of
> WITNESS.

The reason to do this is to avoid call `ifc_create’ with rwlock held.
We have unique sleep points for each underlaying routine for `ifc_create’,
so this "rwlock reimplementation" looks better. Also this lock is used in
one place only and impact of loosing debugging ability is not such big.

> 
> I also don't see any reason for having a per-ifc lock.  If, at least one
> of the problems, is a double insert in `ifnet' then we should be able to
> assert that a lock is held when doing such assertion.

This race breaks ifunit() not `if_list’. I mean LIST_*() operation are
not broken because `le_{prev,next}’ are valid, but list is inconsistent of 
course.

Since only "ifconfig clonerA0 create& ifconfig clonerA0 create” will
break, I see no reason to deny simultaneous execution of
“ifconfig clonerA0 create& ifconfig clonerB0 create”.

Let this lock be per `ifc’ not global.

> 
> Assertions and documentation are more important than preventing races
> because they allow to build awareness and elegant solutions instead of
> hacking diffs until stuff work without knowing why.
> 
> There are two cases where `ifp' are inserted into `ifnet':
> 1. by autoconf during boot or hotplug
> 2. by cloning ioctl
> 
> In the second case it is always about pseudo-devices.  So the assertion
> should be conditional like:
> 
>   if (ISSET(ifp->if_xflags, IFXF_CLONED))
>   rw_assert_wrlock(_lock);
> 
> In other words this fixes serializes insertions/removal on the global
> list `ifnet', the KERNEL_LOCK() being still required for reading it.
> 
> Is there any other data structure which ends up being protected by this
> approach and could be documented?

We should be sure there is no multiple `ifnet’s in `if_list’ with the same
`if_xname’. And the assertion you proposed looks not obvious here.
Assertion like below looks more reasonable but introduces performance
impact.

 cut begin 
void
if_attach(struct ifnet *ifp)
{
if_attach_common(ifp);
NET_LOCK();
KASSERT(ifunit(ifp->if_xname) == NULL);
TAILQ_INSERT_TAIL(, ifp, if_list);
if_attachsetup(ifp);
NET_UNLOCK();
}
 cut end 

I guess the commentary within if_clone_create() is the best solution.
Something like this: “Deny simultaneous execution to prevent multiple
creation of interfaces with the same name”.


Re: fix races in if_clone_create()

2020-07-06 Thread Martin Pieuchot
On 01/07/20(Wed) 00:02, Vitaliy Makkoveev wrote:
> On Tue, Jun 30, 2020 at 03:48:22PM +0300, Vitaliy Makkoveev wrote:
> > On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
> > > On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> > > > [...] 
> > > > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > > > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > > > attached.
> > > 
> > > Setting kern.pool_debug=2 makes the race reproducible in seconds.
> 
> Unfortunately you will catch splassert() caused by kern/sched_bsd.c:304.
> malloc() will call yield() while we are holding NET_LOCK(). I attached
> screenshot with splassertion to this mail.

With kern.splassert < 3 it is fine. 

> > > Could you turn this test into something committable in regress/?  We can
> > > link it to the build once a fix is committed.
> > > 
> > 
> > We have 3 races with cloned interfaces:
> > 1. if_clone_create() vs if_clone_create()
> > 2. if_clone_destroy() vs if_clone_destroy()
> > 3. if_clone_destroy() vs the rest of stack
> > 
> > It makes sences to commit unified test to regress/, so I suggest to wait
> > a little.
> 
> The another solution.
> 
> Diff below introduces per-`ifc' serialization for if_clone_create() and
> if_clone_destroy(). There is no index bitmap anymore.

I like the simplification.  More comments below:

> +/*
> + * Lock a clone network interface.
> + */
> +int
> +if_clone_lock(struct if_clone *ifc)
> +{
> + int error;
> +
> + rw_enter_write(>ifc_lock);
> +
> + while (ifc->ifc_flags & IFC_CREATE_LOCKED) {
> + ifc->ifc_flags |= IFC_CREATE_LOCKWAIT;
> + error = rwsleep_nsec(>ifc_flags, >ifc_lock,
> + PWAIT|PCATCH, "ifclk", INFSLP);
> + if(error != 0) {
> + ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
> + rw_exit_write(>ifc_lock);
> + return error;
> + }
> + }
> + ifc->ifc_flags |= IFC_CREATE_LOCKED;
> + ifc->ifc_flags &= ~IFC_CREATE_LOCKWAIT;
> +
> + rw_exit_write(>ifc_lock);
> + 
> + return 0;
> +}

This is like re-implementing a rwlock but loosing the debugging ability of
WITNESS.

I also don't see any reason for having a per-ifc lock.  If, at least one
of the problems, is a double insert in `ifnet' then we should be able to
assert that a lock is held when doing such assertion.

Assertions and documentation are more important than preventing races
because they allow to build awareness and elegant solutions instead of
hacking diffs until stuff work without knowing why.

There are two cases where `ifp' are inserted into `ifnet':
 1. by autoconf during boot or hotplug
 2. by cloning ioctl

In the second case it is always about pseudo-devices.  So the assertion
should be conditional like:

if (ISSET(ifp->if_xflags, IFXF_CLONED))
rw_assert_wrlock(_lock);

In other words this fixes serializes insertions/removal on the global
list `ifnet', the KERNEL_LOCK() being still required for reading it.

Is there any other data structure which ends up being protected by this
approach and could be documented?



Re: fix races in if_clone_create()

2020-07-03 Thread Vitaliy Makkoveev
ping?

> On 1 Jul 2020, at 00:02, Vitaliy Makkoveev  wrote:
> 
> On Tue, Jun 30, 2020 at 03:48:22PM +0300, Vitaliy Makkoveev wrote:
>> On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
>>> On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
 [...] 
 I reworked tool for reproduce. Now I avoided fork()/exec() route and it
 takes couple of minutes to take panic on 4 cores. Also some screenshots
 attached.
>>> 
>>> Setting kern.pool_debug=2 makes the race reproducible in seconds.
> 
> Unfortunately you will catch splassert() caused by kern/sched_bsd.c:304.
> malloc() will call yield() while we are holding NET_LOCK(). I attached
> screenshot with splassertion to this mail.
> 
>>> 
>>> Could you turn this test into something committable in regress/?  We can
>>> link it to the build once a fix is committed.
>>> 
>> 
>> We have 3 races with cloned interfaces:
>> 1. if_clone_create() vs if_clone_create()
>> 2. if_clone_destroy() vs if_clone_destroy()
>> 3. if_clone_destroy() vs the rest of stack
>> 
>> It makes sences to commit unified test to regress/, so I suggest to wait
>> a little.
> 
> The another solution.
> 
> Diff below introduces per-`ifc' serialization for if_clone_create() and
> if_clone_destroy(). There is no index bitmap anymore.
> 
> Diff fixes the following races:
> 1. if_clone_create() vs if_clone_create()
> 2. if_clone_destroy() vs if_clone_destroy()
> 
> `ifc_create' will go the same lock path for each cloner, and
> `ifc_destroy' will go this path but in reverse order. It seems
> reasonable to allow simultaneous create/destroy for different cloners
> but since different instances of one cloner will block each other it's
> no reason have parallelism here.
> 
> Updated test tool
>  cut begin 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> 
> static struct ifreq ifr;
> 
> static void *clone_create(void *arg)
> {
>   int s;
> 
>   if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
>   err(1, "socket()");
>   while(1){
>   if(ioctl(s, SIOCIFCREATE, )<0)
>   if(errno==EINVAL)
>   exit(1);
>   }
> 
>   return NULL;
> }
> 
> static void *clone_destroy(void *arg)
> {
>   int s;
> 
>   if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
>   err(1, "socket()");
>   while(1){
>   if(ioctl(s, SIOCIFDESTROY, )<0)
>   if(errno==EINVAL)
>   exit(1);
>   }
> 
>   return NULL;
> }
> 
> int main(int argc, char *argv[])
> {
>   pthread_t thr;
>   int i;
> 
>   if(argc!=2){
>   fprintf(stderr, "usage: %s ifname\n", getprogname());
>   return 1;
>   }
> 
>   if(getuid()!=0){
>   fprintf(stderr, "should be root\n");
>   return 1;
>   }
> 
>   memset(, 0, sizeof(ifr));
>   strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
> 
>   for(i=0; i<8*4; ++i){
>   if(pthread_create(, NULL, clone_create, NULL)!=0)
>   errx(1, "pthread_create(clone_create)");
>   if(pthread_create(, NULL, clone_destroy, NULL)!=0)
>   errx(1, "pthread_create(clone_destroy)");
>   }
> 
>   select(0, NULL, NULL, NULL, NULL);
> 
>   return 0;
> }
>  cut end 
> 
> 
> 
> Index: sys/net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.611
> diff -u -p -r1.611 if.c
> --- sys/net/if.c  30 Jun 2020 09:31:38 -  1.611
> +++ sys/net/if.c  30 Jun 2020 20:41:50 -
> @@ -155,6 +155,8 @@ int   if_getgrouplist(caddr_t);
> void  if_linkstate(struct ifnet *);
> void  if_linkstate_task(void *);
> 
> +int  if_clone_lock(struct if_clone *);
> +void if_clone_unlock(struct if_clone *);
> int   if_clone_list(struct if_clonereq *);
> struct if_clone   *if_clone_lookup(const char *, int *);
> 
> @@ -1244,27 +1246,35 @@ if_clone_create(const char *name, int rd
> {
>   struct if_clone *ifc;
>   struct ifnet *ifp;
> - int unit, ret;
> + int unit, error;
> 
>   ifc = if_clone_lookup(name, );
>   if (ifc == NULL)
>   return (EINVAL);
> 
> - if (ifunit(name) != NULL)
> - return (EEXIST);
> + error = if_clone_lock(ifc);
> + if (error != 0)
> + return (error);
> +
> + if (ifunit(name) != NULL) {
> + error = (EEXIST);
> + goto unlock;
> + }
> 
> - ret = (*ifc->ifc_create)(ifc, unit);
> + error = (*ifc->ifc_create)(ifc, unit);
> 
> - if (ret != 0 || (ifp = ifunit(name)) == NULL)
> - return (ret);
> + if (error != 0 || (ifp = ifunit(name)) == NULL)
> + goto unlock;
> 
>   NET_LOCK();
>   if_addgroup(ifp, ifc->ifc_name);
>   if (rdomain != 0)
>   

Re: fix races in if_clone_create()

2020-06-30 Thread Vitaliy Makkoveev
On Tue, Jun 30, 2020 at 03:48:22PM +0300, Vitaliy Makkoveev wrote:
> On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
> > On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> > > [...] 
> > > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > > attached.
> > 
> > Setting kern.pool_debug=2 makes the race reproducible in seconds.

Unfortunately you will catch splassert() caused by kern/sched_bsd.c:304.
malloc() will call yield() while we are holding NET_LOCK(). I attached
screenshot with splassertion to this mail.

> > 
> > Could you turn this test into something committable in regress/?  We can
> > link it to the build once a fix is committed.
> > 
> 
> We have 3 races with cloned interfaces:
> 1. if_clone_create() vs if_clone_create()
> 2. if_clone_destroy() vs if_clone_destroy()
> 3. if_clone_destroy() vs the rest of stack
> 
> It makes sences to commit unified test to regress/, so I suggest to wait
> a little.

The another solution.

Diff below introduces per-`ifc' serialization for if_clone_create() and
if_clone_destroy(). There is no index bitmap anymore.

Diff fixes the following races:
1. if_clone_create() vs if_clone_create()
2. if_clone_destroy() vs if_clone_destroy()

`ifc_create' will go the same lock path for each cloner, and
`ifc_destroy' will go this path but in reverse order. It seems
reasonable to allow simultaneous create/destroy for different cloners
but since different instances of one cloner will block each other it's
no reason have parallelism here.

Updated test tool
 cut begin 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

static struct ifreq ifr;

static void *clone_create(void *arg)
{
int s;

if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
err(1, "socket()");
while(1){
if(ioctl(s, SIOCIFCREATE, )<0)
if(errno==EINVAL)
exit(1);
}

return NULL;
}

static void *clone_destroy(void *arg)
{
int s;

if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
err(1, "socket()");
while(1){
if(ioctl(s, SIOCIFDESTROY, )<0)
if(errno==EINVAL)
exit(1);
}

return NULL;
}

int main(int argc, char *argv[])
{
pthread_t thr;
int i;

if(argc!=2){
fprintf(stderr, "usage: %s ifname\n", getprogname());
return 1;
}

if(getuid()!=0){
fprintf(stderr, "should be root\n");
return 1;
}

memset(, 0, sizeof(ifr));
strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));

for(i=0; i<8*4; ++i){
if(pthread_create(, NULL, clone_create, NULL)!=0)
errx(1, "pthread_create(clone_create)");
if(pthread_create(, NULL, clone_destroy, NULL)!=0)
errx(1, "pthread_create(clone_destroy)");
}

select(0, NULL, NULL, NULL, NULL);

return 0;
}
 cut end 



Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.611
diff -u -p -r1.611 if.c
--- sys/net/if.c30 Jun 2020 09:31:38 -  1.611
+++ sys/net/if.c30 Jun 2020 20:41:50 -
@@ -155,6 +155,8 @@ int if_getgrouplist(caddr_t);
 void   if_linkstate(struct ifnet *);
 void   if_linkstate_task(void *);
 
+intif_clone_lock(struct if_clone *);
+void   if_clone_unlock(struct if_clone *);
 intif_clone_list(struct if_clonereq *);
 struct if_clone*if_clone_lookup(const char *, int *);
 
@@ -1244,27 +1246,35 @@ if_clone_create(const char *name, int rd
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int unit, ret;
+   int unit, error;
 
ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
 
-   if (ifunit(name) != NULL)
-   return (EEXIST);
+   error = if_clone_lock(ifc);
+   if (error != 0)
+   return (error);
+
+   if (ifunit(name) != NULL) {
+   error = (EEXIST);
+   goto unlock;
+   }
 
-   ret = (*ifc->ifc_create)(ifc, unit);
+   error = (*ifc->ifc_create)(ifc, unit);
 
-   if (ret != 0 || (ifp = ifunit(name)) == NULL)
-   return (ret);
+   if (error != 0 || (ifp = ifunit(name)) == NULL)
+   goto unlock;
 
NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
if (rdomain != 0)
if_setrdomain(ifp, rdomain);
NET_UNLOCK();
+unlock:
+   if_clone_unlock(ifc);
 
-   return (ret);
+   return (error);
 }
 
 /*
@@ -1275,18 +1285,26 @@ if_clone_destroy(const char *name)
 {
struct if_clone *ifc;
 

Re: fix races in if_clone_create()

2020-06-30 Thread Vitaliy Makkoveev
On Tue, Jun 30, 2020 at 12:08:03PM +0200, Martin Pieuchot wrote:
> On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> > [...] 
> > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > attached.
> 
> Setting kern.pool_debug=2 makes the race reproducible in seconds.
> 
> Could you turn this test into something committable in regress/?  We can
> link it to the build once a fix is committed.
> 

We have 3 races with cloned interfaces:
1. if_clone_create() vs if_clone_create()
2. if_clone_destroy() vs if_clone_destroy()
3. if_clone_destroy() vs the rest of stack

It makes sences to commit unified test to regress/, so I suggest to wait
a little.

> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > #include 
> > 
> > static struct ifreq ifr;
> > 
> > static void *clone_create(void *arg)
> > {
> > int s;
> > 
> > if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> > err(1, "socket()");
> > while(1){
> > if(ioctl(s, SIOCIFCREATE, )<0)
> > if(errno==EINVAL)
> > exit(1);
> > }
> > 
> > return NULL;
> > }
> > 
> > static void *clone_destroy(void *arg)
> > {
> > int s;
> > 
> > if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
> > err(1, "socket()");
> > while(1){
> > if(ioctl(s, SIOCIFDESTROY, )<0)
> > if(errno==EINVAL)
> > exit(1);
> > }
> > 
> > return NULL;
> > }
> > 
> > int main(int argc, char *argv[])
> > {
> > pthread_t thr;
> > int i;
> > 
> > if(argc!=2){
> > fprintf(stderr, "usage: %s ifname\n", getprogname());
> > return 1;
> > }
> > 
> > if(getuid()!=0){
> > fprintf(stderr, "should be root\n");
> > return 1;
> > }
> > 
> > memset(, 0, sizeof(ifr));
> > strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
> > 
> > for(i=0; i<8*4; ++i){
> > if(pthread_create(, NULL, clone_create, NULL)!=0)
> > errx(1, "pthread_create(clone_create)");
> > }
> > 
> > clone_destroy(NULL);
> > 
> > return 0;
> > }
> > 
> >  cut end 
> 
> 
> 
> 
> 



Re: fix races in if_clone_create()

2020-06-30 Thread Martin Pieuchot
On 29/06/20(Mon) 11:59, Vitaliy Makkoveev wrote:
> [...] 
> I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> takes couple of minutes to take panic on 4 cores. Also some screenshots
> attached.

Setting kern.pool_debug=2 makes the race reproducible in seconds.

Could you turn this test into something committable in regress/?  We can
link it to the build once a fix is committed.

> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> 
> static struct ifreq ifr;
> 
> static void *clone_create(void *arg)
> {
>   int s;
> 
>   if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
>   err(1, "socket()");
>   while(1){
>   if(ioctl(s, SIOCIFCREATE, )<0)
>   if(errno==EINVAL)
>   exit(1);
>   }
> 
>   return NULL;
> }
> 
> static void *clone_destroy(void *arg)
> {
>   int s;
> 
>   if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
>   err(1, "socket()");
>   while(1){
>   if(ioctl(s, SIOCIFDESTROY, )<0)
>   if(errno==EINVAL)
>   exit(1);
>   }
> 
>   return NULL;
> }
> 
> int main(int argc, char *argv[])
> {
>   pthread_t thr;
>   int i;
> 
>   if(argc!=2){
>   fprintf(stderr, "usage: %s ifname\n", getprogname());
>   return 1;
>   }
> 
>   if(getuid()!=0){
>   fprintf(stderr, "should be root\n");
>   return 1;
>   }
> 
>   memset(, 0, sizeof(ifr));
>   strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));
> 
>   for(i=0; i<8*4; ++i){
>   if(pthread_create(, NULL, clone_create, NULL)!=0)
>   errx(1, "pthread_create(clone_create)");
>   }
> 
>   clone_destroy(NULL);
> 
>   return 0;
> }
> 
>  cut end 







Re: fix races in if_clone_create()

2020-06-29 Thread Vitaliy Makkoveev
On Mon, Jun 29, 2020 at 04:27:50PM +0200, Hrvoje Popovski wrote:
> On 29.6.2020. 10:59, Vitaliy Makkoveev wrote:
> > I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> > takes couple of minutes to take panic on 4 cores. Also some screenshots
> > attached.
> > 
> > I hope anyone else will try it.
> 
> Hi,
> 
> i'm getting panic quite fast :)
> i will leave box in ddb if more information is needed
>

Thanks. Right now it takes seconds to catch panic at least with switch(4),
bridge(4), pflog(4), vether(4) and etherip(4). So you can leave ddb(4).

I like to someone will try my solution for this issue. And reviews are
welcomed :)

The latest diff below. If the `unit' was obtained it's guaranteed that
there is no pseudo interface with `name' is system. ifunit() now useless
here and can be dropped.


Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.610
diff -u -p -r1.610 if.c
--- sys/net/if.c22 Jun 2020 09:45:13 -  1.610
+++ sys/net/if.c29 Jun 2020 13:54:29 -
@@ -157,6 +157,8 @@ voidif_linkstate_task(void *);
 
 intif_clone_list(struct if_clonereq *);
 struct if_clone*if_clone_lookup(const char *, int *);
+intif_clone_alloc_unit(struct if_clone *, int);
+void   if_clone_rele_unit(struct if_clone *, int);
 
 intif_group_egress_build(void);
 
@@ -1244,19 +1246,18 @@ if_clone_create(const char *name, int rd
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int unit, ret;
+   int unit, error;
 
ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
+   error = if_clone_alloc_unit(ifc, unit);
+   if (error != 0)
+   return (error);
 
-   if (ifunit(name) != NULL)
-   return (EEXIST);
-
-   ret = (*ifc->ifc_create)(ifc, unit);
-
-   if (ret != 0 || (ifp = ifunit(name)) == NULL)
-   return (ret);
+   error = (*ifc->ifc_create)(ifc, unit);
+   if (error != 0 || (ifp = ifunit(name)) == NULL)
+   return (error);
 
NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
@@ -1264,7 +1265,7 @@ if_clone_create(const char *name, int rd
if_setrdomain(ifp, rdomain);
NET_UNLOCK();
 
-   return (ret);
+   return (0);
 }
 
 /*
@@ -1275,9 +1276,9 @@ if_clone_destroy(const char *name)
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int ret;
+   int ret, unit;
 
-   ifc = if_clone_lookup(name, NULL);
+   ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
 
@@ -1297,6 +1298,7 @@ if_clone_destroy(const char *name)
}
NET_UNLOCK();
ret = (*ifc->ifc_destroy)(ifp);
+   if_clone_rele_unit(ifc, unit);
 
return (ret);
 }
@@ -1342,12 +1344,95 @@ if_clone_lookup(const char *name, int *u
unit = (unit * 10) + (*cp++ - '0');
}
 
-   if (unitp != NULL)
-   *unitp = unit;
+   *unitp = unit;
return (ifc);
 }
 
 /*
+ * Allocate unit for cloned network interface.
+ */
+int if_clone_alloc_unit(struct if_clone *ifc, int unit)
+{
+   int word, bit, ret;
+
+   word = unit / (sizeof(*ifc->ifc_map) * 8);
+   bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+   rw_enter_write(>ifc_lock);
+
+   if(word >= ifc->ifc_map_size) {
+   u_long *map;
+   int size;
+
+   size = word + 1;
+   map = mallocarray(size, sizeof(*map), M_TEMP, M_WAITOK |
+   M_ZERO);
+
+   if (ifc->ifc_map != NULL) {
+   memcpy(map, ifc->ifc_map, ifc->ifc_map_size);
+   free(ifc->ifc_map, M_TEMP,
+   ifc->ifc_map_size * sizeof(*map));
+   }
+
+   ifc->ifc_map = map;
+   ifc->ifc_map_size = size;
+   }
+
+   if (ifc->ifc_map[word] & (1UL << bit))
+   ret = EEXIST;
+   else {
+   ifc->ifc_map[word] |= (1UL << bit);
+   ret = 0;
+   }
+
+   rw_exit_write(>ifc_lock);
+
+   return ret;
+}
+
+/*
+ * Release allocated unit for cloned network interface.
+ */
+void if_clone_rele_unit(struct if_clone *ifc, int unit)
+{
+   int word, bit;
+
+   word = unit / (sizeof(*ifc->ifc_map) * 8);
+   bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+   rw_enter_write(>ifc_lock);
+   KASSERT(word < ifc->ifc_map_size);
+
+   ifc->ifc_map[word] &= ~(1UL << bit);
+
+   if (ifc->ifc_map[word] == 0) {
+   u_long *map;
+   int size;
+
+   size = ifc->ifc_map_size - 2;
+   while (size>=0) {
+   if (ifc->ifc_map[size] != 0)
+   break;
+   --size;
+   }
+   if (size<0) {
+   size = 0;
+  

Re: fix races in if_clone_create()

2020-06-29 Thread Vitaliy Makkoveev
On Sat, Jun 27, 2020 at 12:10:24PM +0200, Martin Pieuchot wrote:
> On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> > On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > > if_clone_create() has the races caused by context switch.
> > > 
> > > Can you share a backtrace of such race?  Where does the kernel panic?
> > >
> > 
> > This diff was inspired by thread [1]. As I explained [2] here is 3
> > issues that cause panics produced by command below:
> > 
> >  cut begin 
> > for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> > ifconfig bridge0 destroy& done& done
> >  cut end 
> 
> Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
> managed to reproduce it with other pseudo-devices or just with bridge0?
> 
> > My system was stable with the last diff I did for thread [1]. But since
> > this final diff [3] which include fixes for tun(4) is quick and dirty
> > and not for commit I decided to make the diff to fix the races caused by
> > if_clone_create() at first.
> > 
> > I included screenshot with panic.
> 
> Thanks, interesting that the corruption happens on a list that should be
> initialized.  Does that mean the context switch on Thread 1 is happening
> before if_attach_common() is called?
> 
> You said your previous email that there's a context switch.  Do you know
> when it happens?  You could see that in ddb by looking at the backtrace
> of the other CPU.
> 
> Is the context switch leading to the race common to all pseudo-drivers
> or is it in the bridge(4) driver?
> 
> Regarding your solution, do I understand correctly that the goal is to
> serialize all if_clone_create()?  Is it really needed to remember which
> unit is being currently created or can't we just serialize all of them?
> 
> The fact that a lock is not held over the cloning operation is imho
> positive.
> 

I reworked tool for reproduce. Now I avoided fork()/exec() route and it
takes couple of minutes to take panic on 4 cores. Also some screenshots
attached.

I hope anyone else will try it.

 cut begin 

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

static struct ifreq ifr;

static void *clone_create(void *arg)
{
int s;

if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
err(1, "socket()");
while(1){
if(ioctl(s, SIOCIFCREATE, )<0)
if(errno==EINVAL)
exit(1);
}

return NULL;
}

static void *clone_destroy(void *arg)
{
int s;

if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
err(1, "socket()");
while(1){
if(ioctl(s, SIOCIFDESTROY, )<0)
if(errno==EINVAL)
exit(1);
}

return NULL;
}

int main(int argc, char *argv[])
{
pthread_t thr;
int i;

if(argc!=2){
fprintf(stderr, "usage: %s ifname\n", getprogname());
return 1;
}

if(getuid()!=0){
fprintf(stderr, "should be root\n");
return 1;
}

memset(, 0, sizeof(ifr));
strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));

for(i=0; i<8*4; ++i){
if(pthread_create(, NULL, clone_create, NULL)!=0)
errx(1, "pthread_create(clone_create)");
}

clone_destroy(NULL);

return 0;
}

 cut end 


Re: fix races in if_clone_create()

2020-06-29 Thread Vitaliy Makkoveev
screenshot


Re: fix races in if_clone_create()

2020-06-29 Thread Vitaliy Makkoveev
On Sat, Jun 27, 2020 at 12:10:24PM +0200, Martin Pieuchot wrote:
> On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> > On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > > if_clone_create() has the races caused by context switch.
> > > 
> > > Can you share a backtrace of such race?  Where does the kernel panic?
> > >
> > 
> > This diff was inspired by thread [1]. As I explained [2] here is 3
> > issues that cause panics produced by command below:
> > 
> >  cut begin 
> > for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> > ifconfig bridge0 destroy& done& done
> >  cut end 
> 
> Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
> managed to reproduce it with other pseudo-devices or just with bridge0?
> 
> > My system was stable with the last diff I did for thread [1]. But since
> > this final diff [3] which include fixes for tun(4) is quick and dirty
> > and not for commit I decided to make the diff to fix the races caused by
> > if_clone_create() at first.
> > 
> > I included screenshot with panic.
> 
> Thanks, interesting that the corruption happens on a list that should be
> initialized.  Does that mean the context switch on Thread 1 is happening
> before if_attach_common() is called?
> 
> You said your previous email that there's a context switch.  Do you know
> when it happens?  You could see that in ddb by looking at the backtrace
> of the other CPU.
> 
> Is the context switch leading to the race common to all pseudo-drivers
> or is it in the bridge(4) driver?
> 
> Regarding your solution, do I understand correctly that the goal is to
> serialize all if_clone_create()?  Is it really needed to remember which
> unit is being currently created or can't we just serialize all of them?
> 
> The fact that a lock is not held over the cloning operation is imho
> positive.
> 

I reworked tool for reproduce. Now I avoided fork()/exec() route and it
takes couple of minutes to take panic on 4 cores. I attached some
screenshots with panics caused by various pseudo-interfaces but my
previous mail was banned. I will try to attach them with separate mails.

I hope anyone else will try it. Now switch(4) is the bast way to
reproduce.

 cut begin 

#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 
#include 

static struct ifreq ifr;

static void *clone_create(void *arg)
{
int s;

if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
err(1, "socket()");
while(1){
if(ioctl(s, SIOCIFCREATE, )<0)
if(errno==EINVAL)
exit(1);
}

return NULL;
}

static void *clone_destroy(void *arg)
{
int s;

if((s=socket(AF_INET, SOCK_DGRAM, 0))<0)
err(1, "socket()");
while(1){
if(ioctl(s, SIOCIFDESTROY, )<0)
if(errno==EINVAL)
exit(1);
}

return NULL;
}

int main(int argc, char *argv[])
{
pthread_t thr;
int i;

if(argc!=2){
fprintf(stderr, "usage: %s ifname\n", getprogname());
return 1;
}

if(getuid()!=0){
fprintf(stderr, "should be root\n");
return 1;
}

memset(, 0, sizeof(ifr));
strlcpy(ifr.ifr_name, argv[1], sizeof(ifr.ifr_name));

for(i=0; i<8*4; ++i){
if(pthread_create(, NULL, clone_create, NULL)!=0)
errx(1, "pthread_create(clone_create)");
}

clone_destroy(NULL);

return 0;
}

 cut end 



Re: fix races in if_clone_create()

2020-06-29 Thread Hrvoje Popovski
On 29.6.2020. 10:59, Vitaliy Makkoveev wrote:
> I reworked tool for reproduce. Now I avoided fork()/exec() route and it
> takes couple of minutes to take panic on 4 cores. Also some screenshots
> attached.
> 
> I hope anyone else will try it.

Hi,

i'm getting panic quite fast :)
i will leave box in ddb if more information is needed

r620-1# ./a.out bridge0
panic: kernel diagnostic assertion "TAILQ_EMPTY(>if_addrhooks)"
failed: file "/sys/net/if.c", line 1168
Stopped at  db_enter+0x10:  popq%rbp
TIDPIDUID PRFLAGS PFLAGS  CPU  COMMAND
 475311   7753   1000 0x3  00  ifconfig
*128110   3280  0 0x3  01K a.out
  86419   3280  0 0x3  0x4004  a.out
 352360   3280  0 0x3  0x4003  a.out
 309715   3280  0 0x3  0x4005  a.out
 268210   3280  0 0x3  0x4002  a.out
db_enter() at db_enter+0x10
panic(81df42d3) at panic+0x128
__assert(81e5d55e,81e5b1fa,490,81e408d9) at
__assert+0x2b
if_detach(81169000) at if_detach+0x45f
bridge_clone_destroy(81169000) at bridge_clone_destroy+0x17b
ifioctl(fd839209c828,80206979,8000248fa980,800024902618) at
ifioctl+0x1c2
soo_ioctl(fd83b04b34c8,80206979,8000248fa980,800024902618)
at soo_ioctl+0x171
sys_ioctl(800024902618,8000248faa90,8000248faaf0) at
sys_ioctl+0x2df
syscall(8000248fab60) at syscall+0x389
Xsyscall() at Xsyscall+0x128
end of kernel
end trace frame: 0x7f7d3600, count: 5
https://www.openbsd.org/ddb.html describes the minimum info required in bug
reports.  Insufficient info makes it difficult to find and fix bugs.
ddb{1}>



Re: fix races in if_clone_create()

2020-06-27 Thread Vitaliy Makkoveev
On Sat, Jun 27, 2020 at 12:10:24PM +0200, Martin Pieuchot wrote:
> On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> > On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > > if_clone_create() has the races caused by context switch.
> > > 
> > > Can you share a backtrace of such race?  Where does the kernel panic?
> > >
> > 
> > This diff was inspired by thread [1]. As I explained [2] here is 3
> > issues that cause panics produced by command below:
> > 
> >  cut begin 
> > for i in 1 2 3; do while true; do ifconfig bridge0 create& \
> > ifconfig bridge0 destroy& done& done
> >  cut end 
> 
> Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
> managed to reproduce it with other pseudo-devices or just with bridge0?
> 

In thread [1] you talked about bridge(4), tun(4) and vether(4). A first
I fixed races in if_clone_destroy() and I caught the races with
if_clone_create() while I run your initial comman but with vether(4)

 cut begin 
for i in 0 1 2 3 4 5 6 7; do while true; \
do cat /dev/vether0& ifconfig vether0 destroy& done& done
 cut end 

It's hard to reproduce this issue. The best chances for me is bare metal
8 cores, fully unloaded system, no X, no active processes, test started
at console and all output redirected to /dev/null. And it can take
*hours* to catch. I can't reproduce this on 2 cores. I can't reproduce
this at 4 cores under kvm but it's reproducible under virtual box under
osx. The hosts has 8 cores. I can reproduce this on bare metal with 4
cores, but also it takes time.

Routine called by `ifc_create' within if_clone_attach() is very specific
to each pseudo interface. if_attach() is the only common point to sleep
for them, but you also can sleep in any point of sleep before
`ifc_create' will call if_attach(), For exmaple you will alloc software
context with `M_WAITOK'.

bridge(4) is just the best way to reproduce to me.

You have all `ifnet's linked to `if_list'. ifunit() does linear search
in this list by compare `ifp->if_xname' and given `name'. So if you
inserted many `ifnet's to this list ifunit() will return you first. but
if_get(9) doesn't work with this list. So if you have the case I talk
above if_get(9) and ifname() are inconsistent. Some times in the stack
you use if_get(9) sometimes you use ifunit() so you work every time with
diffetrent `ifnet's with the same `if_xname'. You can't predict where
`ifnet' will be corrupted.

> > My system was stable with the last diff I did for thread [1]. But since
> > this final diff [3] which include fixes for tun(4) is quick and dirty
> > and not for commit I decided to make the diff to fix the races caused by
> > if_clone_create() at first.
> > 
> > I included screenshot with panic.
> 
> Thanks, interesting that the corruption happens on a list that should be
> initialized.  Does that mean the context switch on Thread 1 is happening
> before if_attach_common() is called?
> 

I don't know where it was. if_attach() doesn't checks if `ifnet' with
the name in `if_xname' already linked. You will insert passed `ifnet' in
any cases. If you have more then one `ifnet' with identical `if_xname'
you have broken ifunit() and if_get() logic.

Look at if_attach():

 cut begin 
if_attach(struct ifnet *ifp)
{
if_attach_common(ifp);
NET_LOCK();
TAILQ_INSERT_TAIL(, ifp, if_list); /* (1) */
if_attachsetup(ifp);
NET_UNLOCK();
}

You link `ifp' at (1). And it's still your `ifp' before and after context
switch ot without context switch. You will brake it later. The reason is
pseudo driver received the same `unit' more than once. And it created
two or more software context with identical `unit'. And internal pseudo
driver's logic is broken. Also ifunit() and if_get(9) are inconsistent
now. You can break memory everythere.

 cut end 
> You said your previous email that there's a context switch.  Do you know
> when it happens?  You could see that in ddb by looking at the backtrace
> of the other CPU.
> 
> Is the context switch leading to the race common to all pseudo-drivers
> or is it in the bridge(4) driver?

ddb(4) is useless. The panic occured while we are trying to if_detach()
already broken `ifnet'. There is no reces here. But the rases was
*before* and we inserted two or more `ifnet's with the same name to
`if_list'. This insertion is no panic condition.

The first time I caught this races while I connected to you [1] thread.
I inserted ifunit() call to if_attach() as below and received panic so
I'am shure about the reason:

 cut begin 
if_attach(struct ifnet *ifp)
{
if_attach_common(ifp);
NET_LOCK();
KASSERT(ifunit(ifp->if_xname));
TAILQ_INSERT_TAIL(, ifp, if_list);
if_attachsetup(ifp);
NET_UNLOCK();
}
 cut end 

But in thread [1] you said these races with pseudo interfaces are very
old well know 

Re: fix races in if_clone_create()

2020-06-27 Thread Martin Pieuchot
On 27/06/20(Sat) 00:35, Vitaliy Makkoveev wrote:
> On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> > On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > > if_clone_create() has the races caused by context switch.
> > 
> > Can you share a backtrace of such race?  Where does the kernel panic?
> >
> 
> This diff was inspired by thread [1]. As I explained [2] here is 3
> issues that cause panics produced by command below:
> 
>  cut begin 
> for i in 1 2 3; do while true; do ifconfig bridge0 create& \
>   ifconfig bridge0 destroy& done& done
>  cut end 

Thanks, I couldn't reproduce it on any of the machines I tried.  Did you
managed to reproduce it with other pseudo-devices or just with bridge0?

> My system was stable with the last diff I did for thread [1]. But since
> this final diff [3] which include fixes for tun(4) is quick and dirty
> and not for commit I decided to make the diff to fix the races caused by
> if_clone_create() at first.
> 
> I included screenshot with panic.

Thanks, interesting that the corruption happens on a list that should be
initialized.  Does that mean the context switch on Thread 1 is happening
before if_attach_common() is called?

You said your previous email that there's a context switch.  Do you know
when it happens?  You could see that in ddb by looking at the backtrace
of the other CPU.

Is the context switch leading to the race common to all pseudo-drivers
or is it in the bridge(4) driver?

Regarding your solution, do I understand correctly that the goal is to
serialize all if_clone_create()?  Is it really needed to remember which
unit is being currently created or can't we just serialize all of them?

The fact that a lock is not held over the cloning operation is imho
positive.



Re: fix races in if_clone_create()

2020-06-26 Thread Vitaliy Makkoveev
On Fri, Jun 26, 2020 at 09:12:16PM +0200, Martin Pieuchot wrote:
> On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> > if_clone_create() has the races caused by context switch.
> 
> Can you share a backtrace of such race?  Where does the kernel panic?
>

This diff was inspired by thread [1]. As I explained [2] here is 3
issues that cause panics produced by command below:

 cut begin 
for i in 1 2 3; do while true; do ifconfig bridge0 create& \
ifconfig bridge0 destroy& done& done
 cut end 

My system was stable with the last diff I did for thread [1]. But since
this final diff [3] which include fixes for tun(4) is quick and dirty
and not for commit I decided to make the diff to fix the races caused by
if_clone_create() at first.

I included screenshot with panic.
Also the code to reproduce below:

 cut begin 
#!/bin/sh
while true; do ifconfig bridge0 create& done&
while true; do ifconfig bridge0 destroy done
 cut end 

1. https://marc.info/?t=15928959011=1=2
2. https://marc.info/?l=openbsd-tech=159307900124245=2
3. https://marc.info/?l=openbsd-tech=159308633126243=2


Re: fix races in if_clone_create()

2020-06-26 Thread Vitaliy Makkoveev
Sorry, I found a memory leak. Updated diff below.

Index: sys/net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.610
diff -u -p -r1.610 if.c
--- sys/net/if.c22 Jun 2020 09:45:13 -  1.610
+++ sys/net/if.c26 Jun 2020 17:19:41 -
@@ -157,6 +157,8 @@ voidif_linkstate_task(void *);
 
 intif_clone_list(struct if_clonereq *);
 struct if_clone*if_clone_lookup(const char *, int *);
+intif_clone_alloc_unit(struct if_clone *, int);
+void   if_clone_rele_unit(struct if_clone *, int);
 
 intif_group_egress_build(void);
 
@@ -1244,19 +1246,23 @@ if_clone_create(const char *name, int rd
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int unit, ret;
+   int unit, error;
 
ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
+   error = if_clone_alloc_unit(ifc, unit);
+   if (error != 0)
+   return (error);
 
-   if (ifunit(name) != NULL)
-   return (EEXIST);
-
-   ret = (*ifc->ifc_create)(ifc, unit);
+   if (ifunit(name) != NULL) {
+   error = (EEXIST);
+   goto rele;
+   }
 
-   if (ret != 0 || (ifp = ifunit(name)) == NULL)
-   return (ret);
+   error = (*ifc->ifc_create)(ifc, unit);
+   if (error != 0 || (ifp = ifunit(name)) == NULL)
+   return (0);
 
NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
@@ -1264,7 +1270,10 @@ if_clone_create(const char *name, int rd
if_setrdomain(ifp, rdomain);
NET_UNLOCK();
 
-   return (ret);
+   return (0);
+rele:
+   if_clone_rele_unit(ifc, unit);
+   return (error);
 }
 
 /*
@@ -1275,9 +1284,9 @@ if_clone_destroy(const char *name)
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int ret;
+   int ret, unit;
 
-   ifc = if_clone_lookup(name, NULL);
+   ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
 
@@ -1297,6 +1306,7 @@ if_clone_destroy(const char *name)
}
NET_UNLOCK();
ret = (*ifc->ifc_destroy)(ifp);
+   if_clone_rele_unit(ifc, unit);
 
return (ret);
 }
@@ -1342,12 +1352,95 @@ if_clone_lookup(const char *name, int *u
unit = (unit * 10) + (*cp++ - '0');
}
 
-   if (unitp != NULL)
-   *unitp = unit;
+   *unitp = unit;
return (ifc);
 }
 
 /*
+ * Allocate unit for cloned network interface.
+ */
+int if_clone_alloc_unit(struct if_clone *ifc, int unit)
+{
+   int word, bit, ret;
+
+   word = unit / (sizeof(*ifc->ifc_map) * 8);
+   bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+   rw_enter_write(>ifc_lock);
+
+   if(word >= ifc->ifc_map_size) {
+   u_long *map;
+   int size;
+
+   size = word + 1;
+   map = mallocarray(size, sizeof(*map), M_TEMP, M_WAITOK |
+   M_ZERO);
+
+   if (ifc->ifc_map != NULL) {
+   memcpy(map, ifc->ifc_map, ifc->ifc_map_size);
+   free(ifc->ifc_map, M_TEMP,
+   ifc->ifc_map_size * sizeof(*map));
+   }
+
+   ifc->ifc_map = map;
+   ifc->ifc_map_size = size;
+   }
+
+   if (ifc->ifc_map[word] & (1UL << bit))
+   ret = EEXIST;
+   else {
+   ifc->ifc_map[word] |= (1UL << bit);
+   ret = 0;
+   }
+
+   rw_exit_write(>ifc_lock);
+
+   return ret;
+}
+
+/*
+ * Release allocated unit for cloned network interface.
+ */
+void if_clone_rele_unit(struct if_clone *ifc, int unit)
+{
+   int word, bit;
+
+   word = unit / (sizeof(*ifc->ifc_map) * 8);
+   bit = unit % (sizeof(*ifc->ifc_map) * 8);
+
+   rw_enter_write(>ifc_lock);
+   KASSERT(word < ifc->ifc_map_size);
+
+   ifc->ifc_map[word] &= ~(1UL << bit);
+
+   if (ifc->ifc_map[word] == 0) {
+   u_long *map;
+   int size;
+
+   size = ifc->ifc_map_size - 2;
+   while (size>=0) {
+   if (ifc->ifc_map[size] != 0)
+   break;
+   --size;
+   }
+   if (size<0) {
+   size = 0;
+   map = NULL;
+   } else {
+   size += 1;
+   map = mallocarray(size, sizeof(*map), M_TEMP,
+   M_WAITOK);
+   memcpy(map, ifc->ifc_map, size);
+   }
+
+   free(ifc->ifc_map, M_TEMP, ifc->ifc_map_size * sizeof(*map));
+   ifc->ifc_map = map;
+   ifc->ifc_map_size = size;
+   }
+   rw_exit_write(>ifc_lock);
+}
+
+/*
  * Register a network interface cloner.
  */
 void
@@ -1360,6 +1453,7 @@ if_clone_attach(struct if_clone *ifc)
 * 

Re: fix races in if_clone_create()

2020-06-26 Thread Martin Pieuchot
On 26/06/20(Fri) 16:56, Vitaliy Makkoveev wrote:
> if_clone_create() has the races caused by context switch.

Can you share a backtrace of such race?  Where does the kernel panic?



fix races in if_clone_create()

2020-06-26 Thread Vitaliy Makkoveev
if_clone_create() has the races caused by context switch.

 cut begin 
if_clone_create(const char *name, int rdomain)
{
struct if_clone *ifc;
struct ifnet *ifp;
int unit, ret;

ifc = if_clone_lookup(name, ); /* [1] */
if (ifc == NULL)
return (EINVAL);

if (ifunit(name) != NULL) /* [2] */
return (EEXIST);

ret = (*ifc->ifc_create)(ifc, unit); /* [3] */

if (ret != 0 || (ifp = ifunit(name)) == NULL) /* [4] */
return (ret);

NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
if (rdomain != 0)
if_setrdomain(ifp, rdomain);
NET_UNLOCK();

return (ret);
}

 cut end 


The race is:

Thread 1:
1. We pass the string "ifacename0" with contains interface name and
it's unit to if_clone_lookup(). if_clone_lookup() extracts the unit '0'
and return it to us as integer `unit'. this `unit' is our local
variable, nobody knows what we are going to create interface with this
unit.

2. There is no "ifacename0" yet, so ifunit() will return NULL. 

3. We did some checks add call `ifc_create' which will call
pseudo-driver's callback to create instance with passed `unit'. We
initialize ifnet with our softwere context and the `if_xname' set to
"ifacename0".

3.1. We do if_attach() which call NET_LOCK() before we link `ifnet' to
`if_list'. Also if_attach() doesn't check anything so we always attach
passed `ifnet'.

Context was switched to Thread 2:

1. We also passed string "ifacename0" to if_clone_lookup() and received
`unit' with value `0'

2. There is no "ifacename0" yet, so ifunit() will return NULL.

3. All checks done, we call `ifc_create'. We initialize out softwere
context with passed `unit', Who knows do we chech is software context
for `unit' already exists before? In fact we don't :)
3.1 We initialized `ifnet' with the `if_xname' set to "ifacename0" and
we link it `if_list'.

4. This check is passed, `ifc_create' returned `0' and we have
"ifacename0" linked so ifunit() will return `ifp'.

We returned to userland with success and switched to Thread 1:

We continue 3.1. if_attach() continue his work and we have another
`ifnet' with `if_xname' set to "ifacename0" in list.

Now we have inconsistent `if_list'.

4. We do these checks. ifunit() will return pointer to `ifp' with
requested "ifacename0". All ok we return to userland with success.

Diff below fixes this reces.

Each `struct if_clone' has the bitmap where requested `unit' marked as
busy if it was not allocated before. We have new function
if_clone_alloc_unit() for. If `unit' already was allocated
if_clone_alloc_unit() will return EEXIST. Also we have
if_clone_rele_unit() to release `unit'. We do unit allocation before we
do context switch so now we can't allocate the same unit twice or more.

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.610
diff -u -p -r1.610 if.c
--- net/if.c22 Jun 2020 09:45:13 -  1.610
+++ net/if.c26 Jun 2020 13:49:08 -
@@ -157,6 +157,8 @@ voidif_linkstate_task(void *);
 
 intif_clone_list(struct if_clonereq *);
 struct if_clone*if_clone_lookup(const char *, int *);
+intif_clone_alloc_unit(struct if_clone *, int);
+void   if_clone_rele_unit(struct if_clone *, int);
 
 intif_group_egress_build(void);
 
@@ -1244,19 +1246,23 @@ if_clone_create(const char *name, int rd
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int unit, ret;
+   int unit, error;
 
ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
+   error = if_clone_alloc_unit(ifc, unit);
+   if (error != 0)
+   return (error);
 
-   if (ifunit(name) != NULL)
-   return (EEXIST);
-
-   ret = (*ifc->ifc_create)(ifc, unit);
+   if (ifunit(name) != NULL) {
+   error = (EEXIST);
+   goto rele;
+   }
 
-   if (ret != 0 || (ifp = ifunit(name)) == NULL)
-   return (ret);
+   error = (*ifc->ifc_create)(ifc, unit);
+   if (error != 0 || (ifp = ifunit(name)) == NULL)
+   return (0);
 
NET_LOCK();
if_addgroup(ifp, ifc->ifc_name);
@@ -1264,7 +1270,10 @@ if_clone_create(const char *name, int rd
if_setrdomain(ifp, rdomain);
NET_UNLOCK();
 
-   return (ret);
+   return (0);
+rele:
+   if_clone_rele_unit(ifc, unit);
+   return (error);
 }
 
 /*
@@ -1275,9 +1284,9 @@ if_clone_destroy(const char *name)
 {
struct if_clone *ifc;
struct ifnet *ifp;
-   int ret;
+   int ret, unit;
 
-   ifc = if_clone_lookup(name, NULL);
+   ifc = if_clone_lookup(name, );
if (ifc == NULL)
return (EINVAL);
 
@@ -1297,6 +1306,7 @@ if_clone_destroy(const char *name)
}
NET_UNLOCK();
ret =