Re: iked: load explicit flows for ipip/ipcomp

2017-11-08 Thread Markus Friedl
ok

On Sun, Nov 05, 2017 at 10:39:18PM +0100, Patrick Wildt wrote:
> Hi,
> 
> for IPcomp we need to load explicit ESP-flows for the IPIP or IPCOMP
> tunneled packets, otherwise every packet between the gateways will
> be sent into the tunnel (e.g. ICMP, too).
> 
> ok?
> 
> Patrick
> 
> diff --git a/sbin/iked/ikev2.c b/sbin/iked/ikev2.c
> index 706f9ebbe1d..cacfe690008 100644
> --- a/sbin/iked/ikev2.c
> +++ b/sbin/iked/ikev2.c
> @@ -4942,16 +4942,21 @@ ikev2_ipcomp_enable(struct iked *env, struct iked_sa 
> *sa)
>  {
>   struct iked_childsa *other, *nother, *csa = NULL, *csb = NULL;
>   struct iked_flow*flow, *flowa = NULL, *flowb = NULL;
> + struct iked_flow*flowc = NULL, *flowd = NULL;
>   struct iked_flow*nflow, *oflow;
>  
>   if ((csa = calloc(1, sizeof(*csa))) == NULL ||
>   (csb = calloc(1, sizeof(*csb))) == NULL ||
>   (flowa = calloc(1, sizeof(*flowa))) == NULL ||
> - (flowb = calloc(1, sizeof(*flowb))) == NULL) {
> + (flowb = calloc(1, sizeof(*flowb))) == NULL ||
> + (flowc = calloc(1, sizeof(*flowc))) == NULL ||
> + (flowd = calloc(1, sizeof(*flowd))) == NULL) {
>   free(csa);
>   free(csb);
>   free(flowa);
>   free(flowb);
> + free(flowc);
> + free(flowd);
>   return (-1);
>   }
>  
> @@ -5039,8 +5044,9 @@ ikev2_ipcomp_enable(struct iked *env, struct iked_sa 
> *sa)
>   }
>   }
>  
> - /* setup ESP flows for gateways */
> + /* setup ESP flows for gateways (IPCOMP) */
>   flowa->flow_ipcomp = 1;
> + flowa->flow_ipproto = IPPROTO_IPCOMP;
>   flowa->flow_dir = IPSP_DIRECTION_OUT;
>   flowa->flow_saproto = IKEV2_SAPROTO_ESP;
>   flowa->flow_local = &sa->sa_local;
> @@ -5054,22 +5060,36 @@ ikev2_ipcomp_enable(struct iked *env, struct iked_sa 
> *sa)
>   (sa->sa_local.addr_af == AF_INET) ? 32 : 128;
>   flowa->flow_ikesa = sa;
>  
> - /* skip if flow already exists */
> + /* matching incoming flow */
> + memcpy(flowb, flowa, sizeof(*flowb));
> + flowb->flow_dir = IPSP_DIRECTION_IN;
> + memcpy(&flowb->flow_dst, &flowa->flow_src, sizeof(flowa->flow_src));
> + memcpy(&flowb->flow_src, &flowa->flow_dst, sizeof(flowa->flow_dst));
> +
> + /* setup ESP flows for gateways (IPIP) */
> + memcpy(flowc, flowa, sizeof(*flowc));
> + flowc->flow_ipproto = IPPROTO_IPIP;
> +
> + /* matching incoming flow */
> + memcpy(flowd, flowb, sizeof(*flowd));
> + flowd->flow_ipproto = IPPROTO_IPIP;
> +
> + /* skip if flows already exists */
>   TAILQ_FOREACH(flow, &sa->sa_flows, flow_entry) {
> - if (flow_equal(flow, flowa)) {
> + if (flow_equal(flow, flowa) || flow_equal(flow, flowb) ||
> + flow_equal(flow, flowc) || flow_equal(flow, flowd)) {
>   free(flowa);
>   free(flowb);
> + free(flowc);
> + free(flowd);
>   goto done;
>   }
>   }
>  
> - memcpy(flowb, flowa, sizeof(*flowb));
> - flowb->flow_dir = IPSP_DIRECTION_IN;
> - memcpy(&flowb->flow_dst, &flowa->flow_src, sizeof(flowa->flow_src));
> - memcpy(&flowb->flow_src, &flowa->flow_dst, sizeof(flowa->flow_dst));
> -
>   TAILQ_INSERT_TAIL(&sa->sa_flows, flowa, flow_entry);
>   TAILQ_INSERT_TAIL(&sa->sa_flows, flowb, flow_entry);
> + TAILQ_INSERT_TAIL(&sa->sa_flows, flowc, flow_entry);
> + TAILQ_INSERT_TAIL(&sa->sa_flows, flowd, flow_entry);
>  
>   done:
>   /* make sure IPCOMP CPIs are not reused */

EOF



Introduce NET_RLOCK()

2017-11-08 Thread Martin Pieuchot
We're at the stage where we want to run multiple parts of the Network
Stack in parallel.  sashan@ is trying to get multiple network threads
running to exercise parallelism in pf_test() and tb@ is trying to push
the NET_LOCK() further down in ioctl(2) path.

However we want to be able to gradually select which code paths can be
executed in parallel as the rest of the stack.  So the way to move
forward is to introduce a read version of the NET_LOCK(), NET_RLOCK()
and change code paths on a case-by-case basis.

Diff below does that for the input processing path.  Note that the 3
tasks below cannot run in parallel because they are scheduled on the
same taskq.

Regarding assert macros, NET_ASSERT_LOCK() now required either a read
or write lock.  I also introduced a NET_ASSERT_WLOCK() to put in paths
that are not ready to be run in parallel of the rest of the stack.

Once that's in we can slowly convert existing NET_LOCK() to NET_WLOCK()
to be explicit.

Ok?

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.523
diff -u -p -r1.523 if.c
--- net/if.c4 Nov 2017 16:58:46 -   1.523
+++ net/if.c8 Nov 2017 09:01:12 -
@@ -905,7 +905,7 @@ if_input_process(void *xifidx)
 * to PF globals, pipex globals, unicast and multicast addresses
 * lists.
 */
-   NET_LOCK();
+   NET_RLOCK();
s = splnet();
while ((m = ml_dequeue(&ml)) != NULL) {
/*
@@ -922,7 +922,7 @@ if_input_process(void *xifidx)
m_freem(m);
}
splx(s);
-   NET_UNLOCK();
+   NET_RUNLOCK();
 out:
if_put(ifp);
 }
Index: netinet/ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.329
diff -u -p -r1.329 ip_input.c
--- netinet/ip_input.c  5 Nov 2017 13:19:59 -   1.329
+++ netinet/ip_input.c  8 Nov 2017 09:06:57 -
@@ -1814,11 +1814,11 @@ ip_send_dispatch(void *xmq)
if (ml_empty(&ml))
return;
 
-   NET_LOCK();
+   NET_RLOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
}
-   NET_UNLOCK();
+   NET_RUNLOCK();
 }
 
 void
Index: netinet6/ip6_input.c
===
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.207
diff -u -p -r1.207 ip6_input.c
--- netinet6/ip6_input.c1 Nov 2017 06:35:38 -   1.207
+++ netinet6/ip6_input.c8 Nov 2017 09:07:16 -
@@ -1459,7 +1459,7 @@ ip6_send_dispatch(void *xmq)
if (ml_empty(&ml))
return;
 
-   NET_LOCK();
+   NET_RLOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
/*
 * To avoid a "too big" situation at an intermediate router and
@@ -1470,7 +1470,7 @@ ip6_send_dispatch(void *xmq)
 */
ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
}
-   NET_UNLOCK();
+   NET_RUNLOCK();
 }
 
 void
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.133
diff -u -p -r1.133 systm.h
--- sys/systm.h 11 Aug 2017 21:24:20 -  1.133
+++ sys/systm.h 8 Nov 2017 09:21:51 -
@@ -296,26 +296,34 @@ int   uiomove(void *, size_t, struct uio *
 
 extern struct rwlock netlock;
 
-#defineNET_LOCK()  
\
-do {   \
-   rw_enter_write(&netlock);   \
-} while (0)
+#define NET_LOCK() NET_WLOCK()
+#define NET_UNLOCK()   NET_WUNLOCK()
 
-#defineNET_UNLOCK()
\
+#defineNET_WLOCK() do { rw_enter_write(&netlock); } while (0)
+#defineNET_WUNLOCK()   do { rw_exit_write(&netlock); } while (0)
+
+#defineNET_ASSERT_WLOCKED()
\
 do {   \
-   rw_exit_write(&netlock);\
+   int _s = rw_status(&netlock);   \
+   if (_s != RW_WRITE) \
+   splassert_fail(RW_WRITE, _s, __func__); \
 } while (0)
 
+#defineNET_RLOCK() do { rw_enter_read(&netlock); } while (0)
+#defineNET_RUNLOCK()   do { rw_exit_read(&netlock); } while (0)
+
 #defineNET_ASSERT_LOCKED() 
\
 do {   \
-   if (rw_status(&netlock) != RW_WRITE)\
-   splassert_fail(RW_WRITE, rw_status(&netlock), __f

2 network threads & IPsec

2017-11-08 Thread Martin Pieuchot
We're experimenting with 2 threads processing input packets.  That's
good enough to improve forwarding performance and finally run pf_test()
in parallel.  However IPsec is not ready for that.  Until somebody takes
care of IPsec here's the solution: stay with a single thread.

Diff below implements this hack.  Like for the previous release, as soon
as you enable IPsec all your traffic gets punished.

This is a no-op for the moment since we only have a single network
taskq, but it allows us to experiment with the PF_LOCK() and start
splitting the use upcoming diff.

While here I renamed the taskq to reflect the fact that it doesn't
grab the KERNEL_LOCK() by default, following the ``systqmq'' notation. 

ok?

Index: net/pfkeyv2.c
===
RCS file: /cvs/src/sys/net/pfkeyv2.c,v
retrieving revision 1.172
diff -u -p -r1.172 pfkeyv2.c
--- net/pfkeyv2.c   3 Nov 2017 16:23:20 -   1.172
+++ net/pfkeyv2.c   8 Nov 2017 09:54:13 -
@@ -1767,6 +1767,14 @@ pfkeyv2_send(struct socket *so, void *me
}
TAILQ_INSERT_HEAD(&ipsec_policy_head, ipo, ipo_list);
ipsec_in_use++;
+   /*
+* XXXSMP IPsec data structures are not ready to be
+* accessed by multiple Network threads in parallel,
+* so force all packets to be processed by the first
+* one.
+*/
+   extern int nettaskqs;
+   nettaskqs = 1;
} else {
ipo->ipo_last_searched = ipo->ipo_flags = 0;
}
Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.523
diff -u -p -r1.523 if.c
--- net/if.c4 Nov 2017 16:58:46 -   1.523
+++ net/if.c8 Nov 2017 09:50:39 -
@@ -227,8 +227,8 @@ int ifq_congestion;
 
 int netisr;
 
-#defineSOFTNET_TASKS   1
-struct taskq   *softnettq[SOFTNET_TASKS];
+#defineNET_TASKQ   1
+struct taskq   *nettqmp[NET_TASKQ];
 
 struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
 
@@ -254,10 +254,10 @@ ifinit(void)
 
timeout_set(&net_tick_to, net_tick, &net_tick_to);
 
-   for (i = 0; i < SOFTNET_TASKS; i++) {
-   softnettq[i] = taskq_create("softnet", 1, IPL_NET, 
TASKQ_MPSAFE);
-   if (softnettq[i] == NULL)
-   panic("unable to create softnet taskq");
+   for (i = 0; i < NET_TASKQ; i++) {
+   nettqmp[i] = taskq_create("softnet", 1, IPL_NET, TASKQ_MPSAFE);
+   if (nettqmp[i] == NULL)
+   panic("unable to create network taskq %d", i);
}
 
net_tick(&net_tick_to);
@@ -2924,12 +2924,19 @@ unhandled_af(int af)
panic("unhandled af %d", af);
 }
 
+/*
+ * XXXSMP This tunable is here to work around the fact that IPsec
+ * globals aren't ready to be accessed by multiple threads in
+ * parallel.
+ */
+int nettaskqs = NET_TASKQ;
+
 struct taskq *
 net_tq(unsigned int ifindex)
 {
struct taskq *t = NULL;
 
-   t = softnettq[ifindex % SOFTNET_TASKS];
+   t = nettqmp[ifindex % nettaskqs];
 
return (t);
 }



Re: Introduce NET_RLOCK()

2017-11-08 Thread Martin Pieuchot
On 08/11/17(Wed) 10:37, Martin Pieuchot wrote:
> We're at the stage where we want to run multiple parts of the Network
> Stack in parallel.  sashan@ is trying to get multiple network threads
> running to exercise parallelism in pf_test() and tb@ is trying to push
> the NET_LOCK() further down in ioctl(2) path.
> 
> However we want to be able to gradually select which code paths can be
> executed in parallel as the rest of the stack.  So the way to move
> forward is to introduce a read version of the NET_LOCK(), NET_RLOCK()
> and change code paths on a case-by-case basis.
> 
> Diff below does that for the input processing path.  Note that the 3
> tasks below cannot run in parallel because they are scheduled on the
> same taskq.
> 
> Regarding assert macros, NET_ASSERT_LOCK() now required either a read
> or write lock.  I also introduced a NET_ASSERT_WLOCK() to put in paths
> that are not ready to be run in parallel of the rest of the stack.
> 
> Once that's in we can slowly convert existing NET_LOCK() to NET_WLOCK()
> to be explicit.

I got bitten by rw_status() once again.  There's no way to tell that the
current lock is not holding a read lock.  But that's ok since what we
really need is to assert that we're not holding a write lock.

Fixed diff below, ok?

Index: net/if.c
===
RCS file: /cvs/src/sys/net/if.c,v
retrieving revision 1.523
diff -u -p -r1.523 if.c
--- net/if.c4 Nov 2017 16:58:46 -   1.523
+++ net/if.c8 Nov 2017 10:11:13 -
@@ -905,7 +905,7 @@ if_input_process(void *xifidx)
 * to PF globals, pipex globals, unicast and multicast addresses
 * lists.
 */
-   NET_LOCK();
+   NET_RLOCK();
s = splnet();
while ((m = ml_dequeue(&ml)) != NULL) {
/*
@@ -922,7 +922,7 @@ if_input_process(void *xifidx)
m_freem(m);
}
splx(s);
-   NET_UNLOCK();
+   NET_RUNLOCK();
 out:
if_put(ifp);
 }
Index: netinet/ip_input.c
===
RCS file: /cvs/src/sys/netinet/ip_input.c,v
retrieving revision 1.329
diff -u -p -r1.329 ip_input.c
--- netinet/ip_input.c  5 Nov 2017 13:19:59 -   1.329
+++ netinet/ip_input.c  8 Nov 2017 10:03:54 -
@@ -1814,11 +1814,11 @@ ip_send_dispatch(void *xmq)
if (ml_empty(&ml))
return;
 
-   NET_LOCK();
+   NET_RLOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
}
-   NET_UNLOCK();
+   NET_RUNLOCK();
 }
 
 void
Index: netinet6/ip6_input.c
===
RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
retrieving revision 1.207
diff -u -p -r1.207 ip6_input.c
--- netinet6/ip6_input.c1 Nov 2017 06:35:38 -   1.207
+++ netinet6/ip6_input.c8 Nov 2017 10:03:54 -
@@ -1459,7 +1459,7 @@ ip6_send_dispatch(void *xmq)
if (ml_empty(&ml))
return;
 
-   NET_LOCK();
+   NET_RLOCK();
while ((m = ml_dequeue(&ml)) != NULL) {
/*
 * To avoid a "too big" situation at an intermediate router and
@@ -1470,7 +1470,7 @@ ip6_send_dispatch(void *xmq)
 */
ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
}
-   NET_UNLOCK();
+   NET_RUNLOCK();
 }
 
 void
Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.133
diff -u -p -r1.133 systm.h
--- sys/systm.h 11 Aug 2017 21:24:20 -  1.133
+++ sys/systm.h 8 Nov 2017 10:13:42 -
@@ -296,26 +296,36 @@ int   uiomove(void *, size_t, struct uio *
 
 extern struct rwlock netlock;
 
-#defineNET_LOCK()  
\
-do {   \
-   rw_enter_write(&netlock);   \
-} while (0)
+#defineNET_LOCK()  NET_WLOCK()
+#defineNET_UNLOCK()NET_WUNLOCK()
+#defineNET_ASSERT_UNLOCKED()   NET_ASSERT_WUNLOCKED()
+
+
+#defineNET_WLOCK() do { rw_enter_write(&netlock); } while (0)
+#defineNET_WUNLOCK()   do { rw_exit_write(&netlock); } while (0)
 
-#defineNET_UNLOCK()
\
+#defineNET_ASSERT_WLOCKED()
\
 do {   \
-   rw_exit_write(&netlock);\
+   int _s = rw_status(&netlock);   \
+   if (_s != RW_WRITE) \
+   splassert_fail(RW_WRITE, _s, __func__); \
 } while (0)
 
-#defineNET_ASSERT_LOCKED()  

Re: Introduce NET_RLOCK()

2017-11-08 Thread Mark Kettenis
> Date: Wed, 8 Nov 2017 10:37:02 +0100
> From: Martin Pieuchot 
> 
> We're at the stage where we want to run multiple parts of the Network
> Stack in parallel.  sashan@ is trying to get multiple network threads
> running to exercise parallelism in pf_test() and tb@ is trying to push
> the NET_LOCK() further down in ioctl(2) path.
> 
> However we want to be able to gradually select which code paths can be
> executed in parallel as the rest of the stack.  So the way to move
> forward is to introduce a read version of the NET_LOCK(), NET_RLOCK()
> and change code paths on a case-by-case basis.
> 
> Diff below does that for the input processing path.  Note that the 3
> tasks below cannot run in parallel because they are scheduled on the
> same taskq.
> 
> Regarding assert macros, NET_ASSERT_LOCK() now required either a read
> or write lock.  I also introduced a NET_ASSERT_WLOCK() to put in paths
> that are not ready to be run in parallel of the rest of the stack.
> 
> Once that's in we can slowly convert existing NET_LOCK() to NET_WLOCK()
> to be explicit.
> 
> Ok?

makes sense to me

> Index: net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.523
> diff -u -p -r1.523 if.c
> --- net/if.c  4 Nov 2017 16:58:46 -   1.523
> +++ net/if.c  8 Nov 2017 09:01:12 -
> @@ -905,7 +905,7 @@ if_input_process(void *xifidx)
>* to PF globals, pipex globals, unicast and multicast addresses
>* lists.
>*/
> - NET_LOCK();
> + NET_RLOCK();
>   s = splnet();
>   while ((m = ml_dequeue(&ml)) != NULL) {
>   /*
> @@ -922,7 +922,7 @@ if_input_process(void *xifidx)
>   m_freem(m);
>   }
>   splx(s);
> - NET_UNLOCK();
> + NET_RUNLOCK();
>  out:
>   if_put(ifp);
>  }
> Index: netinet/ip_input.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.329
> diff -u -p -r1.329 ip_input.c
> --- netinet/ip_input.c5 Nov 2017 13:19:59 -   1.329
> +++ netinet/ip_input.c8 Nov 2017 09:06:57 -
> @@ -1814,11 +1814,11 @@ ip_send_dispatch(void *xmq)
>   if (ml_empty(&ml))
>   return;
>  
> - NET_LOCK();
> + NET_RLOCK();
>   while ((m = ml_dequeue(&ml)) != NULL) {
>   ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
>   }
> - NET_UNLOCK();
> + NET_RUNLOCK();
>  }
>  
>  void
> Index: netinet6/ip6_input.c
> ===
> RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.207
> diff -u -p -r1.207 ip6_input.c
> --- netinet6/ip6_input.c  1 Nov 2017 06:35:38 -   1.207
> +++ netinet6/ip6_input.c  8 Nov 2017 09:07:16 -
> @@ -1459,7 +1459,7 @@ ip6_send_dispatch(void *xmq)
>   if (ml_empty(&ml))
>   return;
>  
> - NET_LOCK();
> + NET_RLOCK();
>   while ((m = ml_dequeue(&ml)) != NULL) {
>   /*
>* To avoid a "too big" situation at an intermediate router and
> @@ -1470,7 +1470,7 @@ ip6_send_dispatch(void *xmq)
>*/
>   ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
>   }
> - NET_UNLOCK();
> + NET_RUNLOCK();
>  }
>  
>  void
> Index: sys/systm.h
> ===
> RCS file: /cvs/src/sys/sys/systm.h,v
> retrieving revision 1.133
> diff -u -p -r1.133 systm.h
> --- sys/systm.h   11 Aug 2017 21:24:20 -  1.133
> +++ sys/systm.h   8 Nov 2017 09:21:51 -
> @@ -296,26 +296,34 @@ int uiomove(void *, size_t, struct uio *
>  
>  extern struct rwlock netlock;
>  
> -#define  NET_LOCK()  
> \
> -do { \
> - rw_enter_write(&netlock);   \
> -} while (0)
> +#define NET_LOCK()   NET_WLOCK()
> +#define NET_UNLOCK() NET_WUNLOCK()
>  
> -#define  NET_UNLOCK()
> \
> +#define  NET_WLOCK() do { rw_enter_write(&netlock); } while (0)
> +#define  NET_WUNLOCK()   do { rw_exit_write(&netlock); } while (0)
> +
> +#define  NET_ASSERT_WLOCKED()
> \
>  do { \
> - rw_exit_write(&netlock);\
> + int _s = rw_status(&netlock);   \
> + if (_s != RW_WRITE) \
> + splassert_fail(RW_WRITE, _s, __func__); \
>  } while (0)
>  
> +#define  NET_RLOCK() do { rw_enter_read(&netlock); } while (0)
> +#define  NET_RUNLOCK()   do { rw_exit_read(&netlock); } while (0)
> +
>  #define  NET_ASSERT_LOCKED() 

Re: Introduce NET_RLOCK()

2017-11-08 Thread Saša Nedvědický
Hello,

I think the recent diff should go in so I can pick it up to my tree.
It's the same what I have in my net/systm.h

What I also found that in some cases, we are going to
grab KERNEL_LOCK(), while running in in_input_process().
this typically happens when we deal with multicast forwarding
in ip_input_if():
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index c02e95de474..a35b3d6e533 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -432,9 +432,12 @@ ip_input_if(struct mbuf **mp, int *offp, int nxt,
int af, struct ifnet *ifp)
 * as expected when ip_mforward() is called from
 * ip_output().)
 */
+   NET_RUNLOCK();
+   NET_LOCK();
KERNEL_LOCK();
error = ip_mforward(m, ifp);
KERNEL_UNLOCK();
+   NET_DOWNGRADE();
if (error) {
ipstat_inc(ips_cantforward);
goto bad;


I deliberately keep NET_RUNLOCK()/NET_LOCK() as separate operations,
while the reverse one is joined to NET_DOWNGRADE(). The piece above
comes from larger diff. Which I hope to send later today. I'll become off-line
then and will be back on Monday 13th.

O.K. sashan@



2017-11-08 11:50 GMT+01:00 Martin Pieuchot :
> On 08/11/17(Wed) 10:37, Martin Pieuchot wrote:
>> We're at the stage where we want to run multiple parts of the Network
>> Stack in parallel.  sashan@ is trying to get multiple network threads
>> running to exercise parallelism in pf_test() and tb@ is trying to push
>> the NET_LOCK() further down in ioctl(2) path.
>>
>> However we want to be able to gradually select which code paths can be
>> executed in parallel as the rest of the stack.  So the way to move
>> forward is to introduce a read version of the NET_LOCK(), NET_RLOCK()
>> and change code paths on a case-by-case basis.
>>
>> Diff below does that for the input processing path.  Note that the 3
>> tasks below cannot run in parallel because they are scheduled on the
>> same taskq.
>>
>> Regarding assert macros, NET_ASSERT_LOCK() now required either a read
>> or write lock.  I also introduced a NET_ASSERT_WLOCK() to put in paths
>> that are not ready to be run in parallel of the rest of the stack.
>>
>> Once that's in we can slowly convert existing NET_LOCK() to NET_WLOCK()
>> to be explicit.
>
> I got bitten by rw_status() once again.  There's no way to tell that the
> current lock is not holding a read lock.  But that's ok since what we
> really need is to assert that we're not holding a write lock.
>
> Fixed diff below, ok?
>
> Index: net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.523
> diff -u -p -r1.523 if.c
> --- net/if.c4 Nov 2017 16:58:46 -   1.523
> +++ net/if.c8 Nov 2017 10:11:13 -
> @@ -905,7 +905,7 @@ if_input_process(void *xifidx)
>  * to PF globals, pipex globals, unicast and multicast addresses
>  * lists.
>  */
> -   NET_LOCK();
> +   NET_RLOCK();
> s = splnet();
> while ((m = ml_dequeue(&ml)) != NULL) {
> /*
> @@ -922,7 +922,7 @@ if_input_process(void *xifidx)
> m_freem(m);
> }
> splx(s);
> -   NET_UNLOCK();
> +   NET_RUNLOCK();
>  out:
> if_put(ifp);
>  }
> Index: netinet/ip_input.c
> ===
> RCS file: /cvs/src/sys/netinet/ip_input.c,v
> retrieving revision 1.329
> diff -u -p -r1.329 ip_input.c
> --- netinet/ip_input.c  5 Nov 2017 13:19:59 -   1.329
> +++ netinet/ip_input.c  8 Nov 2017 10:03:54 -
> @@ -1814,11 +1814,11 @@ ip_send_dispatch(void *xmq)
> if (ml_empty(&ml))
> return;
>
> -   NET_LOCK();
> +   NET_RLOCK();
> while ((m = ml_dequeue(&ml)) != NULL) {
> ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
> }
> -   NET_UNLOCK();
> +   NET_RUNLOCK();
>  }
>
>  void
> Index: netinet6/ip6_input.c
> ===
> RCS file: /cvs/src/sys/netinet6/ip6_input.c,v
> retrieving revision 1.207
> diff -u -p -r1.207 ip6_input.c
> --- netinet6/ip6_input.c1 Nov 2017 06:35:38 -   1.207
> +++ netinet6/ip6_input.c8 Nov 2017 10:03:54 -
> @@ -1459,7 +1459,7 @@ ip6_send_dispatch(void *xmq)
> if (ml_empty(&ml))
> return;
>
> -   NET_LOCK();
> +   NET_RLOCK();
> while ((m = ml_dequeue(&ml)) != NULL) {
> /*
>  * To avoid a "too big" situation at an intermediate router 
> and
> @@ -1470,7 +1470,7 @@ ip6_send_dispatch(void *xmq)
>  */
> ip6_output(m, NULL, NULL, IPV6_MINMTU, NULL, NULL);
> }
> -   NET_

Re: 2 network threads & IPsec

2017-11-08 Thread Saša Nedvědický
Hello,

I'm all-in your diff gets me un-stuck.

O.K. sashan

2017-11-08 11:02 GMT+01:00 Martin Pieuchot :
> We're experimenting with 2 threads processing input packets.  That's
> good enough to improve forwarding performance and finally run pf_test()
> in parallel.  However IPsec is not ready for that.  Until somebody takes
> care of IPsec here's the solution: stay with a single thread.
>
> Diff below implements this hack.  Like for the previous release, as soon
> as you enable IPsec all your traffic gets punished.
>
> This is a no-op for the moment since we only have a single network
> taskq, but it allows us to experiment with the PF_LOCK() and start
> splitting the use upcoming diff.
>
> While here I renamed the taskq to reflect the fact that it doesn't
> grab the KERNEL_LOCK() by default, following the ``systqmq'' notation.
>
> ok?
>
> Index: net/pfkeyv2.c
> ===
> RCS file: /cvs/src/sys/net/pfkeyv2.c,v
> retrieving revision 1.172
> diff -u -p -r1.172 pfkeyv2.c
> --- net/pfkeyv2.c   3 Nov 2017 16:23:20 -   1.172
> +++ net/pfkeyv2.c   8 Nov 2017 09:54:13 -
> @@ -1767,6 +1767,14 @@ pfkeyv2_send(struct socket *so, void *me
> }
> TAILQ_INSERT_HEAD(&ipsec_policy_head, ipo, ipo_list);
> ipsec_in_use++;
> +   /*
> +* XXXSMP IPsec data structures are not ready to be
> +* accessed by multiple Network threads in parallel,
> +* so force all packets to be processed by the first
> +* one.
> +*/
> +   extern int nettaskqs;
> +   nettaskqs = 1;
> } else {
> ipo->ipo_last_searched = ipo->ipo_flags = 0;
> }
> Index: net/if.c
> ===
> RCS file: /cvs/src/sys/net/if.c,v
> retrieving revision 1.523
> diff -u -p -r1.523 if.c
> --- net/if.c4 Nov 2017 16:58:46 -   1.523
> +++ net/if.c8 Nov 2017 09:50:39 -
> @@ -227,8 +227,8 @@ int ifq_congestion;
>
>  int netisr;
>
> -#defineSOFTNET_TASKS   1
> -struct taskq   *softnettq[SOFTNET_TASKS];
> +#defineNET_TASKQ   1
> +struct taskq   *nettqmp[NET_TASKQ];
>
>  struct task if_input_task_locked = TASK_INITIALIZER(if_netisr, NULL);
>
> @@ -254,10 +254,10 @@ ifinit(void)
>
> timeout_set(&net_tick_to, net_tick, &net_tick_to);
>
> -   for (i = 0; i < SOFTNET_TASKS; i++) {
> -   softnettq[i] = taskq_create("softnet", 1, IPL_NET, 
> TASKQ_MPSAFE);
> -   if (softnettq[i] == NULL)
> -   panic("unable to create softnet taskq");
> +   for (i = 0; i < NET_TASKQ; i++) {
> +   nettqmp[i] = taskq_create("softnet", 1, IPL_NET, 
> TASKQ_MPSAFE);
> +   if (nettqmp[i] == NULL)
> +   panic("unable to create network taskq %d", i);
> }
>
> net_tick(&net_tick_to);
> @@ -2924,12 +2924,19 @@ unhandled_af(int af)
> panic("unhandled af %d", af);
>  }
>
> +/*
> + * XXXSMP This tunable is here to work around the fact that IPsec
> + * globals aren't ready to be accessed by multiple threads in
> + * parallel.
> + */
> +int nettaskqs = NET_TASKQ;
> +
>  struct taskq *
>  net_tq(unsigned int ifindex)
>  {
> struct taskq *t = NULL;
>
> -   t = softnettq[ifindex % SOFTNET_TASKS];
> +   t = nettqmp[ifindex % nettaskqs];
>
> return (t);
>  }



Re: Introduce NET_RLOCK()

2017-11-08 Thread Martin Pieuchot
On 08/11/17(Wed) 12:22, Saša Nedvědický wrote:
> Hello,
> 
> I think the recent diff should go in so I can pick it up to my tree.
> It's the same what I have in my net/systm.h
> 
> What I also found that in some cases, we are going to
> grab KERNEL_LOCK(), while running in in_input_process().
> this typically happens when we deal with multicast forwarding
> in ip_input_if():

This is a deliberate change and it is perfectly ok.  In that case the 
KERNEL_LOCK() is just an indication that nobody did the work to say if
the underlying code is safe to be executed without it.

In other words, the NET_LOCK() does not protect any of the global
multicast data structures and I'd like to keep it that way.  Otherwise
it becomes more difficult to shrink the NET_LOCK().

However if somebody wants to remove the KERNEL_LOCK() from this code
path, it'd be great.

> diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
> index c02e95de474..a35b3d6e533 100644
> --- a/sys/netinet/ip_input.c
> +++ b/sys/netinet/ip_input.c
> @@ -432,9 +432,12 @@ ip_input_if(struct mbuf **mp, int *offp, int nxt,
> int af, struct ifnet *ifp)
>  * as expected when ip_mforward() is called from
>  * ip_output().)
>  */
> +   NET_RUNLOCK();
> +   NET_LOCK();
> KERNEL_LOCK();
> error = ip_mforward(m, ifp);
> KERNEL_UNLOCK();
> +   NET_DOWNGRADE();
> if (error) {
> ipstat_inc(ips_cantforward);
> goto bad;
> 
> 
> I deliberately keep NET_RUNLOCK()/NET_LOCK() as separate operations,
> while the reverse one is joined to NET_DOWNGRADE(). The piece above
> comes from larger diff. Which I hope to send later today. I'll become off-line
> then and will be back on Monday 13th.
> 
> O.K. sashan@
> 
> 
> 
> 2017-11-08 11:50 GMT+01:00 Martin Pieuchot :
> > On 08/11/17(Wed) 10:37, Martin Pieuchot wrote:
> >> We're at the stage where we want to run multiple parts of the Network
> >> Stack in parallel.  sashan@ is trying to get multiple network threads
> >> running to exercise parallelism in pf_test() and tb@ is trying to push
> >> the NET_LOCK() further down in ioctl(2) path.
> >>
> >> However we want to be able to gradually select which code paths can be
> >> executed in parallel as the rest of the stack.  So the way to move
> >> forward is to introduce a read version of the NET_LOCK(), NET_RLOCK()
> >> and change code paths on a case-by-case basis.
> >>
> >> Diff below does that for the input processing path.  Note that the 3
> >> tasks below cannot run in parallel because they are scheduled on the
> >> same taskq.
> >>
> >> Regarding assert macros, NET_ASSERT_LOCK() now required either a read
> >> or write lock.  I also introduced a NET_ASSERT_WLOCK() to put in paths
> >> that are not ready to be run in parallel of the rest of the stack.
> >>
> >> Once that's in we can slowly convert existing NET_LOCK() to NET_WLOCK()
> >> to be explicit.
> >
> > I got bitten by rw_status() once again.  There's no way to tell that the
> > current lock is not holding a read lock.  But that's ok since what we
> > really need is to assert that we're not holding a write lock.
> >
> > Fixed diff below, ok?
> >
> > Index: net/if.c
> > ===
> > RCS file: /cvs/src/sys/net/if.c,v
> > retrieving revision 1.523
> > diff -u -p -r1.523 if.c
> > --- net/if.c4 Nov 2017 16:58:46 -   1.523
> > +++ net/if.c8 Nov 2017 10:11:13 -
> > @@ -905,7 +905,7 @@ if_input_process(void *xifidx)
> >  * to PF globals, pipex globals, unicast and multicast addresses
> >  * lists.
> >  */
> > -   NET_LOCK();
> > +   NET_RLOCK();
> > s = splnet();
> > while ((m = ml_dequeue(&ml)) != NULL) {
> > /*
> > @@ -922,7 +922,7 @@ if_input_process(void *xifidx)
> > m_freem(m);
> > }
> > splx(s);
> > -   NET_UNLOCK();
> > +   NET_RUNLOCK();
> >  out:
> > if_put(ifp);
> >  }
> > Index: netinet/ip_input.c
> > ===
> > RCS file: /cvs/src/sys/netinet/ip_input.c,v
> > retrieving revision 1.329
> > diff -u -p -r1.329 ip_input.c
> > --- netinet/ip_input.c  5 Nov 2017 13:19:59 -   1.329
> > +++ netinet/ip_input.c  8 Nov 2017 10:03:54 -
> > @@ -1814,11 +1814,11 @@ ip_send_dispatch(void *xmq)
> > if (ml_empty(&ml))
> > return;
> >
> > -   NET_LOCK();
> > +   NET_RLOCK();
> > while ((m = ml_dequeue(&ml)) != NULL) {
> > ip_output(m, NULL, NULL, 0, NULL, NULL, 0);
> > }
> > -   NET_UNLOCK();
> > +   NET_RUNLOCK();
> >  }
> >
> >  void
> > Index: netinet6/ip6_input.c
> > ==

fuse: vfs create does not map 1:1 to fuse create

2017-11-08 Thread Helg Bredow
There is a bug when creating a file in fuse-exfat and then deleting it
again without first unmounting the file system. The reason for this is
that fuse-exfat maintains strict reference counts and fuse currently
calls the file system create and open functions when it should only
call create. The additional call to open results in a node having one
extra reference on release.

The Linux version of fuse.h states the following for create.

* Create and open a file
*
* If the file does not exist, first create it with the specified
* mode, and then open it.
*
* If this method is not implemented or under Linux kernel
* versions earlier than 2.6.15, the mknod() and open() methods
* will be called instead.
*
* Introduced in version 2.5

The VOP_CREATE(9) function does not behave like this so we either need
to simulate it within fuse or fall back to mknod() and open(). Note
that fuse mknod DOES allow the creation of regular files so doesn't map
completely 1:1 to OpenBSDs mknod(2). Linux has the atomic_open system
call, which maps 1:1 to fuse create.

The following patch is the first pass at changing the behaviour of
fusefs_create so that it maps 1:1 to fuse mknod. If this approach is
accepted, we should consider lowering FUSE_VERSION in fuse.h to 24 so
that file systems can check the supported fuse version at compile time
to determine whether create is available. The only port that currently
does not implement mknod is fuse-zip. It should not be difficult to
patch this.

An alternative and more complicated patch is to modify fusefs_create to
store the file handle returned by the file system in the same way that
fusefs_open does and then not open the file again. The downside of this
is that fusefs_create does not receive the open flags from the vn_open
vfs system call so cannot pass these onto the fuse file system. Let me
know if you would like to see this alternative patch.


Index: sys/miscfs/fuse/fuse_vnops.c
===
RCS file: /cvs/src/sys/miscfs/fuse/fuse_vnops.c,v
retrieving revision 1.33
diff -u -p -r1.33 fuse_vnops.c
--- sys/miscfs/fuse/fuse_vnops.c7 Sep 2016 17:53:35
-   1.33 +++ sys/miscfs/fuse/fuse_vnops.c   8 Nov 2017
13:38:04 - @@ -891,13 +891,13 @@ fusefs_create(void *v)
goto out;
}
 
-   if (fmp->undef_op & UNDEF_CREATE) {
+   if (fmp->undef_op & UNDEF_MKNOD) {
error = ENOSYS;
goto out;
}
 
fbuf = fb_setup(cnp->cn_namelen + 1, ip->ufs_ino.i_number,
-   FBT_CREATE, p);
+   FBT_MKNOD, p);
 
fbuf->fb_io_mode = mode;
fbuf->fb_io_flags = O_CREAT | O_RDWR;
@@ -908,7 +908,7 @@ fusefs_create(void *v)
error = fb_queue(fmp->dev, fbuf);
if (error) {
if (error == ENOSYS)
-   fmp->undef_op |= UNDEF_CREATE;
+   fmp->undef_op |= UNDEF_MKNOD;
 
fb_delete(fbuf);
goto out;
Index: lib/libfuse/fuse_ops.c
===
RCS file: /cvs/src/lib/libfuse/fuse_ops.c,v
retrieving revision 1.26
diff -u -p -r1.26 fuse_ops.c
--- lib/libfuse/fuse_ops.c  7 Sep 2016 17:53:35 -   1.26
+++ lib/libfuse/fuse_ops.c  8 Nov 2017 13:38:05 -
@@ -598,8 +598,6 @@ ifuse_ops_create(struct fuse *f, struct 
 
if (f->op.create)
fbuf->fb_err = f->op.create(realname, mode,  &ffi);
-   else if (f->op.mknod)
-   fbuf->fb_err = f->op.mknod(realname, S_IFREG | mode,
0); else
fbuf->fb_err = -ENOSYS;
 



Re: fuse: vfs create does not map 1:1 to fuse create

2017-11-08 Thread Martin Pieuchot
On 08/11/17(Wed) 14:12, Helg Bredow wrote:
> There is a bug when creating a file in fuse-exfat and then deleting it
> again without first unmounting the file system. The reason for this is
> that fuse-exfat maintains strict reference counts and fuse currently
> calls the file system create and open functions when it should only
> call create. 
> [...]

Does it call the userland file system functions because it receives 2
FBT messages?

Can you see which FBT messages are generated by the kernel?  Does
ktrace(1) has support for that?

> The VOP_CREATE(9) function does not behave like this so we either need
> to simulate it within fuse or fall back to mknod() and open(). 

VOP_CREATE(9) is just a wrapper around the underlying FS.  In your case
fusefs_create().  This function enqueues a single FBT_CREATE operation,
so it doesn't match what you said before.

So I still don't understand the problem.  Where is the bug?

If the problem is that the kernel enqueue 2 FBT operations instead of
one, then change the kernel.

If the problem is that fusefs_create() does not have all the information
for generating a FBT_CREATE message and that it hardcodes (O_CREAT|O_RDWR)
then maybe you should delay that operation to fusefs_open().

These are stupid guesses because I don't understand what the problem is.



un-KERNEL_LOCK() TCP/UDP input & co

2017-11-08 Thread Martin Pieuchot
After auditing all the pr_input() functions, the only missing bits
for taking them out of the KERNEL_LOCK() are: ``etheripstat''.  I
leave such counter conversions for somebody else (8

In the meantime I annotated globals used in these functions.  Most
of the pseudo-interfaces have a global list that is currently protected
by the NET_LOCK().

PCB tables are also currently protected by the NET_LOCK().

carp(4) and IGMP inputs need some love, so they grab the KERNEL_LOCK()
for now.

inet{,6}ctlerrmap are read-only so they get turned into 'const'.

With this diff in we should be able to have a mostly KERNEL_LOCK()-free
softnet taskq.

ok?

Index: net/if_etherip.c
===
RCS file: /cvs/src/sys/net/if_etherip.c,v
retrieving revision 1.21
diff -u -p -r1.21 if_etherip.c
--- net/if_etherip.c25 Oct 2017 09:24:09 -  1.21
+++ net/if_etherip.c8 Nov 2017 14:41:29 -
@@ -434,6 +434,7 @@ ip_etherip_input(struct mbuf **mp, int *
return IPPROTO_DONE;
}
 
+   NET_ASSERT_LOCKED();
LIST_FOREACH(sc, ðerip_softc_list, sc_entry) {
if (sc->sc_src.ss_family != AF_INET ||
sc->sc_dst.ss_family != AF_INET)
@@ -598,6 +599,7 @@ ip6_etherip_input(struct mbuf **mp, int 
in6_recoverscope(&ipsrc, &ip6->ip6_src);
in6_recoverscope(&ipdst, &ip6->ip6_dst);
 
+   NET_ASSERT_LOCKED();
LIST_FOREACH(sc, ðerip_softc_list, sc_entry) {
if (sc->sc_src.ss_family != AF_INET6 ||
sc->sc_dst.ss_family != AF_INET6)
Index: net/if_gif.c
===
RCS file: /cvs/src/sys/net/if_gif.c,v
retrieving revision 1.101
diff -u -p -r1.101 if_gif.c
--- net/if_gif.c25 Oct 2017 09:24:09 -  1.101
+++ net/if_gif.c8 Nov 2017 14:40:26 -
@@ -622,6 +622,7 @@ in_gif_input(struct mbuf **mp, int *offp
 
ip = mtod(m, struct ip *);
 
+   NET_ASSERT_LOCKED();
/* this code will be soon improved. */
LIST_FOREACH(sc, &gif_softc_list, gif_list) {
if (sc->gif_psrc == NULL || sc->gif_pdst == NULL ||
@@ -730,7 +731,8 @@ in6_gif_output(struct ifnet *ifp, int fa
return 0;
 }
 
-int in6_gif_input(struct mbuf **mp, int *offp, int proto, int af)
+int
+in6_gif_input(struct mbuf **mp, int *offp, int proto, int af)
 {
struct mbuf *m = *mp;
struct gif_softc *sc;
@@ -747,6 +749,7 @@ int in6_gif_input(struct mbuf **mp, int 
in6_recoverscope(&src, &ip6->ip6_src);
in6_recoverscope(&dst, &ip6->ip6_dst);
 
+   NET_ASSERT_LOCKED();
LIST_FOREACH(sc, &gif_softc_list, gif_list) {
if (sc->gif_psrc == NULL || sc->gif_pdst == NULL ||
sc->gif_psrc->sa_family != AF_INET6 ||
Index: net/if_pfsync.c
===
RCS file: /cvs/src/sys/net/if_pfsync.c,v
retrieving revision 1.254
diff -u -p -r1.254 if_pfsync.c
--- net/if_pfsync.c 11 Aug 2017 21:24:19 -  1.254
+++ net/if_pfsync.c 8 Nov 2017 14:42:49 -
@@ -659,6 +659,8 @@ pfsync_input(struct mbuf **mp, int *offp
int offset, noff, len, count, mlen, flags = 0;
int e;
 
+   NET_ASSERT_LOCKED();
+
pfsyncstat_inc(pfsyncs_ipackets);
 
/* verify that we have a sync interface configured */
Index: net/if_vxlan.c
===
RCS file: /cvs/src/sys/net/if_vxlan.c,v
retrieving revision 1.63
diff -u -p -r1.63 if_vxlan.c
--- net/if_vxlan.c  25 Oct 2017 09:24:09 -  1.63
+++ net/if_vxlan.c  8 Nov 2017 14:49:58 -
@@ -611,6 +611,7 @@ vxlan_lookup(struct mbuf *m, struct udph
vni = VXLAN_VNI_UNSET;
}
 
+   NET_ASSERT_LOCKED();
/* First search for a vxlan(4) interface with the packet's VNI */
LIST_FOREACH(sc, &vxlan_tagh[VXLAN_TAGHASH(vni)], sc_entry) {
if ((uh->uh_dport == sc->sc_dstport) &&
Index: net/pf.c
===
RCS file: /cvs/src/sys/net/pf.c,v
retrieving revision 1.1043
diff -u -p -r1.1043 pf.c
--- net/pf.c31 Oct 2017 22:05:12 -  1.1043
+++ net/pf.c8 Nov 2017 15:03:35 -
@@ -3184,12 +3184,14 @@ pf_socket_lookup(struct pf_pdesc *pd)
sport = pd->hdr.tcp.th_sport;
dport = pd->hdr.tcp.th_dport;
PF_ASSERT_LOCKED();
+   NET_ASSERT_LOCKED();
tb = &tcbtable;
break;
case IPPROTO_UDP:
sport = pd->hdr.udp.uh_sport;
dport = pd->hdr.udp.uh_dport;
PF_ASSERT_LOCKED();
+   NET_ASSERT_LOCKED();
tb = &udbtable;
break;
default:
Index: net/pipex.c
===
RCS file: /cvs/src/sys/net/pipex.c,v
retrieving rev

Re: iked: Do not accept superfluous arguments

2017-11-08 Thread Klemens Nanni
On Wed, Aug 23, 2017 at 10:42:36PM +0200, Klemens Nanni wrote:
> Calling `iked reload' when I meant `ikectl reload' showed that iked
> happily returned 0 and and fired up another daemon.
> 
> Feedback?
Second bump after two months with the diff reattached. Anyone?

diff --git a/sbin/iked/iked.c b/sbin/iked/iked.c
index 61079167c2a..51183c9b990 100644
--- a/sbin/iked/iked.c
+++ b/sbin/iked/iked.c
@@ -109,6 +109,11 @@ main(int argc, char *argv[])
usage();
}
}
+   argc -= optind;
+   argv += optind;
+
+   if (argc > 1)
+   usage();
 
if ((env = calloc(1, sizeof(*env))) == NULL)
fatal("calloc: env");



Re: iked: Do not accept superfluous arguments

2017-11-08 Thread Patrick Wildt
On Wed, Nov 08, 2017 at 05:18:29PM +0100, Klemens Nanni wrote:
> On Wed, Aug 23, 2017 at 10:42:36PM +0200, Klemens Nanni wrote:
> > Calling `iked reload' when I meant `ikectl reload' showed that iked
> > happily returned 0 and and fired up another daemon.
> > 
> > Feedback?
> Second bump after two months with the diff reattached. Anyone?
> 
> diff --git a/sbin/iked/iked.c b/sbin/iked/iked.c
> index 61079167c2a..51183c9b990 100644
> --- a/sbin/iked/iked.c
> +++ b/sbin/iked/iked.c
> @@ -109,6 +109,11 @@ main(int argc, char *argv[])
>   usage();
>   }
>   }
> + argc -= optind;
> + argv += optind;
> +
> + if (argc > 1)
> + usage();
>  
>   if ((env = calloc(1, sizeof(*env))) == NULL)
>   fatal("calloc: env");
> 

Thanks!  Committed with slight changes and a fix for something that was
better in your last version of the diff.

Happy `iked reload`-ing. :)



Re: [PATCH] amd64/bsd.rd: add growfs(8)

2017-11-08 Thread Florian Obser
On Tue, Nov 07, 2017 at 06:15:09PM +, Job Snijders wrote:
> On Mon, Nov 06, 2017 at 04:14:48PM -0700, Theo de Raadt wrote:
> > I agree on that.  So please put it into the correct lists files for
> > all the unlimited ramdisks.
> > 
> > Job, the situation is a little nit-picky but try to do it for all the
> > architectures and I'll give you fast feedback.
> 
> This is what I was able to test. The current state of affairs: growfs
> in bsd.rd will cost 16K on amd64 and 21K on i386.
> 
>   Filesystem  Size  Used  Avail Capacity mounted on
> amd64 with growfs:/dev/rd0a   3.5M  3.1M  365K90%/
> amd64 without growfs: /dev/rd0a   3.5M  3.1M  381K89%/
> i386 with growfs: /dev/rd0a   3.0M  2.7M  294K90%/
> i386 without growfs:  /dev/rd0a   3.0M  2.7M  315K90%/
> 
> Below is the MI patch. I glanced at Florian's slaacd commit to figure
> out where the link lines should go.

This looks about right. For obvious reasons I can't OK it though...

> 
> Kind regards,
> 
> Job
> 
> diff --git distrib/alpha/bsd.rd/list.local distrib/alpha/bsd.rd/list.local
> index 4d2d3f1875b..c8d52363fe5 100644
> --- distrib/alpha/bsd.rd/list.local
> +++ distrib/alpha/bsd.rd/list.local
> @@ -1,3 +1,4 @@
> +LINK instbin sbin/growfs
>  LINK instbin sbin/mount_cd9660
>  LINK instbin sbin/dhclient
>  LINK instbin bin/mt bin/eject
> diff --git distrib/amd64/ramdisk_cd/list.local 
> distrib/amd64/ramdisk_cd/list.local
> index 49d677cb6d5..094ead2f06a 100644
> --- distrib/amd64/ramdisk_cd/list.local
> +++ distrib/amd64/ramdisk_cd/list.local
> @@ -9,6 +9,7 @@ LINK  instbin sbin/mount_msdos
>  LINK instbin sbin/mount_udf
>  LINK instbin sbin/newfs_msdos
>  LINK instbin sbin/fsck_msdos
> +LINK instbin sbin/growfs
>  LINK instbin sbin/slaacd
>  
>  COPY ${DESTDIR}/etc/ssl/cert.pem etc/ssl/cert.pem
> diff --git distrib/arm64/ramdisk/list distrib/arm64/ramdisk/list
> index d1b4f696646..3f3a2926aff 100644
> --- distrib/arm64/ramdisk/list
> +++ distrib/arm64/ramdisk/list
> @@ -35,6 +35,7 @@ LINKinstbin 
> sbin/fdisk
>  LINK instbin sbin/fsck
>  LINK instbin sbin/fsck_ext2fs
>  LINK instbin sbin/fsck_ffs
> +LINK instbin sbin/growfs
>  LINK instbin sbin/ifconfig
>  LINK instbin sbin/init
>  LINK instbin sbin/mknod
> diff --git distrib/armv7/ramdisk/list distrib/armv7/ramdisk/list
> index dd2b1ddc618..02b4800f226 100644
> --- distrib/armv7/ramdisk/list
> +++ distrib/armv7/ramdisk/list
> @@ -35,6 +35,7 @@ LINKinstbin 
> sbin/fdisk
>  LINK instbin sbin/fsck
>  LINK instbin sbin/fsck_ext2fs
>  LINK instbin sbin/fsck_ffs
> +LINK instbin sbin/growfs
>  LINK instbin sbin/ifconfig
>  LINK instbin sbin/init
>  LINK instbin sbin/mknod
> diff --git distrib/hppa/ramdisk/list.local distrib/hppa/ramdisk/list.local
> index d2130f3bbde..d4598cba7bf 100644
> --- distrib/hppa/ramdisk/list.local
> +++ distrib/hppa/ramdisk/list.local
> @@ -5,6 +5,7 @@ LINK  instbin sbin/disklabel
>  LINK instbin usr/bin/grep usr/bin/egrep 
> usr/bin/fgrep
>  LINK instbin usr/bin/more usr/bin/less
>  LINK instbin sbin/bioctl
> +LINK instbin sbin/growfs
>  LINK instbin sbin/slaacd
>  
>  # copy the MAKEDEV script and make some devices
> diff --git distrib/i386/ramdisk_cd/list.local 
> distrib/i386/ramdisk_cd/list.local
> index 38879e31040..eed3304bb06 100644
> --- distrib/i386/ramdisk_cd/list.local
> +++ distrib/i386/ramdisk_cd/list.local
> @@ -1,6 +1,7 @@
>  # $OpenBSD: list.local,v 1.38 2017/07/08 15:42:46 florian Exp $
>  
>  # add local links; use bin/sh since instbin has already been unlinked
> +LINK instbin sbin/growfs
>  LINK instbin sbin/mount_ext2fs
>  LINK instbin sbin/mount_msdos
>  LINK instbin sbin/mount_udf
> diff --git distrib/landisk/ramdisk/list distrib/landisk/ramdisk/list
> index 0aa2b9109d8..6295dd433dc 100644
> --- distrib/landisk/ramdisk/list
> +++ distrib

Re: [PATCH 1/2 v2] VMD: remove add from switch configuration

2017-11-08 Thread Reyk Floeter
On Thu, Nov 02, 2017 at 05:55:32PM -0700, Carlos Cardenas wrote:
> Remove configuration items on switches for:
> * adding static interfaces
> 
> Adding static interfaces are to be set in hostname.if.
> 
> Changed rule on rdomain:
> * vm->interface->rdomain takes precedence over sw->rdomain
> 
> Update examples/vm.conf and vm.conf manpage to reflect changes.
> 
> Comments? Ok?
> 

Style comments below, otherwise OK

Thanks!
Reyk

> diff --git etc/examples/vm.conf etc/examples/vm.conf
> index f35235e3fba..d61ce8c4977 100644
> --- etc/examples/vm.conf
> +++ etc/examples/vm.conf
> @@ -10,15 +10,14 @@ sets="/var/www/htdocs/pub/OpenBSD/snapshots/amd64/"
>  #
>  
>  switch "uplink" {
> - # This interface will default to bridge0, but switch(4) is supported
> + # This switch will use bridge0, defined by /etc/hostname.bridge0, as
> + # the underlying interface.  switch(4) is also supported
>   #interface switch0
> -
> - # Add additional members
> - add em0
> + interface bridge0
>  }
>  
>  switch "local" {
> - add vether0
> + interface bridge1
>   down
>  }
>  
> diff --git usr.sbin/vmd/parse.y usr.sbin/vmd/parse.y
> index a0e96545923..18543a74911 100644
> --- usr.sbin/vmd/parse.y
> +++ usr.sbin/vmd/parse.y
> @@ -88,7 +88,6 @@ int  parse_disk(char *);
>  static struct vmop_create_params vmc;
>  static struct vm_create_params   *vcp;
>  static struct vmd_switch *vsw;
> -static struct vmd_if *vif;
>  static struct vmd_vm *vm;
>  static char   vsw_type[IF_NAMESIZE];
>  static intvcp_disable;
> @@ -193,7 +192,6 @@ switch: SWITCH string {
>   vsw->sw_id = env->vmd_nswitches + 1;
>   vsw->sw_name = $2;
>   vsw->sw_flags = VMIFF_UP;
> - TAILQ_INIT(&vsw->sw_ifs);
>  
>   vcp_disable = 0;
>   } '{' optnl switch_opts_l '}'   {
> @@ -224,21 +222,6 @@ switch_opts_l: switch_opts_l switch_opts nl
>  switch_opts  : disable   {
>   vcp_disable = $1;
>   }
> - | ADD string{
> - chartype[IF_NAMESIZE];
> -
> - if ((vif = calloc(1, sizeof(*vif))) == NULL)
> - fatal("could not allocate interface");
> -
> - if (priv_getiftype($2, type, NULL) == -1) {
> - yyerror("invalid interface: %s", $2);
> - free($2);
> - YYERROR;
> - }
> - vif->vif_name = $2;
> -
> - TAILQ_INSERT_TAIL(&vsw->sw_ifs, vif, vif_entry);
> - }
>   | GROUP string  {
>   if (priv_validgroup($2) == -1) {
>   yyerror("invalid group name: %s", $2);
> diff --git usr.sbin/vmd/priv.c usr.sbin/vmd/priv.c
> index d585bf75a99..c099bdf4ba5 100644
> --- usr.sbin/vmd/priv.c
> +++ usr.sbin/vmd/priv.c
> @@ -255,7 +255,17 @@ priv_validgroup(const char *name)
>  }
>  
>  /*
> - * Called from the process peer
> + * Called from the Parent process to setup vm interface(s)
> + * - ensure the interface has the description set (tracking purposes)
> + * - if interface is to be attached to a switch, attach it
> + * - check if rdomain is set on interface and switch
> + *   - if interface only or both, use interface rdomain
> + *   - if switch only, use switch rdomain
> + * - check if group is set on interface and switch
> + *   - if interface, add it
> + *   - if switch, add it
> + * - ensure the interface is up/down
> + * - if local interface, set address
>   */

Just a style note -

The "Called from the process peer" was a comment for the "section" of
vm_priv_* functions - that is why it is followed by an empty newline.

If you want to add a "mlarkin-style" comment for the function, please
add it additionally without an empty newline between comment and the
function header.  I don't always add such function comments, but when
we do, we should do it in a consistent way like mlarkin did.

Please also note that this comment might diverge from reality at some
point.  It is a detailed description of what happens in the code, but
what if somebody changes the code without adjusting the comment?  I'd
say it is better to describe the steps inline...  but I don't insist.

>  
>  int
> @@ -279,18 +289,6 @@ vm_priv_ifconfig(struct privsep *ps, struct vmd_vm *vm)
>   sizeof(vfr.vfr_name)) >= sizeof(vfr.vfr_name))
>   return (-1);
>  
> - /* Use the configured rdomain or get it from the process */
> - if (vif->vif_flags & VMIFF_RDOMAIN)
> - vfr.vfr_id = vif->vif_rdomain;
> - else
> - vfr.vfr_id = getrtable();
> - if (vfr.vfr_id != 0)

Re: fuse: vfs create does not map 1:1 to fuse create

2017-11-08 Thread Helg Bredow
On Wed, 8 Nov 2017 16:50:07 +0100
Martin Pieuchot  wrote:

> On 08/11/17(Wed) 14:12, Helg Bredow wrote:
> > There is a bug when creating a file in fuse-exfat and then deleting it
> > again without first unmounting the file system. The reason for this is
> > that fuse-exfat maintains strict reference counts and fuse currently
> > calls the file system create and open functions when it should only
> > call create. 
> > [...]
> 
> Does it call the userland file system functions because it receives 2
> FBT messages?

Yes, fuse receives both create and open from the kernel and blindly
passes them via FBT messages to userland.

> 
> Can you see which FBT messages are generated by the kernel?  Does
> ktrace(1) has support for that?

The debug output from libfuse let's you trace the FBT messages and I
also have judicious printf() statements to help me diagnose.

> 
> > The VOP_CREATE(9) function does not behave like this so we either need
> > to simulate it within fuse or fall back to mknod() and open(). 
> 
> VOP_CREATE(9) is just a wrapper around the underlying FS.  In your case
> fusefs_create().  This function enqueues a single FBT_CREATE operation,
> so it doesn't match what you said before.
> 
> So I still don't understand the problem.  Where is the bug?
> 

The problem is that the kernel's idea of create is different to the
fuse FBT_CREATE passed to userland. The kernel doesn't expect create to
also open the file. fuse create was designed to map 1:1 to
atomic_open, which OpenBSD does not have and I'm not keen to implement
it.

> If the problem is that the kernel enqueue 2 FBT operations instead of
> one, then change the kernel.
> 
> If the problem is that fusefs_create() does not have all the information
> for generating a FBT_CREATE message and that it hardcodes (O_CREAT|O_RDWR)
> then maybe you should delay that operation to fusefs_open().
> 
> These are stupid guesses because I don't understand what the problem is.

Your last suggestion is one that I had not considered. I immediately
went to implement it but was disappointed to find that it won't work
without crazy lock management and maintaining state between kernel
calls. I think it's too complex and prefer the simple solution I've
proposed because it behaves the same as ffs, nfs etc.

-- 
Helg 



README patch

2017-11-08 Thread Edgar Pettijohn
EADME.orig 2017-11-08 20:11:47.091955000 -0600
+++ README  2017-11-08 20:12:19.787639000 -0600
@@ -49,8 +49,8 @@
 
 No major funding or cost-sharing of the project comes from any company
 or educational institution. Theo works full-time on improving OpenBSD
-and paying bills, many other developers expend spend significant
-quantities of time as well.
+and paying bills, many other developers spend significant quantities
+of time as well.
 
 For those unable to make their contributions as straightforward gifts,
 the OpenBSD Foundation (http://OpenBSDFoundation.org) is a Canadian


Or perhaps it should be expend instead. Not sure, but both sound weird.



[PATCH 1/2 v3] VMD: remove add from switch configuration

2017-11-08 Thread Carlos Cardenas
Remove configuration items on switches for:
* adding static interfaces

Adding static interfaces are to be set in hostname.if.

Changed rule on rdomain:
* vm->interface->rdomain takes precedence over sw->rdomain

Update examples/vm.conf and vm.conf manpage to reflect changes.

Comments? Ok?

Ok by reyk@

diff --git etc/examples/vm.conf etc/examples/vm.conf
index f35235e3fba..d61ce8c4977 100644
--- etc/examples/vm.conf
+++ etc/examples/vm.conf
@@ -10,15 +10,14 @@ sets="/var/www/htdocs/pub/OpenBSD/snapshots/amd64/"
 #
 
 switch "uplink" {
-   # This interface will default to bridge0, but switch(4) is supported
+   # This switch will use bridge0, defined by /etc/hostname.bridge0, as
+   # the underlying interface.  switch(4) is also supported
#interface switch0
-
-   # Add additional members
-   add em0
+   interface bridge0
 }
 
 switch "local" {
-   add vether0
+   interface bridge1
down
 }
 
diff --git usr.sbin/vmd/parse.y usr.sbin/vmd/parse.y
index a0e96545923..18543a74911 100644
--- usr.sbin/vmd/parse.y
+++ usr.sbin/vmd/parse.y
@@ -88,7 +88,6 @@ intparse_disk(char *);
 static struct vmop_create_params vmc;
 static struct vm_create_params *vcp;
 static struct vmd_switch   *vsw;
-static struct vmd_if   *vif;
 static struct vmd_vm   *vm;
 static char vsw_type[IF_NAMESIZE];
 static int  vcp_disable;
@@ -193,7 +192,6 @@ switch  : SWITCH string {
vsw->sw_id = env->vmd_nswitches + 1;
vsw->sw_name = $2;
vsw->sw_flags = VMIFF_UP;
-   TAILQ_INIT(&vsw->sw_ifs);
 
vcp_disable = 0;
} '{' optnl switch_opts_l '}'   {
@@ -224,21 +222,6 @@ switch_opts_l  : switch_opts_l switch_opts nl
 switch_opts: disable   {
vcp_disable = $1;
}
-   | ADD string{
-   chartype[IF_NAMESIZE];
-
-   if ((vif = calloc(1, sizeof(*vif))) == NULL)
-   fatal("could not allocate interface");
-
-   if (priv_getiftype($2, type, NULL) == -1) {
-   yyerror("invalid interface: %s", $2);
-   free($2);
-   YYERROR;
-   }
-   vif->vif_name = $2;
-
-   TAILQ_INSERT_TAIL(&vsw->sw_ifs, vif, vif_entry);
-   }
| GROUP string  {
if (priv_validgroup($2) == -1) {
yyerror("invalid group name: %s", $2);
diff --git usr.sbin/vmd/priv.c usr.sbin/vmd/priv.c
index d585bf75a99..c1d0c0e13be 100644
--- usr.sbin/vmd/priv.c
+++ usr.sbin/vmd/priv.c
@@ -255,9 +255,18 @@ priv_validgroup(const char *name)
 }
 
 /*
- * Called from the process peer
+ * Called from the Parent process to setup vm interface(s)
+ * - ensure the interface has the description set (tracking purposes)
+ * - if interface is to be attached to a switch, attach it
+ * - check if rdomain is set on interface and switch
+ *   - if interface only or both, use interface rdomain
+ *   - if switch only, use switch rdomain
+ * - check if group is set on interface and switch
+ *   - if interface, add it
+ *   - if switch, add it
+ * - ensure the interface is up/down
+ * - if local interface, set address
  */
-
 int
 vm_priv_ifconfig(struct privsep *ps, struct vmd_vm *vm)
 {
@@ -279,18 +288,6 @@ vm_priv_ifconfig(struct privsep *ps, struct vmd_vm *vm)
sizeof(vfr.vfr_name)) >= sizeof(vfr.vfr_name))
return (-1);
 
-   /* Use the configured rdomain or get it from the process */
-   if (vif->vif_flags & VMIFF_RDOMAIN)
-   vfr.vfr_id = vif->vif_rdomain;
-   else
-   vfr.vfr_id = getrtable();
-   if (vfr.vfr_id != 0)
-   log_debug("%s: interface %s rdomain %u", __func__,
-   vfr.vfr_name, vfr.vfr_id);
-
-   proc_compose(ps, PROC_PRIV, IMSG_VMDOP_PRIV_IFRDOMAIN,
-   &vfr, sizeof(vfr));
-
/* Description can be truncated */
(void)snprintf(vfr.vfr_value, sizeof(vfr.vfr_value),
"vm%u-if%u-%s", vm->vm_vmid, i, vcp->vcp_name);
@@ -301,8 +298,17 @@ vm_priv_ifconfig(struct privsep *ps, struct vmd_vm *vm)
proc_compose(ps, PROC_PRIV, IMSG_VMDOP_PRIV_IFDESCR,
&vfr, sizeof(vfr));
 
-   /* Add interface to bridge/switch */
-   if ((vsw = switch_getbyname(vif->vif_switch)) != NULL) {
+   /* set default rdomain */
+   vfr.vfr_id = getrtable();
+
+   vsw = switch_getbyname(vif->vif_swit

[PATCH 0/2 v3] VMD: switch configuration changes

2017-11-08 Thread Carlos Cardenas
This patch series makes the following changes to switch configuration:
* Removes adding static interfaces (done in /etc/hostname.if)
* vm->interface->rdomain take precedence over sw->rdomain

Updated regression tests to match vm.conf changes.

Updated examples/vm.conf to match vm.conf changes.

changes since v2:
* style changes from reyk@

-- 
2.14.3



[PATCH 2/2 v3] VMD: regress tests update for switch configuration

2017-11-08 Thread Carlos Cardenas
Update regression tests to match new switch configuration

diff --git regress/usr.sbin/vmd/config/Makefile 
regress/usr.sbin/vmd/config/Makefile
index 2adc69ae491..3bf124aff56 100644
--- regress/usr.sbin/vmd/config/Makefile
+++ regress/usr.sbin/vmd/config/Makefile
@@ -5,7 +5,7 @@ VMD ?= /usr/sbin/vmd
 VMD_PASS=boot-keyword memory-round memory-just-enough
 VMD_FAIL=kernel-keyword too-few-ram vm-name-too-long too-many-ifs \
 boot-name-too-long disk-path-too-long too-many-disks \
-switch-no-interface
+switch-no-interface switch-no-add
 
 REGRESS_TARGETS=
 
diff --git regress/usr.sbin/vmd/config/vmd-fail-switch-no-add.conf 
regress/usr.sbin/vmd/config/vmd-fail-switch-no-add.conf
new file mode 100644
index 000..40117749346
--- /dev/null
+++ regress/usr.sbin/vmd/config/vmd-fail-switch-no-add.conf
@@ -0,0 +1,6 @@
+#  $OpenBSD$
+# Fail when a switch is attempting to use add
+switch "x" {
+interface bridge0
+add vether0
+}
diff --git regress/usr.sbin/vmd/config/vmd-fail-switch-no-add.ok 
regress/usr.sbin/vmd/config/vmd-fail-switch-no-add.ok
new file mode 100644
index 000..c04c2d13732
--- /dev/null
+++ regress/usr.sbin/vmd/config/vmd-fail-switch-no-add.ok
@@ -0,0 +1 @@
+5: syntax error
diff --git regress/usr.sbin/vmd/config/vmd-fail-switch-no-interface.conf 
regress/usr.sbin/vmd/config/vmd-fail-switch-no-interface.conf
index 891d9c88176..f92c09656d6 100644
--- regress/usr.sbin/vmd/config/vmd-fail-switch-no-interface.conf
+++ regress/usr.sbin/vmd/config/vmd-fail-switch-no-interface.conf
@@ -1,5 +1,5 @@
 #  $OpenBSD: vmd-fail-switch-no-interface.conf,v 1.1 2017/10/30 03:49:30 
mlarkin Exp $
 # Fail when a switch is missing interface name
 switch "x" {
-add vether0
+up
 }
-- 
2.14.3



/etc/netstart diff

2017-11-08 Thread Holger Mikolon
The veriable $HN_DIR is set in /etc/netstart on line 166 but used only
once (line 78). The diff below makes use of $HN_DIR in the other cases
where netstart cares of ip address configuration.

With below change I can maintain different sets (think "profiles") of
hostname.if(5) files in separate directories and use them e.g. like this:
"env HN_DIR=/etc/myprofile sh /etc/netstart"

Even without such use case it's at least a consistency fix.

Regards
Holger
;-se


Index: etc/netstart
===
RCS file: /cvs/src/etc/netstart,v
retrieving revision 1.186
diff -u -p -u -r1.186 netstart
--- etc/netstart25 Jul 2017 21:17:11 -  1.186
+++ etc/netstart7 Nov 2017 15:36:25 -
@@ -129,8 +129,8 @@ ifmstart() {
local _sifs=$1 _xifs=$2 _hn _if _sif _xif
 
for _sif in ${_sifs:-ALL}; do
-   for _hn in /etc/hostname.*; do
-   _if=${_hn#/etc/hostname.}
+   for _hn in $HN_DIR/hostname.*; do
+   _if=${_hn#$HN_DIR/hostname.}
[[ $_if == '*' ]] && continue
 
# Skip unwanted ifs.
@@ -147,12 +147,12 @@ ifmstart() {
 # Parse /etc/mygate and add default routes for IPv4 and IPv6
 # Usage: defaultroute
 defaultroute() {
-   ! $V4_DHCPCONF && stripcom /etc/mygate |
+   ! $V4_DHCPCONF && stripcom $HN_DIR/mygate |
while read gw; do
[[ $gw == @(*:*) ]] && continue
route -qn add -host default $gw && break
done
-   ! $V6_AUTOCONF && stripcom /etc/mygate |
+   ! $V6_AUTOCONF && stripcom $HN_DIR/mygate |
while read gw; do
[[ $gw == !(*:*) ]] && continue
route -qn add -host -inet6 default $gw && break



Re: TLS with static non-PIE binaries

2017-11-08 Thread Charles Collicutt
On Sun, Nov 05, 2017 at 01:02:36PM -0800, Philip Guenther wrote:
> Well, ld.so and libc _should_ currently support startup-time TLS using the 
> initial-exec and local-exec modules.

I can't see support for R_x_TPOFF64 relocations in ld.so(1) so I
don't think initial-exec will work. But local-exec doesn't require any
relocations by the dynamic linker so that should work.


> The diff below fixes that, at least on amd64, by checking whether no 
> AUX_phdr value was found and, if so, trying to instead find them via the 
> ELF header referenced via the linker-provided __executable_start symbol. 

The patch fixed the segfaults I was seeing. However, initialized TLS
data doesn't work in static executables: it appears as zero.

$ cat test.s
.globl  foo
.section.tdata,"awT",@nobits
.align 4
.size   foo, 4
foo:
.int42
.text
.globl  get_foo
.type   get_foo, @function
get_foo:
movl%fs:foo@tpoff, %eax
ret

$ cc -o test.o -c test.s
$ cc -o test test.o main.c
$ ./test
42
$ cc -static -o test test.o main.c
$ ./test
0

PIE/no-PIE doesn't seem to make any difference to this.

I don't actually need initialized TLS data for my use, which is
statically linking Go binaries, so you've fixed my problem: thank you!

I've run the complete Go test-suite with your patch applied and
everything that I'd expect to pass does pass. This is on AMD64 only; I
don't have access to any other architecture for testing.

-- 
Charles



Re: TLS with static non-PIE binaries

2017-11-08 Thread Philip Guenther
On Wed, Nov 8, 2017 at 10:43 PM, Charles Collicutt 
wrote:

> On Sun, Nov 05, 2017 at 01:02:36PM -0800, Philip Guenther wrote:
> > Well, ld.so and libc _should_ currently support startup-time TLS using
> the
> > initial-exec and local-exec modules.
>
> I can't see support for R_x_TPOFF64 relocations in ld.so(1) so I
> don't think initial-exec will work. But local-exec doesn't require any
> relocations by the dynamic linker so that should work.
>

Hmm, yeah, that makes sense.



> > The diff below fixes that, at least on amd64, by checking whether no
> > AUX_phdr value was found and, if so, trying to instead find them via the
> > ELF header referenced via the linker-provided __executable_start symbol.
>
> The patch fixed the segfaults I was seeing. However, initialized TLS
> data doesn't work in static executables: it appears as zero.
>
> $ cat test.s
> .globl  foo
> .section.tdata,"awT",@nobits
>

@nobits means the section won't have filespace (where the initialization
data is) included in the loaded data.  It should work with @progbits
instead.


Philip Guenther