Re: pf: honor quick on anchor rules

2018-10-05 Thread Theo de Raadt
Klemens Nanni  wrote:

> On Fri, Oct 05, 2018 at 11:53:08PM +0200, Klemens Nanni wrote:
> > While rules within the anchor do not match, the anchor itself does.
> > After all, it is a rule itself just like `pass' or `block', hence your
> > ruleset's second rule `anchor quick' matches every packet and therefore
> > stops evaluation.
> To illustrate further:
> 
> Block everything except IPv4 traffic:
> 
>   pass
>   anchor quick inet {
>   }
>   block
> 
> This very bad example is stupid but works as expected. It's especially
> bad as it uses an anonymous anchor, thus you cannot modify it later on
> using `pfctl -a ...'.
> 
> Maybe we should disallow anonymous anchors with empty rulesets for above
> mentioned reasons?

I don't understand why to disallow it.  It is quirky, but does as
expected.  Why does one need to be able to modify it?  anchors grew
into what you see now, the initial design was an attempt to beat skip-step
performance by giving the rule-writer more control.  But then the optimizer
showed up...



lib/libfuse: Handle signals that get sent to any thread

2018-10-05 Thread Rian Hunter
lib/libfuse/fuse.c was using a EINTR return from kevent() to detect when 
a signal had occurred, but in a multi-threaded process the signal may 
not have been delivered to the thread blocking on kevent(). This changes 
makes it so the signals are caught using EVFILT_SIGNAL filters so they 
can be detected even when they happen on another thread.


Index: lib/libfuse/fuse.c
===
RCS file: /cvs/src/lib/libfuse/fuse.c,v
retrieving revision 1.49
diff -u -p -u -p -r1.49 fuse.c
--- lib/libfuse/fuse.c  5 Jul 2018 10:57:31 -   1.49
+++ lib/libfuse/fuse.c  5 Oct 2018 22:13:41 -
@@ -32,8 +32,6 @@
 #include "fuse_private.h"
 #include "debug.h"

-static volatile sig_atomic_t sigraised = 0;
-static volatile sig_atomic_t signum = 0;
 static struct fuse_context *ictx = NULL;

 enum {
@@ -116,21 +114,10 @@ static struct fuse_opt fuse_mount_opts[]
 };

 static void
-ifuse_sighdlr(int num)
-{
-   if (!sigraised || (num == SIGCHLD)) {
-   sigraised = 1;
-   signum = num;
-   }
-}
-
-static void
 ifuse_try_unmount(struct fuse *f)
 {
pid_t child;

-   signal(SIGCHLD, ifuse_sighdlr);
-
/* unmount in another thread so fuse_loop() doesn't deadlock */
child = fork();

@@ -152,7 +139,6 @@ ifuse_child_exit(const struct fuse *f)
 {
int status;

-   signal(SIGCHLD, SIG_DFL);
if (waitpid(WAIT_ANY, &status, WNOHANG) == -1)
fprintf(stderr, "fuse: %s\n", strerror(errno));

@@ -160,7 +146,6 @@ ifuse_child_exit(const struct fuse *f)
fprintf(stderr, "fuse: %s: %s\n",
f->fc->dir, strerror(WEXITSTATUS(status)));

-   sigraised = 0;
return;
 }

@@ -170,6 +155,7 @@ fuse_loop(struct fuse *fuse)
struct fusebuf fbuf;
struct fuse_context ctx;
struct fb_ioctl_xch ioexch;
+   struct kevent event[5];
struct kevent ev;
ssize_t n;
int ret;
@@ -181,28 +167,39 @@ fuse_loop(struct fuse *fuse)
if (fuse->fc->kq == -1)
return (-1);

-   EV_SET(&fuse->fc->event, fuse->fc->fd, EVFILT_READ, EV_ADD |
+   EV_SET(&event[0], fuse->fc->fd, EVFILT_READ, EV_ADD |
EV_ENABLE, 0, 0, 0);

+   /* signal events */
+   EV_SET(&event[1], SIGCHLD, EVFILT_SIGNAL, EV_ADD |
+   EV_ENABLE, 0, 0, 0);
+   EV_SET(&event[2], SIGHUP, EVFILT_SIGNAL, EV_ADD |
+   EV_ENABLE, 0, 0, 0);
+   EV_SET(&event[3], SIGINT, EVFILT_SIGNAL, EV_ADD |
+   EV_ENABLE, 0, 0, 0);
+   EV_SET(&event[4], SIGTERM, EVFILT_SIGNAL, EV_ADD |
+   EV_ENABLE, 0, 0, 0);
+
while (!fuse->fc->dead) {
-   ret = kevent(fuse->fc->kq, &fuse->fc->event, 1, &ev, 1, 
NULL);

+   ret = kevent(fuse->fc->kq, &event[0], 5, &ev, 1, NULL);
if (ret == -1) {
-   if (errno == EINTR) {
-   switch (signum) {
-   case SIGCHLD:
-   ifuse_child_exit(fuse);
-   break;
-   case SIGHUP:
-   case SIGINT:
-   case SIGTERM:
-   ifuse_try_unmount(fuse);
-   break;
-   default:
-   fprintf(stderr, "%s: %s\n", 
__func__,

-   strsignal(signum));
-   }
-   } else
+   if (errno != EINTR)
DPERROR(__func__);
+   } else if (ret > 0 && ev.filter == EVFILT_SIGNAL) {
+   int signum = ev.ident;
+   switch (signum) {
+   case SIGCHLD:
+   ifuse_child_exit(fuse);
+   break;
+   case SIGHUP:
+   case SIGINT:
+   case SIGTERM:
+   ifuse_try_unmount(fuse);
+   break;
+   default:
+   fprintf(stderr, "%s: %s\n", __func__,
+   strsignal(signum));
+   }
} else if (ret > 0) {
n = read(fuse->fc->fd, &fbuf, sizeof(fbuf));
if (n != sizeof(fbuf)) {
@@ -479,20 +476,24 @@ fuse_remove_signal_handlers(unused struc
struct sigaction old_sa;

if (sigaction(SIGHUP, NULL, &old_sa) == 0)
-   if (old_sa.sa_handler == ifuse_sighdlr)
+   if (old_sa.sa_handler == SIG_IGN)
signal(SIGHUP, SIG_DFL);

if (sigaction(SIGINT, NULL, &old_sa) == 0)
-   if (old_sa.sa_handler == ifuse_sighdlr)
+

Re: pf: honor quick on anchor rules

2018-10-05 Thread Klemens Nanni
On Fri, Oct 05, 2018 at 11:53:08PM +0200, Klemens Nanni wrote:
> While rules within the anchor do not match, the anchor itself does.
> After all, it is a rule itself just like `pass' or `block', hence your
> ruleset's second rule `anchor quick' matches every packet and therefore
> stops evaluation.
To illustrate further:

Block everything except IPv4 traffic:

pass
anchor quick inet {
}
block

This very bad example is stupid but works as expected. It's especially
bad as it uses an anonymous anchor, thus you cannot modify it later on
using `pfctl -a ...'.

Maybe we should disallow anonymous anchors with empty rulesets for above
mentioned reasons?

Do note that named but empty anchors as fine and even neccessary for
some daemons such as relayd(8):

anchor "relayd/*"



Re: pf: honor quick on anchor rules

2018-10-05 Thread Klemens Nanni
On Fri, Oct 05, 2018 at 10:38:48PM +0200, Fabian Mueller-Knapp wrote:
> If i read man correctly it means "evaluate the rules inside and stop if
> any rule within matched".
While it's own description is quite clear and unambigious:

 quick   If a packet matches a rule which has the quick option set, this
 rule is considered the last matching rule, and evaluation of
 subsequent rules is skipped.

the anchor specific addition can be improved indeed:

 If the anchor itself is marked with the quick option, ruleset
 evaluation will terminate when the anchor is exited if the packet is
 matched by any rule within the anchor.

Does an empty ruleset match a packet? If so, and our code technically
does so, -current including my latest fix behaves correctly according
to the documentation.

> With the current fix, rule evaluation stops after the first block,
> regardless of matching. Therefore the following currently always passes: 
> 
> snap# uname -a
> OpenBSD snap.my.domain 6.4 GENERIC#333 amd64
> snap# pfctl -sr   
>  
> pass all flags S/SA
> anchor quick all {
> }
> block drop all
> 
> 6.1 behaves as i would expect, it drops all. Instead of the empty block,
> any non-matching rule has the same effect.
While rules within the anchor do not match, the anchor itself does.
After all, it is a rule itself just like `pass' or `block', hence your
ruleset's second rule `anchor quick' matches every packet and therefore
stops evaluation.



Re: traceroute6 and ospf6d (icmp6 source addresses and link-locals)

2018-10-05 Thread Stuart Henderson
On 2018/10/05 18:38, Alexander Bluhm wrote:
> IPv6 Source selection is a mess!
> 
> > ICMP6 messages
> > are generated with a source of, I think, the local address associated with
> > the route to the recipient,
> 
> It is not that simple.  Look at in6_ifawithscope() in sys/netinet6/in6.c.

I know that's used for newly generated packets, but I'm not sure it's the
case for icmp6, I haven't tried modifying the kernel to confirm for sure
that this is the code generating the address in this case, but it seems
likely, in icmp6.c:

 /*
1112  * If the incoming packet was addressed directly to us (i.e. 
unicast),
1113  * use dst as the src for the reply.
1114  * The IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED case would be VERY 
rare,
1115  * but is possible (for example) when we encounter an error while
1116  * forwarding procedure destined to a duplicated address of ours.
1117  */
1118 rt = rtalloc(sin6tosa(&sa6_dst), 0, rtableid);
1119 if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL) &&
1120 !ISSET(ifatoia6(rt->rt_ifa)->ia6_flags,
1121 IN6_IFF_ANYCAST|IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)) {
1122 src = &t;
1123 }

> Could you provide your ifconfig and route output?  So we could
> figure out into which path of this algorith you are running.

The host running traceroute has a handful of global scope addresses on
loopback interfaces, plus a global scope address on a vlan interface
facing the next router, all advertised into ospf.

The default source address is one of the loopbacks,
2001:67c:15f4:a423::26, so the only route back from the rest of the
network to this address is via link-locals all the way.

BGP routes changed so I'll include a traceroute using the default source
address again so all the new copied output is consistent:

$ traceroute6 -n www.google.com
traceroute6 to www.google.com (2a00:1450:4009:80b::2004), 64 hops max, 60 byte 
packets
 1  fe80::5606:33d8:d784:cd2f%vlan701  0.494 ms  0.362 ms  0.373 ms
 2  * * *
 3  * * *
 4  2001:7f8:17::1b1b:1  7.272 ms  7.332 ms  6.938 ms
 5  2001:7f8:17::3b41:1  6.699 ms  6.342 ms  6.453 ms
[...]

>From the first hop router,

gr1$ route -n get -inet6 2001:67c:15f4:a423::26
   route to: 2001:67c:15f4:a423::26
destination: 2001:67c:15f4:a423::26
   mask: :::::::
gateway: fe80::e648:4970:e85f:5e13%vlan701
  interface: vlan701
 if address: fe80::5606:33d8:d784:cd2f%vlan701
   priority: 32 (ospf)
  flags: 
 use   mtuexpire
29464060 0 0 

>From the second hop router,

gr5$ route -n get -inet6 2001:67c:15f4:a423::26
   route to: 2001:67c:15f4:a423::26
destination: 2001:67c:15f4:a423::26
   mask: :::::::
gateway: fe80::d05a:f0e8:e5e:a30a%vlan740
  interface: vlan740
 if address: fe80::b769:5751:d87b:44b7%vlan740
   priority: 32 (ospf)
  flags: 
 use   mtuexpire
20369017 0 0 

If instead I source packets from the vlan interface (directly connected
to the next router), I instead get this:

$ traceroute6 -n -s 2a03:8920:1:52bd::184 www.google.com
traceroute6 to www.google.com (2a00:1450:4009:80b::2004) from 
2a03:8920:1:52bd::184, 64 hops max, 60 byte packets
 1  2a03:8920:1:52bd::181  1.769 ms  0.382 ms  0.377 ms
 2  * * *
 3  * * *
 4  2001:7f8:17::1b1b:1  6.931 ms  6.999 ms  7.115 ms
 5  2001:7f8:17::3b41:1  6.466 ms  6.568 ms  6.416 ms

The routes in this case from the hop 1 and 2 routers are

gr1$ route -n get -inet6 2a03:8920:1:52bd::184
   route to: 2a03:8920:1:52bd::184
destination: 2a03:8920:1:52bd::184
   mask: :::::::
  interface: vlan701
 if address: 2a03:8920:1:52bd::181
   priority: 3 ()
  flags: 
 use   mtuexpire
   22811 0 3 

gr5$ route -n get -inet6 2a03:8920:1:52bd::184
   route to: 2a03:8920:1:52bd::184
destination: 2a03:8920:1:52bd::
   mask: :::::
gateway: fe80::d05a:f0e8:e5e:a30a%vlan740
  interface: vlan740
 if address: fe80::b769:5751:d87b:44b7%vlan740
   priority: 32 (ospf)
  flags: 
 use   mtuexpire
 234 0 0 

So the first hop packet is returned from a "normal" address, and the
second (from tcpdump) is returned from fe80::b769:5751:d87b:44b7 and of
course doesn't make it all the way back to the host running traceroute.

> Once I have have added the following rule from a newer RFC.  It
> makes things better for many caes, but not with OSPF6.  There you
> may have an interface with only link-local addresses.  Then this
> link-local is used instead of another better scoped one.

I have global addresses on all interfaces involved, none of the involved
interfaces just have link-local.

> /* RFC 3484 5. Rule 5: Prefer outgoing interface */
> 
> >  4  2001:728:0:5000::55  7.843 ms  8.236 ms  7.391 ms
> 
> How can this work?  Does your AS-Bounda

vmd: rate-limit to avoid reboot loops

2018-10-05 Thread Reyk Floeter
Hi,

it sometimes happens that a VM is stuck in a reboot loop.  This isn't
very pleasent for vmd, so this diff attempts to introduce a hard
rate-limit: if the VM rebooted after less than VM_START_RATE_SEC (6)
seconds, increment a counter.  If this happens VM_START_RATE_LIMIT (3)
times in a row, stop the VM.

The idea is that it might be desirable in some cases to reboot quickly
(you're either really fast on the boot prompt, or you use something
like grub that can automatically reboot into a previous kernel).  But
if this happens too often (more than 3 times), something is wrong and
cannot be intended, not even in the worst Linux/grub/unikernel/...
situation.

These limits are a guessed default.

Test case: I dd'ed random bytes to a kernel after some initial bytes,
keeping the original size of the kernel.  The boot loader loads the
header, the complete kernel, tries to boot it and *boom*, reset ;)

Comments?  Concerns?  Better ideas?  OKs?

Reyk

Index: usr.sbin/vmd/config.c
===
RCS file: /cvs/src/usr.sbin/vmd/config.c,v
retrieving revision 1.50
diff -u -p -u -p -r1.50 config.c
--- usr.sbin/vmd/config.c   7 Aug 2018 14:49:05 -   1.50
+++ usr.sbin/vmd/config.c   5 Oct 2018 21:15:12 -
@@ -187,6 +187,7 @@ config_setvm(struct privsep *ps, struct 
char ifname[IF_NAMESIZE], *s;
char path[PATH_MAX];
unsigned int unit;
+   struct timeval   tv, rate, since_last;
 
errno = 0;
 
@@ -204,6 +205,39 @@ config_setvm(struct privsep *ps, struct 
goto fail;
}
}
+
+   /*
+* Rate-limit the VM so that it cannot restart in a loop:
+* if the VM restarts after less than VM_START_RATE_SEC seconds,
+* we increment the limit counter.  After VM_START_RATE_LIMIT
+* of suchs fast reboots the VM is stopped.
+*/
+   getmonotime(&tv);
+   if (vm->vm_start_tv.tv_sec) {
+   timersub(&tv, &vm->vm_start_tv, &since_last);
+
+   rate.tv_sec = VM_START_RATE_SEC;
+   rate.tv_usec = 0;
+   if (timercmp(&since_last, &rate, <))
+   vm->vm_start_limit++;
+   else {
+   /* Reset counter */
+   vm->vm_start_limit = 0;
+   }
+
+   log_debug("%s: vm %u restarted after %lld.%ld seconds,"
+   " limit %d/%d", __func__, vcp->vcp_id, since_last.tv_sec,
+   since_last.tv_usec, vm->vm_start_limit,
+   VM_START_RATE_LIMIT);
+
+   if (vm->vm_start_limit >= VM_START_RATE_LIMIT) {
+   log_warnx("%s: vm %u restarted too quickly",
+   __func__, vcp->vcp_id);
+   errno = EPERM;
+   goto fail;
+   }
+   }
+   vm->vm_start_tv = tv;
 
diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds));
if (diskfds == NULL) {
Index: usr.sbin/vmd/vmd.c
===
RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v
retrieving revision 1.102
diff -u -p -u -p -r1.102 vmd.c
--- usr.sbin/vmd/vmd.c  29 Sep 2018 22:33:09 -  1.102
+++ usr.sbin/vmd/vmd.c  5 Oct 2018 21:15:12 -
@@ -1918,3 +1918,14 @@ prefixlen2mask(uint8_t prefixlen)
 
return (htonl(0x << (32 - prefixlen)));
 }
+
+void
+getmonotime(struct timeval *tv)
+{
+   struct timespec  ts;
+
+   if (clock_gettime(CLOCK_MONOTONIC, &ts))
+   fatal("clock_gettime");
+
+   TIMESPEC_TO_TIMEVAL(tv, &ts);
+}
Index: usr.sbin/vmd/vmd.h
===
RCS file: /cvs/src/usr.sbin/vmd/vmd.h,v
retrieving revision 1.81
diff -u -p -u -p -r1.81 vmd.h
--- usr.sbin/vmd/vmd.h  1 Oct 2018 09:31:15 -   1.81
+++ usr.sbin/vmd/vmd.h  5 Oct 2018 21:15:13 -
@@ -54,6 +54,10 @@
 #define VMD_SWITCH_TYPE"bridge"
 #define VM_DEFAULT_MEMORY  512
 
+/* Rate-limit fast reboots */
+#define VM_START_RATE_SEC  6   /* min. seconds since last reboot */
+#define VM_START_RATE_LIMIT3   /* max. number of fast reboots */
+
 /* default user instance limits */
 #define VM_DEFAULT_USER_MAXCPU 4
 #define VM_DEFAULT_USER_MAXMEM 2048
@@ -260,6 +264,10 @@ struct vmd_vm {
int  vm_receive_fd;
struct vmd_user *vm_user;
 
+   /* For rate-limiting */
+   struct timeval   vm_
- Message truncated -



Re: Qcow2: External snapshots

2018-10-05 Thread Reyk Floeter
On Wed, Oct 03, 2018 at 11:41:41PM -0700, Ori Bernstein wrote:
> diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c
> index 550b73c1a39..68be738d304 100644
> --- usr.sbin/vmd/config.c
> +++ usr.sbin/vmd/config.c
> @@ -35,6 +35,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  
>  #include "proc.h"
>  #include "vmd.h"
> @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg)
>  int
>  config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t 
> uid)
>  {
> + int diskfds[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK];
>   struct vmd_if   *vif;
>   struct vmop_create_params *vmc = &vm->vm_params;
>   struct vm_create_params *vcp = &vmc->vmc_params;
> - unsigned int i;
> + unsigned int i, j;
>   int  fd = -1, vmboot = 0;
> - int  kernfd = -1, *diskfds = NULL, *tapfds = NULL;
> + int  kernfd = -1;
> + int *tapfds;

keep tapfds = NULL or you might cause a segfault in the goto fail case...

[snip]
>   if (tapfds != NULL) {
>   for (i = 0; i < vcp->vcp_nnics; i++)
>   close(tapfds[i]);

...here (same function).

Reyk



Re: pf: honor quick on anchor rules

2018-10-05 Thread Fabian Mueller-Knapp
Hello,

Sorry for the late reply. I just tested the fix with the latest snapshot
and think the behaviour is still not correct:

On 18-10-04 01:25:09, Klemens Nanni wrote:
> On Sat, Sep 29, 2018 at 10:44:41PM +0200, Klemens Nanni wrote:
> 
> `anchor quick' means "evaluate the rules inside but stop after that".

If i read man correctly it means "evaluate the rules inside and stop if
any rule within matched".

With the current fix, rule evaluation stops after the first block,
regardless of matching. Therefore the following currently always passes: 

snap# uname -a
OpenBSD snap.my.domain 6.4 GENERIC#333 amd64
snap# pfctl -sr
pass all flags S/SA
anchor quick all {
}
block drop all

6.1 behaves as i would expect, it drops all. Instead of the empty block,
any non-matching rule has the same effect.

Regards,
Fabian



Re: traceroute6 and ospf6d (icmp6 source addresses and link-locals)

2018-10-05 Thread Alexander Bluhm
IPv6 Source selection is a mess!

> ICMP6 messages
> are generated with a source of, I think, the local address associated with
> the route to the recipient,

It is not that simple.  Look at in6_ifawithscope() in sys/netinet6/in6.c.

/*
 * At this point, we have two cases:
 * 1. we are looking at a non-deprecated address,
 *and ia6_best is also non-deprecated.
 * 2. we are looking at a deprecated address,
 *and ia6_best is also deprecated.
 * Also, we do not have to consider a case where
 * the scope of if_best is larger(smaller) than dst and
 * the scope of the current address is smaller(larger)
 * than dst. Such a case has already been covered.
 * Tiebreaking is done according to the following
 * items:
 * - the scope comparison between the address and
 *   dst (dscopecmp)
 * - the scope comparison between the address and
 *   ia6_best (bscopecmp)
 * - if the address match dst longer than ia6_best
 *   (matchcmp)
 * - if the address is on the outgoing I/F (outI/F)
 *
 * Roughly speaking, the selection policy is
 * - the most important item is scope. The same scope
 *   is best. Then search for a larger scope.
 *   Smaller scopes are the last resort.
 * - A deprecated address is chosen only when we have
 *   no address that has an enough scope, but is
 *   prefered to any addresses of smaller scopes.
 * - Longest address match against dst is considered
 *   only for addresses that has the same scope of dst.
 * - If there is no other reasons to choose one,
 *   addresses on the outgoing I/F are preferred.
 *
 * The precise decision table is as follows:
 * dscopecmp bscopecmp matchcmp outI/F | replace?
 *!equal equal  N/AYes |  Yes (1)
 *!equal equal  N/A No |   No (2)
 *largerlarger  N/AN/A |   No (3)
 *larger   smaller  N/AN/A |  Yes (4)
 *   smallerlarger  N/AN/A |  Yes (5)
 *   smaller   smaller  N/AN/A |   No (6)
 * equal   smaller  N/AN/A |  Yes (7)
 * equallarger   (already done)
 * equal equal   largerN/A |  Yes (8)
 * equal equal  smallerN/A |   No (9)
 * equal equalequalYes |  Yes (a)
 * equal equalequal No |   No (b)
 */

Could you provide your ifconfig and route output?  So we could
figure out into which path of this algorith you are running.

Once I have have added the following rule from a newer RFC.  It
makes things better for many caes, but not with OSPF6.  There you
may have an interface with only link-local addresses.  Then this
link-local is used instead of another better scoped one.

/* RFC 3484 5. Rule 5: Prefer outgoing interface */

>  4  2001:728:0:5000::55  7.843 ms  8.236 ms  7.391 ms

How can this work?  Does your AS-Boundary Router do NAT?

> What's anyone else doing? Just living with it or has anyone figured a way
> to make it nicer? I'd like to reply with either a global scope address for
> the interface, or a loopback address,

We have implemented more or less a very old RFC.  There are two
newer RFCs with different algorithms.  There is recommendation to
store policies from user-land into the kernel for address selection.

I have just looked at FreeBSD in6_ifawithifp(), it is quite simple.
Perhaps this is a way to go.

> I didn't get anywhere with PF
> translation though.

pf with IPv6 link-local addresses does not work properly.  I think
it cannot parse the %if suffixes.  The KAME hack scope id is not
handled.

bluhm



traceroute6 and ospf6d (icmp6 source addresses and link-locals)

2018-10-05 Thread Stuart Henderson
With ospf6d, routes are added using link-local addresses. ICMP6 messages
are generated with a source of, I think, the local address associated with
the route to the recipient, so with a couple of hops in the internal network
it results in traceroutes looking like

$ traceroute6 -n www.google.com
traceroute6 to www.google.com (2a00:1450:4009:80f::2004), 64 hops max, 60 byte 
packets
 1  fe80::5606:33d8:d784:cd2f%vlan701 (fe80::5606:33d8:d784:cd2f%vlan701)  
0.519 ms  1.547 ms  0.417 ms
 2  * * *
 3  * * *
 4  2001:728:0:5000::55  7.843 ms  8.236 ms  7.391 ms
[...]

The first hop works (ugliness aside) because the link-local is directly
connected, the next ones don't because the ICMP response are sourced from
a link-local addresses (to match the route to the destination) but these
aren't allowed to be forwarded by routers. (And once packets have left
the AS, things work as expected on global addresses).

What's anyone else doing? Just living with it or has anyone figured a way
to make it nicer? I'd like to reply with either a global scope address for
the interface, or a loopback address, I didn't get anywhere with PF
translation though.



Re: odd condition/test in PF lexer

2018-10-05 Thread Theo de Raadt
Todd C. Miller  wrote:

> On Fri, 05 Oct 2018 00:37:33 +0200, Alexandr Nedvedicky wrote:
> 
> > because earlier line at 5279 grants the variable c holds backslash,
> > therefore it can't contain space or tab. The simple change is tempting,
> > but let's check the history first. That particular line has been
> > introduced 10+ years ago with commit message as follows:
> >
> > in the lex... even inside quotes, a \ followed by space or tab should
> > expand to space or tab, and a \ followed by newline should be ignored
> > (as a line continuation).  compatible with the needs of hoststated
> > (which has the most strict quoted string requirements), and ifstated
> > (where one commonly does line continuations in strings).
> >
> > Comment above makes me thinking the intended change looks as follows:
> >
> > 5282 if (next == quotec || next == ' ' || next == '\t')
> 
> I agree with your analysis.  Thank you for taking the time to
> determine the author's intent instead of blindly trusting the static
> analyzer.

It looks obviously correct.

Still, since we are in the release window, I would like to know what the
regression tests think of this.

It took so long for someone to hit this lex error, so it probably should
be punted post-release regardless.



Re: odd condition/test in PF lexer

2018-10-05 Thread Todd C. Miller
On Fri, 05 Oct 2018 00:37:33 +0200, Alexandr Nedvedicky wrote:

> because earlier line at 5279 grants the variable c holds backslash,
> therefore it can't contain space or tab. The simple change is tempting,
> but let's check the history first. That particular line has been
> introduced 10+ years ago with commit message as follows:
>
> in the lex... even inside quotes, a \ followed by space or tab should
> expand to space or tab, and a \ followed by newline should be ignored
> (as a line continuation).  compatible with the needs of hoststated
> (which has the most strict quoted string requirements), and ifstated
> (where one commonly does line continuations in strings).
>
> Comment above makes me thinking the intended change looks as follows:
>
> 5282 if (next == quotec || next == ' ' || next == '\t')

I agree with your analysis.  Thank you for taking the time to
determine the author's intent instead of blindly trusting the static
analyzer.

 - todd



top cpu stats are wrong with hyper threading

2018-10-05 Thread Moritz Buhl
Hi,

Due to the SMT stuff the output of top showed the first few cpus instead
of the ones that are actually active.

To reproduce the bad output:
Use a machine with hyper therading, top should show half the cpus, of
which every second is disabled.

The following diff skips the disabled cpus and disabling/reenabling the
cores with hw.smt also works.
The only problem is that the lines reserved for cpu-stats does not
change with reenabling. Refreshing the output with '1' or resizing the
window should help.

Thanks,
Moritz Buhl


Index: display.c
===
RCS file: /home/mbuhl/cvs/src/usr.bin/top/display.c,v
retrieving revision 1.55
diff -u -p -r1.55 display.c
--- display.c   26 Sep 2018 17:23:13 -  1.55
+++ display.c   5 Oct 2018 15:19:00 -
@@ -381,7 +381,7 @@ cpustates_tag(int cpu)
 void
 i_cpustates(int64_t *ostates, int *online)
 {
-   int i, first, cpu, ncpuonline;
+   int i, first, cpu, ncpuonline, off;
double value;
int64_t *states;
char **names, *thisname;
@@ -434,15 +434,18 @@ i_cpustates(int64_t *ostates, int *onlin
}
return;
}
+   off = 0;
for (cpu = 0; cpu < ncpu; cpu++) {
/* now walk thru the names and print the line */
names = cpustate_names;
first = 0;
states = ostates + (CPUSTATES * cpu);
 
-   if ((screen_length > 2 + cpu && 2 + cpu < y_mem) ||
+   if (y_mem == ncpuonline + 2 && !online[cpu])
+   continue;
+   if ((screen_length > 2 + cpu && 2 + off < y_mem) ||
!smart_terminal) {
-   move(2 + cpu, 0);
+   move(2 + off, 0);
clrtoeol();
addstrp(cpustates_tag(cpu));
 
@@ -465,6 +468,7 @@ i_cpustates(int64_t *ostates, int *onlin
}
putn();
}
+   off++;
}
 }
 



landisk: fix up MD swap64 function

2018-10-05 Thread Christian Weisgerber
Provide an MD 64-bit byteswapping function build on 32-bit swaps
as we do on arm and i386.  Copied from arm.

If there are no MD byteswapping functions, MI macros are used.
These are wrapped by static inline functions to prevent multiple
evaluation of their argument.  If there are MD functions, they are
used directly and therefore we must implicitly guarantee that they
are safe from multiple evaluation.  Defining an MD function to an
MI macro breaks this promise.

OK?

Index: arch/sh/include/endian.h
===
RCS file: /cvs/src/sys/arch/sh/include/endian.h,v
retrieving revision 1.6
diff -u -p -r1.6 endian.h
--- arch/sh/include/endian.h2 Oct 2018 21:30:44 -   1.6
+++ arch/sh/include/endian.h5 Oct 2018 13:17:00 -
@@ -31,7 +31,16 @@ __swap32md(__uint32_t _x)
return (_rv);
 }
 
-#define__swap64md  __swap64gen
+static __inline __uint64_t
+__swap64md(__uint64_t _x)
+{
+   __uint64_t _rv;
+
+   _rv = (__uint64_t)__swap32md(_x >> 32) |
+   (__uint64_t)__swap32md(_x) << 32;
+
+   return (_rv);
+}
 
 /* Tell sys/endian.h we have MD variants of the swap macros.  */
 #define __HAVE_MD_SWAP
-- 
Christian "naddy" Weisgerber  na...@mips.inka.de



vmd: servicing virtio devices from separate processes

2018-10-05 Thread Sergio Lopez
Hi,

I have an idea in mind that I'd like to share to ask you if you think
it's worth giving it a try.

Right now, vmd already features an excellent privsep model to ensure
the process servicing the VM requests to the outside world is running
with the lowest possible privileges.

I was wondering if we could take a step further, servicing each virtio
device from a different process. This design would simplify the
implementation and maintenance of those devices, improve the privsep
model and increase the resilience of VMs (the crash of a process
servicing a device won't bring down the whole VM, and a mechanism to
recover from this scenario could be explored).

Doing this in an efficient way requires:

 - The ability to receive virtqueue notifications directly on the
process. I've already sent an RFC patch for this (see "Lightweight
mechanism to kick virtio VQs"), as it'd be useful for the non-separated
model too.

 - An in-kernel IRQ chip. This one is the most controversial, as it
means emulating a device from a privileged domain, but I'm pretty sure
a lapic implementation with enough functionality to serve *BSD/Linux
Guests can be small and simple enough to be easily auditable.

 - A way to map the VM memory into a third process context. Can
uvm_share for this? Here's also the opportunity to explore options to
avoid mapping the whole VM memory, though that'll possibly require
functionality non covered by the virtio specs.

Do you think it's worth exploring this model? What are feelings
regarding the in-kernel IRQ chip?

Sergio (slp).



[RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs

2018-10-05 Thread Sergio Lopez
Hi,

This patch implements a mechanism to allow users register an I/O port
with a special file descriptor (kickfd) which can be monitored for
events using kevent. The kernel will note an event each time the Guest
writes to an I/O port registered with a kickfd.

This is mainly intended for kicking virtio virtqueues, and allows the
VCPU to return immediately to the Guest, instead of having to exit all
the way to vmd to process the request. With kickfd, the actual work
(i.e. an I/O operation) requested by the Guest can be done in a
parallel thread that eventually asserts the corresponding IRQ.

In a general sense, this improves the I/O performance when the Guest is
issuing asynchronous operations (or its virtio implementation is able
to access the virtqueue asynchronously) and makes it more responsive
(as the VCPU spends less time outside the Guest), but it may hurt a bit
when the Guest is actively waiting for the operation to finish. This is
also a first towards the possibility of having virtio devices running
on separate processes, but I'll write more about this on another email
thread.

I deliberately omitted the i386 implementation to make this easier to
review.

Sergio (slp).


Index: arch/amd64/amd64/vmm.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
retrieving revision 1.220
diff -u -p -r1.220 vmm.c
--- arch/amd64/amd64/vmm.c  20 Sep 2018 14:32:59 -  1.220
+++ arch/amd64/amd64/vmm.c  5 Oct 2018 09:54:34 -
@@ -20,6 +20,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -63,6 +66,16 @@ void *l1tf_flush_region;
 #define VMX_EXIT_INFO_COMPLETE \
 (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON)
 
+struct kickfd {
+   uint16_t ioport;
+   struct klist note;
+   struct vm*vm;
+
+   SLIST_ENTRY(kickfd)  kickfd_link;
+};
+
+SLIST_HEAD(kickfd_head, kickfd);
+
 struct vm {
vm_map_t vm_map;
uint32_t vm_id;
@@ -77,6 +90,9 @@ struct vm {
u_intvm_vcpus_running;
struct rwlockvm_vcpu_lock;
 
+   struct kickfd_head   vm_kickfd_list;
+   struct rwlockvm_kickfd_lock;
+
SLIST_ENTRY(vm)  vm_link;
 };
 
@@ -122,6 +138,7 @@ int vm_get_info(struct vm_info_params *)
 int vm_resetcpu(struct vm_resetcpu_params *);
 int vm_intr_pending(struct vm_intr_params *);
 int vm_rwregs(struct vm_rwregs_params *, int);
+int vm_register_kickfd(struct vm_regkickfd_params *, struct proc *);
 int vm_find(uint32_t, struct vm **);
 int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *);
 int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *);
@@ -281,6 +298,7 @@ const struct {
 /* Pools for VMs and VCPUs */
 struct pool vm_pool;
 struct pool vcpu_pool;
+struct pool kickfd_pool;
 
 struct vmm_softc *vmm_softc;
 
@@ -419,6 +437,8 @@ vmm_attach(struct device *parent, struct
"vmpool", NULL);
pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
"vcpupl", NULL);
+   pool_init(&kickfd_pool, sizeof(struct kickfd), 64, IPL_NONE, PR_WAITOK,
+   "kickfdpl", NULL);
 
vmm_softc = sc;
 }
@@ -482,6 +502,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t 
case VMM_IOC_WRITEREGS:
ret = vm_rwregs((struct vm_rwregs_params *)data, 1);
break;
+   case VMM_IOC_REGKICKFD:
+   ret = vm_register_kickfd((struct vm_regkickfd_params *)data, p);
+   break;
default:
DPRINTF("%s: unknown ioctl code 0x%lx\n", __func__, cmd);
ret = ENOTTY;
@@ -513,6 +536,7 @@ pledge_ioctl_vmm(struct proc *p, long co
case VMM_IOC_INTR:
case VMM_IOC_READREGS:
case VMM_IOC_WRITEREGS:
+   case VMM_IOC_REGKICKFD:
return (0);
}
 
@@ -725,6 +749,176 @@ vm_rwregs(struct vm_rwregs_params *vrwp,
}
 }
 
+void
+filt_vmmkickfd_detach(struct knote *kn)
+{
+   struct kickfd *kickfd = kn->kn_fp->f_data;
+
+   rw_enter_write(&kickfd->vm->vm_kickfd_lock);
+   SLIST_REMOVE(&kickfd->note, kn, knote, kn_selnext);
+   rw_exit_write(&kickfd->vm->vm_kickfd_lock);
+}
+
+int
+filt_vmmkickfd_read(struct knote *kn, long hint)
+{
+   return(1);
+}
+
+struct filterops vmmkickfd_filtops =
+   { 1, NULL, filt_vmmkickfd_detach, filt_vmmkickfd_read };
+
+int
+vmmkickfd_read(struct file *fp, struct uio *uio, int fflags)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_write(struct file *fp, struct uio *uio, int fflags)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p)
+{
+   return (ENOTTY);
+}
+
+int
+vmmkickfd_poll(struct file *fp, int events, struct proc *p)
+{
+   return (EINVAL);
+}
+
+int
+vmmkickfd_

Re: do not join node information multicast group

2018-10-05 Thread Sebastian Benoit
Florian Obser(flor...@openbsd.org) on 2018.10.04 20:13:03 +0200:
> Benno removed code to answer ICMP queries over 4 years ago.
> Aham Brahmasmi (aham.brahmasmi AT gmx.com) points out
> that we still joine the group though.
> 
> OK?


ok benno@

> diff --git in6.c in6.c
> index c09ab1dcd0a..5297c0a1249 100644
> --- in6.c
> +++ in6.c
> @@ -808,19 +808,6 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq 
> *ifra,
>   goto cleanup;
>   LIST_INSERT_HEAD(&ia6->ia6_memberships, imm, i6mm_chain);
>  
> - /*
> -  * join node information group address
> -  */
> - if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr) == 0) {
> - imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error);
> - if (!imm) {
> - /* XXX not very fatal, go on... */
> - } else {
> - LIST_INSERT_HEAD(&ia6->ia6_memberships,
> - imm, i6mm_chain);
> - }
> - }
> -
>   /*
>* join interface-local all-nodes address.
>* (ff01::1%ifN, and ff01::%ifN/32)
> diff --git in6_ifattach.c in6_ifattach.c
> index 2f8463e3a47..b6e67a5eee7 100644
> --- in6_ifattach.c
> +++ in6_ifattach.c
> @@ -428,57 +428,6 @@ in6_ifattach_loopback(struct ifnet *ifp)
>   return (in6_update_ifa(ifp, &ifra, NULL));
>  }
>  
> -/*
> - * compute NI group address, based on the current hostname setting.
> - * see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later).
> - *
> - * when ifp == NULL, the caller is responsible for filling scopeid.
> - */
> -int
> -in6_nigroup(struct ifnet *ifp, const char *name, int namelen,
> -struct sockaddr_in6 *sa6)
> -{
> - const char *p;
> - u_int8_t *q;
> - SHA2_CTX ctx;
> - u_int8_t digest[SHA512_DIGEST_LENGTH];
> - u_int8_t l;
> - u_int8_t n[64]; /* a single label must not exceed 63 chars */
> -
> - if (!namelen || !name)
> - return -1;
> -
> - p = name;
> - while (p && *p && *p != '.' && p - name < namelen)
> - p++;
> - if (p - name > sizeof(n) - 1)
> - return -1;  /* label too long */
> - l = p - name;
> - strncpy((char *)n, name, l);
> - n[(int)l] = '\0';
> - for (q = n; *q; q++) {
> - if ('A' <= *q && *q <= 'Z')
> - *q = *q - 'A' + 'a';
> - }
> -
> - /* generate 8 bytes of pseudo-random value. */
> - SHA512Init(&ctx);
> - SHA512Update(&ctx, &l, sizeof(l));
> - SHA512Update(&ctx, n, l);
> - SHA512Final(digest, &ctx);
> -
> - bzero(sa6, sizeof(*sa6));
> - sa6->sin6_family = AF_INET6;
> - sa6->sin6_len = sizeof(*sa6);
> - sa6->sin6_addr.s6_addr16[0] = htons(0xff02);
> - sa6->sin6_addr.s6_addr16[1] = htons(ifp->if_index);
> - sa6->sin6_addr.s6_addr8[11] = 2;
> - memcpy(&sa6->sin6_addr.s6_addr32[3], digest,
> - sizeof(sa6->sin6_addr.s6_addr32[3]));
> -
> - return 0;
> -}
> -
>  /*
>   * XXX multiple loopback interface needs more care.  for instance,
>   * nodelocal address needs to be configured onto only one of them.
> diff --git in6_ifattach.h in6_ifattach.h
> index 0f54b457de9..525cc365ffe 100644
> --- in6_ifattach.h
> +++ in6_ifattach.h
> @@ -36,7 +36,6 @@
>  #ifdef _KERNEL
>  int in6_ifattach(struct ifnet *);
>  void in6_ifdetach(struct ifnet *);
> -int in6_nigroup(struct ifnet *, const char *, int, struct sockaddr_in6 *);
>  int in6_ifattach_linklocal(struct ifnet *, struct in6_addr *);
>  void in6_soiiupdate(struct ifnet *);
>  #endif /* _KERNEL */
> 
> 
> -- 
> I'm not entirely sure you are real.
> 



Re: Qcow2: External snapshots

2018-10-05 Thread Reyk Floeter
On Wed, Oct 03, 2018 at 11:41:41PM -0700, Ori Bernstein wrote:
> Thanks, another update based on Reyk's feeback and fixes.
> 

You missed one thing: jmc@'s manpage comments.

For everything else:  Looks good!  Tests work fine.  OK reyk@

Reyk

> diff --git regress/usr.sbin/vmd/diskfmt/Makefile 
> regress/usr.sbin/vmd/diskfmt/Makefile
> index c2a5f42d5f6..1f8673e0e26 100644
> --- regress/usr.sbin/vmd/diskfmt/Makefile
> +++ regress/usr.sbin/vmd/diskfmt/Makefile
> @@ -11,7 +11,7 @@
>  VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/
>  
>  PROG=vioscribble
> -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> +SRCS=vioscribble.c vioqcow2.c vioraw.c
>  CFLAGS+=-I$(VMD_DIR) -pthread
>  LDFLAGS+=-pthread
>  
> @@ -26,3 +26,6 @@ scribble-images:
>  .PHONY: ${REGRESS_TARGETS} scribble-images
>  
>  .include 
> +
> +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c
> + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c .
> diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c 
> regress/usr.sbin/vmd/diskfmt/vioscribble.c
> index 14d720db652..1da8efedac7 100644
> --- regress/usr.sbin/vmd/diskfmt/vioscribble.c
> +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c
> @@ -122,16 +122,18 @@ main(int argc, char **argv)
>   verbose = !!getenv("VERBOSE");
>   qcfd = open("scribble.qc2", O_RDWR);
>   rawfd = open("scribble.raw", O_RDWR);
> - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1)
> + if (qcfd == -1)
>   err(1, "unable to open qcow");
> - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1)
> + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1)
> + err(1, "unable to init qcow");
> + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1)
>   err(1, "unable to open raw");
>  
>   srandom_deterministic(123);
>  
>   /* scribble to both disks */
>   printf("scribbling...\n");
> - for (i = 0; i < 16; i++) {
> + for (i = 0; i < 1024*16; i++) {
>   off = (random() % DISKSZ);
>   len = random() % sizeof buf + 1;
>   fill(off, buf, sizeof buf);
> diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c
> index 8748ecfdedc..a3ab4672370 100644
> --- usr.sbin/vmctl/main.c
> +++ usr.sbin/vmctl/main.c
> @@ -67,7 +67,8 @@ int  ctl_receive(struct parse_result *, int, char 
> *[]);
>  
>  struct ctl_command ctl_commands[] = {
>   { "console",CMD_CONSOLE,ctl_console,"id" },
> - { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 },
> + { "create", CMD_CREATE, ctl_create,
> + "\"path\" [-s size] [-b base]", 1 },
>   { "load",   CMD_LOAD,   ctl_load,   "\"path\"" },
>   { "log",CMD_LOG,ctl_log,"[verbose|brief]" },
>   { "reload", CMD_RELOAD, ctl_reload, "" },
> @@ -538,47 +539,55 @@ int
>  ctl_create(struct parse_result *res, int argc, char *argv[])
>  {
>   int  ch, ret, type;
> - const char  *paths[2], *disk, *format;
> + const char  *disk, *format, *base;
>  
>   if (argc < 2)
>   ctl_usage(res->ctl);
>  
> + base = NULL;
>   type = parse_disktype(argv[1], &disk);
>  
> - paths[0] = disk;
> - paths[1] = NULL;
> -
> - if (unveil(paths[0], "rwc") == -1)
> + if (pledge("stdio rpath wpath cpath unveil", NULL) == -1)
> + err(1, "pledge");
> + if (unveil(disk, "rwc") == -1)
>   err(1, "unveil");
>  
> - if (pledge("stdio rpath wpath cpath", NULL) == -1)
> - err(1, "pledge");
>   argc--;
>   argv++;
>  
> - while ((ch = getopt(argc, argv, "s:")) != -1) {
> + while ((ch = getopt(argc, argv, "s:b:")) != -1) {
>   switch (ch) {
>   case 's':
>   if (parse_size(res, optarg, 0) != 0)
>   errx(1, "invalid size: %s", optarg);
>   break;
> + case 'b':
> + base = optarg;
> + if (unveil(base, "r") == -1)
> + err(1, "unveil");
> + break;
>   default:
>   ctl_usage(res->ctl);
>   /* NOTREACHED */
>   }
>   }
> + if (unveil(NULL, NULL))
> + err(1, "unveil");
>  
> - if (res->size == 0) {
> - fprintf(stderr, "missing size argument\n");
> + if (base && type != VMDF_QCOW2)
> + errx(1, "base images require qcow2 disk format");
> + if (res->size == 0 && !base) {
> + fprintf(stderr, "could not create %s: missing size argument\n",
> + disk);
>   ctl_usage(res->ctl);
>   }
>  
>   if (type == VMDF_QCOW2) {
>   format = "qcow2";
> - ret = create_qc2_imagefile(paths[0], res->size);
> + ret = create_qc2_imagefile(disk, base, res->size);
>