Re: pf: honor quick on anchor rules
Klemens Nanni wrote: > On Fri, Oct 05, 2018 at 11:53:08PM +0200, Klemens Nanni wrote: > > While rules within the anchor do not match, the anchor itself does. > > After all, it is a rule itself just like `pass' or `block', hence your > > ruleset's second rule `anchor quick' matches every packet and therefore > > stops evaluation. > To illustrate further: > > Block everything except IPv4 traffic: > > pass > anchor quick inet { > } > block > > This very bad example is stupid but works as expected. It's especially > bad as it uses an anonymous anchor, thus you cannot modify it later on > using `pfctl -a ...'. > > Maybe we should disallow anonymous anchors with empty rulesets for above > mentioned reasons? I don't understand why to disallow it. It is quirky, but does as expected. Why does one need to be able to modify it? anchors grew into what you see now, the initial design was an attempt to beat skip-step performance by giving the rule-writer more control. But then the optimizer showed up...
lib/libfuse: Handle signals that get sent to any thread
lib/libfuse/fuse.c was using a EINTR return from kevent() to detect when a signal had occurred, but in a multi-threaded process the signal may not have been delivered to the thread blocking on kevent(). This changes makes it so the signals are caught using EVFILT_SIGNAL filters so they can be detected even when they happen on another thread. Index: lib/libfuse/fuse.c === RCS file: /cvs/src/lib/libfuse/fuse.c,v retrieving revision 1.49 diff -u -p -u -p -r1.49 fuse.c --- lib/libfuse/fuse.c 5 Jul 2018 10:57:31 - 1.49 +++ lib/libfuse/fuse.c 5 Oct 2018 22:13:41 - @@ -32,8 +32,6 @@ #include "fuse_private.h" #include "debug.h" -static volatile sig_atomic_t sigraised = 0; -static volatile sig_atomic_t signum = 0; static struct fuse_context *ictx = NULL; enum { @@ -116,21 +114,10 @@ static struct fuse_opt fuse_mount_opts[] }; static void -ifuse_sighdlr(int num) -{ - if (!sigraised || (num == SIGCHLD)) { - sigraised = 1; - signum = num; - } -} - -static void ifuse_try_unmount(struct fuse *f) { pid_t child; - signal(SIGCHLD, ifuse_sighdlr); - /* unmount in another thread so fuse_loop() doesn't deadlock */ child = fork(); @@ -152,7 +139,6 @@ ifuse_child_exit(const struct fuse *f) { int status; - signal(SIGCHLD, SIG_DFL); if (waitpid(WAIT_ANY, &status, WNOHANG) == -1) fprintf(stderr, "fuse: %s\n", strerror(errno)); @@ -160,7 +146,6 @@ ifuse_child_exit(const struct fuse *f) fprintf(stderr, "fuse: %s: %s\n", f->fc->dir, strerror(WEXITSTATUS(status))); - sigraised = 0; return; } @@ -170,6 +155,7 @@ fuse_loop(struct fuse *fuse) struct fusebuf fbuf; struct fuse_context ctx; struct fb_ioctl_xch ioexch; + struct kevent event[5]; struct kevent ev; ssize_t n; int ret; @@ -181,28 +167,39 @@ fuse_loop(struct fuse *fuse) if (fuse->fc->kq == -1) return (-1); - EV_SET(&fuse->fc->event, fuse->fc->fd, EVFILT_READ, EV_ADD | + EV_SET(&event[0], fuse->fc->fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, 0); + /* signal events */ + EV_SET(&event[1], SIGCHLD, EVFILT_SIGNAL, EV_ADD | + EV_ENABLE, 0, 0, 0); + EV_SET(&event[2], SIGHUP, EVFILT_SIGNAL, EV_ADD | + EV_ENABLE, 0, 0, 0); + EV_SET(&event[3], SIGINT, EVFILT_SIGNAL, EV_ADD | + EV_ENABLE, 0, 0, 0); + EV_SET(&event[4], SIGTERM, EVFILT_SIGNAL, EV_ADD | + EV_ENABLE, 0, 0, 0); + while (!fuse->fc->dead) { - ret = kevent(fuse->fc->kq, &fuse->fc->event, 1, &ev, 1, NULL); + ret = kevent(fuse->fc->kq, &event[0], 5, &ev, 1, NULL); if (ret == -1) { - if (errno == EINTR) { - switch (signum) { - case SIGCHLD: - ifuse_child_exit(fuse); - break; - case SIGHUP: - case SIGINT: - case SIGTERM: - ifuse_try_unmount(fuse); - break; - default: - fprintf(stderr, "%s: %s\n", __func__, - strsignal(signum)); - } - } else + if (errno != EINTR) DPERROR(__func__); + } else if (ret > 0 && ev.filter == EVFILT_SIGNAL) { + int signum = ev.ident; + switch (signum) { + case SIGCHLD: + ifuse_child_exit(fuse); + break; + case SIGHUP: + case SIGINT: + case SIGTERM: + ifuse_try_unmount(fuse); + break; + default: + fprintf(stderr, "%s: %s\n", __func__, + strsignal(signum)); + } } else if (ret > 0) { n = read(fuse->fc->fd, &fbuf, sizeof(fbuf)); if (n != sizeof(fbuf)) { @@ -479,20 +476,24 @@ fuse_remove_signal_handlers(unused struc struct sigaction old_sa; if (sigaction(SIGHUP, NULL, &old_sa) == 0) - if (old_sa.sa_handler == ifuse_sighdlr) + if (old_sa.sa_handler == SIG_IGN) signal(SIGHUP, SIG_DFL); if (sigaction(SIGINT, NULL, &old_sa) == 0) - if (old_sa.sa_handler == ifuse_sighdlr) +
Re: pf: honor quick on anchor rules
On Fri, Oct 05, 2018 at 11:53:08PM +0200, Klemens Nanni wrote: > While rules within the anchor do not match, the anchor itself does. > After all, it is a rule itself just like `pass' or `block', hence your > ruleset's second rule `anchor quick' matches every packet and therefore > stops evaluation. To illustrate further: Block everything except IPv4 traffic: pass anchor quick inet { } block This very bad example is stupid but works as expected. It's especially bad as it uses an anonymous anchor, thus you cannot modify it later on using `pfctl -a ...'. Maybe we should disallow anonymous anchors with empty rulesets for above mentioned reasons? Do note that named but empty anchors as fine and even neccessary for some daemons such as relayd(8): anchor "relayd/*"
Re: pf: honor quick on anchor rules
On Fri, Oct 05, 2018 at 10:38:48PM +0200, Fabian Mueller-Knapp wrote: > If i read man correctly it means "evaluate the rules inside and stop if > any rule within matched". While it's own description is quite clear and unambigious: quick If a packet matches a rule which has the quick option set, this rule is considered the last matching rule, and evaluation of subsequent rules is skipped. the anchor specific addition can be improved indeed: If the anchor itself is marked with the quick option, ruleset evaluation will terminate when the anchor is exited if the packet is matched by any rule within the anchor. Does an empty ruleset match a packet? If so, and our code technically does so, -current including my latest fix behaves correctly according to the documentation. > With the current fix, rule evaluation stops after the first block, > regardless of matching. Therefore the following currently always passes: > > snap# uname -a > OpenBSD snap.my.domain 6.4 GENERIC#333 amd64 > snap# pfctl -sr > > pass all flags S/SA > anchor quick all { > } > block drop all > > 6.1 behaves as i would expect, it drops all. Instead of the empty block, > any non-matching rule has the same effect. While rules within the anchor do not match, the anchor itself does. After all, it is a rule itself just like `pass' or `block', hence your ruleset's second rule `anchor quick' matches every packet and therefore stops evaluation.
Re: traceroute6 and ospf6d (icmp6 source addresses and link-locals)
On 2018/10/05 18:38, Alexander Bluhm wrote: > IPv6 Source selection is a mess! > > > ICMP6 messages > > are generated with a source of, I think, the local address associated with > > the route to the recipient, > > It is not that simple. Look at in6_ifawithscope() in sys/netinet6/in6.c. I know that's used for newly generated packets, but I'm not sure it's the case for icmp6, I haven't tried modifying the kernel to confirm for sure that this is the code generating the address in this case, but it seems likely, in icmp6.c: /* 1112 * If the incoming packet was addressed directly to us (i.e. unicast), 1113 * use dst as the src for the reply. 1114 * The IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED case would be VERY rare, 1115 * but is possible (for example) when we encounter an error while 1116 * forwarding procedure destined to a duplicated address of ours. 1117 */ 1118 rt = rtalloc(sin6tosa(&sa6_dst), 0, rtableid); 1119 if (rtisvalid(rt) && ISSET(rt->rt_flags, RTF_LOCAL) && 1120 !ISSET(ifatoia6(rt->rt_ifa)->ia6_flags, 1121 IN6_IFF_ANYCAST|IN6_IFF_TENTATIVE|IN6_IFF_DUPLICATED)) { 1122 src = &t; 1123 } > Could you provide your ifconfig and route output? So we could > figure out into which path of this algorith you are running. The host running traceroute has a handful of global scope addresses on loopback interfaces, plus a global scope address on a vlan interface facing the next router, all advertised into ospf. The default source address is one of the loopbacks, 2001:67c:15f4:a423::26, so the only route back from the rest of the network to this address is via link-locals all the way. BGP routes changed so I'll include a traceroute using the default source address again so all the new copied output is consistent: $ traceroute6 -n www.google.com traceroute6 to www.google.com (2a00:1450:4009:80b::2004), 64 hops max, 60 byte packets 1 fe80::5606:33d8:d784:cd2f%vlan701 0.494 ms 0.362 ms 0.373 ms 2 * * * 3 * * * 4 2001:7f8:17::1b1b:1 7.272 ms 7.332 ms 6.938 ms 5 2001:7f8:17::3b41:1 6.699 ms 6.342 ms 6.453 ms [...] >From the first hop router, gr1$ route -n get -inet6 2001:67c:15f4:a423::26 route to: 2001:67c:15f4:a423::26 destination: 2001:67c:15f4:a423::26 mask: ::::::: gateway: fe80::e648:4970:e85f:5e13%vlan701 interface: vlan701 if address: fe80::5606:33d8:d784:cd2f%vlan701 priority: 32 (ospf) flags: use mtuexpire 29464060 0 0 >From the second hop router, gr5$ route -n get -inet6 2001:67c:15f4:a423::26 route to: 2001:67c:15f4:a423::26 destination: 2001:67c:15f4:a423::26 mask: ::::::: gateway: fe80::d05a:f0e8:e5e:a30a%vlan740 interface: vlan740 if address: fe80::b769:5751:d87b:44b7%vlan740 priority: 32 (ospf) flags: use mtuexpire 20369017 0 0 If instead I source packets from the vlan interface (directly connected to the next router), I instead get this: $ traceroute6 -n -s 2a03:8920:1:52bd::184 www.google.com traceroute6 to www.google.com (2a00:1450:4009:80b::2004) from 2a03:8920:1:52bd::184, 64 hops max, 60 byte packets 1 2a03:8920:1:52bd::181 1.769 ms 0.382 ms 0.377 ms 2 * * * 3 * * * 4 2001:7f8:17::1b1b:1 6.931 ms 6.999 ms 7.115 ms 5 2001:7f8:17::3b41:1 6.466 ms 6.568 ms 6.416 ms The routes in this case from the hop 1 and 2 routers are gr1$ route -n get -inet6 2a03:8920:1:52bd::184 route to: 2a03:8920:1:52bd::184 destination: 2a03:8920:1:52bd::184 mask: ::::::: interface: vlan701 if address: 2a03:8920:1:52bd::181 priority: 3 () flags: use mtuexpire 22811 0 3 gr5$ route -n get -inet6 2a03:8920:1:52bd::184 route to: 2a03:8920:1:52bd::184 destination: 2a03:8920:1:52bd:: mask: ::::: gateway: fe80::d05a:f0e8:e5e:a30a%vlan740 interface: vlan740 if address: fe80::b769:5751:d87b:44b7%vlan740 priority: 32 (ospf) flags: use mtuexpire 234 0 0 So the first hop packet is returned from a "normal" address, and the second (from tcpdump) is returned from fe80::b769:5751:d87b:44b7 and of course doesn't make it all the way back to the host running traceroute. > Once I have have added the following rule from a newer RFC. It > makes things better for many caes, but not with OSPF6. There you > may have an interface with only link-local addresses. Then this > link-local is used instead of another better scoped one. I have global addresses on all interfaces involved, none of the involved interfaces just have link-local. > /* RFC 3484 5. Rule 5: Prefer outgoing interface */ > > > 4 2001:728:0:5000::55 7.843 ms 8.236 ms 7.391 ms > > How can this work? Does your AS-Bounda
vmd: rate-limit to avoid reboot loops
Hi, it sometimes happens that a VM is stuck in a reboot loop. This isn't very pleasent for vmd, so this diff attempts to introduce a hard rate-limit: if the VM rebooted after less than VM_START_RATE_SEC (6) seconds, increment a counter. If this happens VM_START_RATE_LIMIT (3) times in a row, stop the VM. The idea is that it might be desirable in some cases to reboot quickly (you're either really fast on the boot prompt, or you use something like grub that can automatically reboot into a previous kernel). But if this happens too often (more than 3 times), something is wrong and cannot be intended, not even in the worst Linux/grub/unikernel/... situation. These limits are a guessed default. Test case: I dd'ed random bytes to a kernel after some initial bytes, keeping the original size of the kernel. The boot loader loads the header, the complete kernel, tries to boot it and *boom*, reset ;) Comments? Concerns? Better ideas? OKs? Reyk Index: usr.sbin/vmd/config.c === RCS file: /cvs/src/usr.sbin/vmd/config.c,v retrieving revision 1.50 diff -u -p -u -p -r1.50 config.c --- usr.sbin/vmd/config.c 7 Aug 2018 14:49:05 - 1.50 +++ usr.sbin/vmd/config.c 5 Oct 2018 21:15:12 - @@ -187,6 +187,7 @@ config_setvm(struct privsep *ps, struct char ifname[IF_NAMESIZE], *s; char path[PATH_MAX]; unsigned int unit; + struct timeval tv, rate, since_last; errno = 0; @@ -204,6 +205,39 @@ config_setvm(struct privsep *ps, struct goto fail; } } + + /* +* Rate-limit the VM so that it cannot restart in a loop: +* if the VM restarts after less than VM_START_RATE_SEC seconds, +* we increment the limit counter. After VM_START_RATE_LIMIT +* of suchs fast reboots the VM is stopped. +*/ + getmonotime(&tv); + if (vm->vm_start_tv.tv_sec) { + timersub(&tv, &vm->vm_start_tv, &since_last); + + rate.tv_sec = VM_START_RATE_SEC; + rate.tv_usec = 0; + if (timercmp(&since_last, &rate, <)) + vm->vm_start_limit++; + else { + /* Reset counter */ + vm->vm_start_limit = 0; + } + + log_debug("%s: vm %u restarted after %lld.%ld seconds," + " limit %d/%d", __func__, vcp->vcp_id, since_last.tv_sec, + since_last.tv_usec, vm->vm_start_limit, + VM_START_RATE_LIMIT); + + if (vm->vm_start_limit >= VM_START_RATE_LIMIT) { + log_warnx("%s: vm %u restarted too quickly", + __func__, vcp->vcp_id); + errno = EPERM; + goto fail; + } + } + vm->vm_start_tv = tv; diskfds = reallocarray(NULL, vcp->vcp_ndisks, sizeof(*diskfds)); if (diskfds == NULL) { Index: usr.sbin/vmd/vmd.c === RCS file: /cvs/src/usr.sbin/vmd/vmd.c,v retrieving revision 1.102 diff -u -p -u -p -r1.102 vmd.c --- usr.sbin/vmd/vmd.c 29 Sep 2018 22:33:09 - 1.102 +++ usr.sbin/vmd/vmd.c 5 Oct 2018 21:15:12 - @@ -1918,3 +1918,14 @@ prefixlen2mask(uint8_t prefixlen) return (htonl(0x << (32 - prefixlen))); } + +void +getmonotime(struct timeval *tv) +{ + struct timespec ts; + + if (clock_gettime(CLOCK_MONOTONIC, &ts)) + fatal("clock_gettime"); + + TIMESPEC_TO_TIMEVAL(tv, &ts); +} Index: usr.sbin/vmd/vmd.h === RCS file: /cvs/src/usr.sbin/vmd/vmd.h,v retrieving revision 1.81 diff -u -p -u -p -r1.81 vmd.h --- usr.sbin/vmd/vmd.h 1 Oct 2018 09:31:15 - 1.81 +++ usr.sbin/vmd/vmd.h 5 Oct 2018 21:15:13 - @@ -54,6 +54,10 @@ #define VMD_SWITCH_TYPE"bridge" #define VM_DEFAULT_MEMORY 512 +/* Rate-limit fast reboots */ +#define VM_START_RATE_SEC 6 /* min. seconds since last reboot */ +#define VM_START_RATE_LIMIT3 /* max. number of fast reboots */ + /* default user instance limits */ #define VM_DEFAULT_USER_MAXCPU 4 #define VM_DEFAULT_USER_MAXMEM 2048 @@ -260,6 +264,10 @@ struct vmd_vm { int vm_receive_fd; struct vmd_user *vm_user; + /* For rate-limiting */ + struct timeval vm_ - Message truncated -
Re: Qcow2: External snapshots
On Wed, Oct 03, 2018 at 11:41:41PM -0700, Ori Bernstein wrote: > diff --git usr.sbin/vmd/config.c usr.sbin/vmd/config.c > index 550b73c1a39..68be738d304 100644 > --- usr.sbin/vmd/config.c > +++ usr.sbin/vmd/config.c > @@ -35,6 +35,7 @@ > #include > #include > #include > +#include > > #include "proc.h" > #include "vmd.h" > @@ -176,16 +177,21 @@ config_getreset(struct vmd *env, struct imsg *imsg) > int > config_setvm(struct privsep *ps, struct vmd_vm *vm, uint32_t peerid, uid_t > uid) > { > + int diskfds[VMM_MAX_DISKS_PER_VM][VM_MAX_BASE_PER_DISK]; > struct vmd_if *vif; > struct vmop_create_params *vmc = &vm->vm_params; > struct vm_create_params *vcp = &vmc->vmc_params; > - unsigned int i; > + unsigned int i, j; > int fd = -1, vmboot = 0; > - int kernfd = -1, *diskfds = NULL, *tapfds = NULL; > + int kernfd = -1; > + int *tapfds; keep tapfds = NULL or you might cause a segfault in the goto fail case... [snip] > if (tapfds != NULL) { > for (i = 0; i < vcp->vcp_nnics; i++) > close(tapfds[i]); ...here (same function). Reyk
Re: pf: honor quick on anchor rules
Hello, Sorry for the late reply. I just tested the fix with the latest snapshot and think the behaviour is still not correct: On 18-10-04 01:25:09, Klemens Nanni wrote: > On Sat, Sep 29, 2018 at 10:44:41PM +0200, Klemens Nanni wrote: > > `anchor quick' means "evaluate the rules inside but stop after that". If i read man correctly it means "evaluate the rules inside and stop if any rule within matched". With the current fix, rule evaluation stops after the first block, regardless of matching. Therefore the following currently always passes: snap# uname -a OpenBSD snap.my.domain 6.4 GENERIC#333 amd64 snap# pfctl -sr pass all flags S/SA anchor quick all { } block drop all 6.1 behaves as i would expect, it drops all. Instead of the empty block, any non-matching rule has the same effect. Regards, Fabian
Re: traceroute6 and ospf6d (icmp6 source addresses and link-locals)
IPv6 Source selection is a mess! > ICMP6 messages > are generated with a source of, I think, the local address associated with > the route to the recipient, It is not that simple. Look at in6_ifawithscope() in sys/netinet6/in6.c. /* * At this point, we have two cases: * 1. we are looking at a non-deprecated address, *and ia6_best is also non-deprecated. * 2. we are looking at a deprecated address, *and ia6_best is also deprecated. * Also, we do not have to consider a case where * the scope of if_best is larger(smaller) than dst and * the scope of the current address is smaller(larger) * than dst. Such a case has already been covered. * Tiebreaking is done according to the following * items: * - the scope comparison between the address and * dst (dscopecmp) * - the scope comparison between the address and * ia6_best (bscopecmp) * - if the address match dst longer than ia6_best * (matchcmp) * - if the address is on the outgoing I/F (outI/F) * * Roughly speaking, the selection policy is * - the most important item is scope. The same scope * is best. Then search for a larger scope. * Smaller scopes are the last resort. * - A deprecated address is chosen only when we have * no address that has an enough scope, but is * prefered to any addresses of smaller scopes. * - Longest address match against dst is considered * only for addresses that has the same scope of dst. * - If there is no other reasons to choose one, * addresses on the outgoing I/F are preferred. * * The precise decision table is as follows: * dscopecmp bscopecmp matchcmp outI/F | replace? *!equal equal N/AYes | Yes (1) *!equal equal N/A No | No (2) *largerlarger N/AN/A | No (3) *larger smaller N/AN/A | Yes (4) * smallerlarger N/AN/A | Yes (5) * smaller smaller N/AN/A | No (6) * equal smaller N/AN/A | Yes (7) * equallarger (already done) * equal equal largerN/A | Yes (8) * equal equal smallerN/A | No (9) * equal equalequalYes | Yes (a) * equal equalequal No | No (b) */ Could you provide your ifconfig and route output? So we could figure out into which path of this algorith you are running. Once I have have added the following rule from a newer RFC. It makes things better for many caes, but not with OSPF6. There you may have an interface with only link-local addresses. Then this link-local is used instead of another better scoped one. /* RFC 3484 5. Rule 5: Prefer outgoing interface */ > 4 2001:728:0:5000::55 7.843 ms 8.236 ms 7.391 ms How can this work? Does your AS-Boundary Router do NAT? > What's anyone else doing? Just living with it or has anyone figured a way > to make it nicer? I'd like to reply with either a global scope address for > the interface, or a loopback address, We have implemented more or less a very old RFC. There are two newer RFCs with different algorithms. There is recommendation to store policies from user-land into the kernel for address selection. I have just looked at FreeBSD in6_ifawithifp(), it is quite simple. Perhaps this is a way to go. > I didn't get anywhere with PF > translation though. pf with IPv6 link-local addresses does not work properly. I think it cannot parse the %if suffixes. The KAME hack scope id is not handled. bluhm
traceroute6 and ospf6d (icmp6 source addresses and link-locals)
With ospf6d, routes are added using link-local addresses. ICMP6 messages are generated with a source of, I think, the local address associated with the route to the recipient, so with a couple of hops in the internal network it results in traceroutes looking like $ traceroute6 -n www.google.com traceroute6 to www.google.com (2a00:1450:4009:80f::2004), 64 hops max, 60 byte packets 1 fe80::5606:33d8:d784:cd2f%vlan701 (fe80::5606:33d8:d784:cd2f%vlan701) 0.519 ms 1.547 ms 0.417 ms 2 * * * 3 * * * 4 2001:728:0:5000::55 7.843 ms 8.236 ms 7.391 ms [...] The first hop works (ugliness aside) because the link-local is directly connected, the next ones don't because the ICMP response are sourced from a link-local addresses (to match the route to the destination) but these aren't allowed to be forwarded by routers. (And once packets have left the AS, things work as expected on global addresses). What's anyone else doing? Just living with it or has anyone figured a way to make it nicer? I'd like to reply with either a global scope address for the interface, or a loopback address, I didn't get anywhere with PF translation though.
Re: odd condition/test in PF lexer
Todd C. Miller wrote: > On Fri, 05 Oct 2018 00:37:33 +0200, Alexandr Nedvedicky wrote: > > > because earlier line at 5279 grants the variable c holds backslash, > > therefore it can't contain space or tab. The simple change is tempting, > > but let's check the history first. That particular line has been > > introduced 10+ years ago with commit message as follows: > > > > in the lex... even inside quotes, a \ followed by space or tab should > > expand to space or tab, and a \ followed by newline should be ignored > > (as a line continuation). compatible with the needs of hoststated > > (which has the most strict quoted string requirements), and ifstated > > (where one commonly does line continuations in strings). > > > > Comment above makes me thinking the intended change looks as follows: > > > > 5282 if (next == quotec || next == ' ' || next == '\t') > > I agree with your analysis. Thank you for taking the time to > determine the author's intent instead of blindly trusting the static > analyzer. It looks obviously correct. Still, since we are in the release window, I would like to know what the regression tests think of this. It took so long for someone to hit this lex error, so it probably should be punted post-release regardless.
Re: odd condition/test in PF lexer
On Fri, 05 Oct 2018 00:37:33 +0200, Alexandr Nedvedicky wrote: > because earlier line at 5279 grants the variable c holds backslash, > therefore it can't contain space or tab. The simple change is tempting, > but let's check the history first. That particular line has been > introduced 10+ years ago with commit message as follows: > > in the lex... even inside quotes, a \ followed by space or tab should > expand to space or tab, and a \ followed by newline should be ignored > (as a line continuation). compatible with the needs of hoststated > (which has the most strict quoted string requirements), and ifstated > (where one commonly does line continuations in strings). > > Comment above makes me thinking the intended change looks as follows: > > 5282 if (next == quotec || next == ' ' || next == '\t') I agree with your analysis. Thank you for taking the time to determine the author's intent instead of blindly trusting the static analyzer. - todd
top cpu stats are wrong with hyper threading
Hi, Due to the SMT stuff the output of top showed the first few cpus instead of the ones that are actually active. To reproduce the bad output: Use a machine with hyper therading, top should show half the cpus, of which every second is disabled. The following diff skips the disabled cpus and disabling/reenabling the cores with hw.smt also works. The only problem is that the lines reserved for cpu-stats does not change with reenabling. Refreshing the output with '1' or resizing the window should help. Thanks, Moritz Buhl Index: display.c === RCS file: /home/mbuhl/cvs/src/usr.bin/top/display.c,v retrieving revision 1.55 diff -u -p -r1.55 display.c --- display.c 26 Sep 2018 17:23:13 - 1.55 +++ display.c 5 Oct 2018 15:19:00 - @@ -381,7 +381,7 @@ cpustates_tag(int cpu) void i_cpustates(int64_t *ostates, int *online) { - int i, first, cpu, ncpuonline; + int i, first, cpu, ncpuonline, off; double value; int64_t *states; char **names, *thisname; @@ -434,15 +434,18 @@ i_cpustates(int64_t *ostates, int *onlin } return; } + off = 0; for (cpu = 0; cpu < ncpu; cpu++) { /* now walk thru the names and print the line */ names = cpustate_names; first = 0; states = ostates + (CPUSTATES * cpu); - if ((screen_length > 2 + cpu && 2 + cpu < y_mem) || + if (y_mem == ncpuonline + 2 && !online[cpu]) + continue; + if ((screen_length > 2 + cpu && 2 + off < y_mem) || !smart_terminal) { - move(2 + cpu, 0); + move(2 + off, 0); clrtoeol(); addstrp(cpustates_tag(cpu)); @@ -465,6 +468,7 @@ i_cpustates(int64_t *ostates, int *onlin } putn(); } + off++; } }
landisk: fix up MD swap64 function
Provide an MD 64-bit byteswapping function build on 32-bit swaps as we do on arm and i386. Copied from arm. If there are no MD byteswapping functions, MI macros are used. These are wrapped by static inline functions to prevent multiple evaluation of their argument. If there are MD functions, they are used directly and therefore we must implicitly guarantee that they are safe from multiple evaluation. Defining an MD function to an MI macro breaks this promise. OK? Index: arch/sh/include/endian.h === RCS file: /cvs/src/sys/arch/sh/include/endian.h,v retrieving revision 1.6 diff -u -p -r1.6 endian.h --- arch/sh/include/endian.h2 Oct 2018 21:30:44 - 1.6 +++ arch/sh/include/endian.h5 Oct 2018 13:17:00 - @@ -31,7 +31,16 @@ __swap32md(__uint32_t _x) return (_rv); } -#define__swap64md __swap64gen +static __inline __uint64_t +__swap64md(__uint64_t _x) +{ + __uint64_t _rv; + + _rv = (__uint64_t)__swap32md(_x >> 32) | + (__uint64_t)__swap32md(_x) << 32; + + return (_rv); +} /* Tell sys/endian.h we have MD variants of the swap macros. */ #define __HAVE_MD_SWAP -- Christian "naddy" Weisgerber na...@mips.inka.de
vmd: servicing virtio devices from separate processes
Hi, I have an idea in mind that I'd like to share to ask you if you think it's worth giving it a try. Right now, vmd already features an excellent privsep model to ensure the process servicing the VM requests to the outside world is running with the lowest possible privileges. I was wondering if we could take a step further, servicing each virtio device from a different process. This design would simplify the implementation and maintenance of those devices, improve the privsep model and increase the resilience of VMs (the crash of a process servicing a device won't bring down the whole VM, and a mechanism to recover from this scenario could be explored). Doing this in an efficient way requires: - The ability to receive virtqueue notifications directly on the process. I've already sent an RFC patch for this (see "Lightweight mechanism to kick virtio VQs"), as it'd be useful for the non-separated model too. - An in-kernel IRQ chip. This one is the most controversial, as it means emulating a device from a privileged domain, but I'm pretty sure a lapic implementation with enough functionality to serve *BSD/Linux Guests can be small and simple enough to be easily auditable. - A way to map the VM memory into a third process context. Can uvm_share for this? Here's also the opportunity to explore options to avoid mapping the whole VM memory, though that'll possibly require functionality non covered by the virtio specs. Do you think it's worth exploring this model? What are feelings regarding the in-kernel IRQ chip? Sergio (slp).
[RFC PATCH] [vmm] Lightweight mechanism to kick virtio VQs
Hi, This patch implements a mechanism to allow users register an I/O port with a special file descriptor (kickfd) which can be monitored for events using kevent. The kernel will note an event each time the Guest writes to an I/O port registered with a kickfd. This is mainly intended for kicking virtio virtqueues, and allows the VCPU to return immediately to the Guest, instead of having to exit all the way to vmd to process the request. With kickfd, the actual work (i.e. an I/O operation) requested by the Guest can be done in a parallel thread that eventually asserts the corresponding IRQ. In a general sense, this improves the I/O performance when the Guest is issuing asynchronous operations (or its virtio implementation is able to access the virtqueue asynchronously) and makes it more responsive (as the VCPU spends less time outside the Guest), but it may hurt a bit when the Guest is actively waiting for the operation to finish. This is also a first towards the possibility of having virtio devices running on separate processes, but I'll write more about this on another email thread. I deliberately omitted the i386 implementation to make this easier to review. Sergio (slp). Index: arch/amd64/amd64/vmm.c === RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v retrieving revision 1.220 diff -u -p -r1.220 vmm.c --- arch/amd64/amd64/vmm.c 20 Sep 2018 14:32:59 - 1.220 +++ arch/amd64/amd64/vmm.c 5 Oct 2018 09:54:34 - @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -63,6 +66,16 @@ void *l1tf_flush_region; #define VMX_EXIT_INFO_COMPLETE \ (VMX_EXIT_INFO_HAVE_RIP | VMX_EXIT_INFO_HAVE_REASON) +struct kickfd { + uint16_t ioport; + struct klist note; + struct vm*vm; + + SLIST_ENTRY(kickfd) kickfd_link; +}; + +SLIST_HEAD(kickfd_head, kickfd); + struct vm { vm_map_t vm_map; uint32_t vm_id; @@ -77,6 +90,9 @@ struct vm { u_intvm_vcpus_running; struct rwlockvm_vcpu_lock; + struct kickfd_head vm_kickfd_list; + struct rwlockvm_kickfd_lock; + SLIST_ENTRY(vm) vm_link; }; @@ -122,6 +138,7 @@ int vm_get_info(struct vm_info_params *) int vm_resetcpu(struct vm_resetcpu_params *); int vm_intr_pending(struct vm_intr_params *); int vm_rwregs(struct vm_rwregs_params *, int); +int vm_register_kickfd(struct vm_regkickfd_params *, struct proc *); int vm_find(uint32_t, struct vm **); int vcpu_readregs_vmx(struct vcpu *, uint64_t, struct vcpu_reg_state *); int vcpu_readregs_svm(struct vcpu *, uint64_t, struct vcpu_reg_state *); @@ -281,6 +298,7 @@ const struct { /* Pools for VMs and VCPUs */ struct pool vm_pool; struct pool vcpu_pool; +struct pool kickfd_pool; struct vmm_softc *vmm_softc; @@ -419,6 +437,8 @@ vmm_attach(struct device *parent, struct "vmpool", NULL); pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK, "vcpupl", NULL); + pool_init(&kickfd_pool, sizeof(struct kickfd), 64, IPL_NONE, PR_WAITOK, + "kickfdpl", NULL); vmm_softc = sc; } @@ -482,6 +502,9 @@ vmmioctl(dev_t dev, u_long cmd, caddr_t case VMM_IOC_WRITEREGS: ret = vm_rwregs((struct vm_rwregs_params *)data, 1); break; + case VMM_IOC_REGKICKFD: + ret = vm_register_kickfd((struct vm_regkickfd_params *)data, p); + break; default: DPRINTF("%s: unknown ioctl code 0x%lx\n", __func__, cmd); ret = ENOTTY; @@ -513,6 +536,7 @@ pledge_ioctl_vmm(struct proc *p, long co case VMM_IOC_INTR: case VMM_IOC_READREGS: case VMM_IOC_WRITEREGS: + case VMM_IOC_REGKICKFD: return (0); } @@ -725,6 +749,176 @@ vm_rwregs(struct vm_rwregs_params *vrwp, } } +void +filt_vmmkickfd_detach(struct knote *kn) +{ + struct kickfd *kickfd = kn->kn_fp->f_data; + + rw_enter_write(&kickfd->vm->vm_kickfd_lock); + SLIST_REMOVE(&kickfd->note, kn, knote, kn_selnext); + rw_exit_write(&kickfd->vm->vm_kickfd_lock); +} + +int +filt_vmmkickfd_read(struct knote *kn, long hint) +{ + return(1); +} + +struct filterops vmmkickfd_filtops = + { 1, NULL, filt_vmmkickfd_detach, filt_vmmkickfd_read }; + +int +vmmkickfd_read(struct file *fp, struct uio *uio, int fflags) +{ + return (EINVAL); +} + +int +vmmkickfd_write(struct file *fp, struct uio *uio, int fflags) +{ + return (EINVAL); +} + +int +vmmkickfd_ioctl(struct file *fp, u_long cmd, caddr_t data, struct proc *p) +{ + return (ENOTTY); +} + +int +vmmkickfd_poll(struct file *fp, int events, struct proc *p) +{ + return (EINVAL); +} + +int +vmmkickfd_
Re: do not join node information multicast group
Florian Obser(flor...@openbsd.org) on 2018.10.04 20:13:03 +0200: > Benno removed code to answer ICMP queries over 4 years ago. > Aham Brahmasmi (aham.brahmasmi AT gmx.com) points out > that we still joine the group though. > > OK? ok benno@ > diff --git in6.c in6.c > index c09ab1dcd0a..5297c0a1249 100644 > --- in6.c > +++ in6.c > @@ -808,19 +808,6 @@ in6_update_ifa(struct ifnet *ifp, struct in6_aliasreq > *ifra, > goto cleanup; > LIST_INSERT_HEAD(&ia6->ia6_memberships, imm, i6mm_chain); > > - /* > - * join node information group address > - */ > - if (in6_nigroup(ifp, hostname, hostnamelen, &mltaddr) == 0) { > - imm = in6_joingroup(ifp, &mltaddr.sin6_addr, &error); > - if (!imm) { > - /* XXX not very fatal, go on... */ > - } else { > - LIST_INSERT_HEAD(&ia6->ia6_memberships, > - imm, i6mm_chain); > - } > - } > - > /* >* join interface-local all-nodes address. >* (ff01::1%ifN, and ff01::%ifN/32) > diff --git in6_ifattach.c in6_ifattach.c > index 2f8463e3a47..b6e67a5eee7 100644 > --- in6_ifattach.c > +++ in6_ifattach.c > @@ -428,57 +428,6 @@ in6_ifattach_loopback(struct ifnet *ifp) > return (in6_update_ifa(ifp, &ifra, NULL)); > } > > -/* > - * compute NI group address, based on the current hostname setting. > - * see draft-ietf-ipngwg-icmp-name-lookup-* (04 and later). > - * > - * when ifp == NULL, the caller is responsible for filling scopeid. > - */ > -int > -in6_nigroup(struct ifnet *ifp, const char *name, int namelen, > -struct sockaddr_in6 *sa6) > -{ > - const char *p; > - u_int8_t *q; > - SHA2_CTX ctx; > - u_int8_t digest[SHA512_DIGEST_LENGTH]; > - u_int8_t l; > - u_int8_t n[64]; /* a single label must not exceed 63 chars */ > - > - if (!namelen || !name) > - return -1; > - > - p = name; > - while (p && *p && *p != '.' && p - name < namelen) > - p++; > - if (p - name > sizeof(n) - 1) > - return -1; /* label too long */ > - l = p - name; > - strncpy((char *)n, name, l); > - n[(int)l] = '\0'; > - for (q = n; *q; q++) { > - if ('A' <= *q && *q <= 'Z') > - *q = *q - 'A' + 'a'; > - } > - > - /* generate 8 bytes of pseudo-random value. */ > - SHA512Init(&ctx); > - SHA512Update(&ctx, &l, sizeof(l)); > - SHA512Update(&ctx, n, l); > - SHA512Final(digest, &ctx); > - > - bzero(sa6, sizeof(*sa6)); > - sa6->sin6_family = AF_INET6; > - sa6->sin6_len = sizeof(*sa6); > - sa6->sin6_addr.s6_addr16[0] = htons(0xff02); > - sa6->sin6_addr.s6_addr16[1] = htons(ifp->if_index); > - sa6->sin6_addr.s6_addr8[11] = 2; > - memcpy(&sa6->sin6_addr.s6_addr32[3], digest, > - sizeof(sa6->sin6_addr.s6_addr32[3])); > - > - return 0; > -} > - > /* > * XXX multiple loopback interface needs more care. for instance, > * nodelocal address needs to be configured onto only one of them. > diff --git in6_ifattach.h in6_ifattach.h > index 0f54b457de9..525cc365ffe 100644 > --- in6_ifattach.h > +++ in6_ifattach.h > @@ -36,7 +36,6 @@ > #ifdef _KERNEL > int in6_ifattach(struct ifnet *); > void in6_ifdetach(struct ifnet *); > -int in6_nigroup(struct ifnet *, const char *, int, struct sockaddr_in6 *); > int in6_ifattach_linklocal(struct ifnet *, struct in6_addr *); > void in6_soiiupdate(struct ifnet *); > #endif /* _KERNEL */ > > > -- > I'm not entirely sure you are real. >
Re: Qcow2: External snapshots
On Wed, Oct 03, 2018 at 11:41:41PM -0700, Ori Bernstein wrote: > Thanks, another update based on Reyk's feeback and fixes. > You missed one thing: jmc@'s manpage comments. For everything else: Looks good! Tests work fine. OK reyk@ Reyk > diff --git regress/usr.sbin/vmd/diskfmt/Makefile > regress/usr.sbin/vmd/diskfmt/Makefile > index c2a5f42d5f6..1f8673e0e26 100644 > --- regress/usr.sbin/vmd/diskfmt/Makefile > +++ regress/usr.sbin/vmd/diskfmt/Makefile > @@ -11,7 +11,7 @@ > VMD_DIR=$(BSDSRCDIR)/usr.sbin/vmd/ > > PROG=vioscribble > -SRCS=vioscribble.c $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c > +SRCS=vioscribble.c vioqcow2.c vioraw.c > CFLAGS+=-I$(VMD_DIR) -pthread > LDFLAGS+=-pthread > > @@ -26,3 +26,6 @@ scribble-images: > .PHONY: ${REGRESS_TARGETS} scribble-images > > .include > + > +vioqcow2.c vioraw.c: $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c > + cp $(VMD_DIR)/vioqcow2.c $(VMD_DIR)/vioraw.c . > diff --git regress/usr.sbin/vmd/diskfmt/vioscribble.c > regress/usr.sbin/vmd/diskfmt/vioscribble.c > index 14d720db652..1da8efedac7 100644 > --- regress/usr.sbin/vmd/diskfmt/vioscribble.c > +++ regress/usr.sbin/vmd/diskfmt/vioscribble.c > @@ -122,16 +122,18 @@ main(int argc, char **argv) > verbose = !!getenv("VERBOSE"); > qcfd = open("scribble.qc2", O_RDWR); > rawfd = open("scribble.raw", O_RDWR); > - if (qcfd == -1 || virtio_init_qcow2(&qcowfile, &qcsz, qcfd) == -1) > + if (qcfd == -1) > err(1, "unable to open qcow"); > - if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, rawfd) == -1) > + if (virtio_init_qcow2(&qcowfile, &qcsz, &qcfd, 1) == -1) > + err(1, "unable to init qcow"); > + if (rawfd == -1 || virtio_init_raw(&rawfile, &rawsz, &rawfd, 1) == -1) > err(1, "unable to open raw"); > > srandom_deterministic(123); > > /* scribble to both disks */ > printf("scribbling...\n"); > - for (i = 0; i < 16; i++) { > + for (i = 0; i < 1024*16; i++) { > off = (random() % DISKSZ); > len = random() % sizeof buf + 1; > fill(off, buf, sizeof buf); > diff --git usr.sbin/vmctl/main.c usr.sbin/vmctl/main.c > index 8748ecfdedc..a3ab4672370 100644 > --- usr.sbin/vmctl/main.c > +++ usr.sbin/vmctl/main.c > @@ -67,7 +67,8 @@ int ctl_receive(struct parse_result *, int, char > *[]); > > struct ctl_command ctl_commands[] = { > { "console",CMD_CONSOLE,ctl_console,"id" }, > - { "create", CMD_CREATE, ctl_create, "\"path\" -s size", 1 }, > + { "create", CMD_CREATE, ctl_create, > + "\"path\" [-s size] [-b base]", 1 }, > { "load", CMD_LOAD, ctl_load, "\"path\"" }, > { "log",CMD_LOG,ctl_log,"[verbose|brief]" }, > { "reload", CMD_RELOAD, ctl_reload, "" }, > @@ -538,47 +539,55 @@ int > ctl_create(struct parse_result *res, int argc, char *argv[]) > { > int ch, ret, type; > - const char *paths[2], *disk, *format; > + const char *disk, *format, *base; > > if (argc < 2) > ctl_usage(res->ctl); > > + base = NULL; > type = parse_disktype(argv[1], &disk); > > - paths[0] = disk; > - paths[1] = NULL; > - > - if (unveil(paths[0], "rwc") == -1) > + if (pledge("stdio rpath wpath cpath unveil", NULL) == -1) > + err(1, "pledge"); > + if (unveil(disk, "rwc") == -1) > err(1, "unveil"); > > - if (pledge("stdio rpath wpath cpath", NULL) == -1) > - err(1, "pledge"); > argc--; > argv++; > > - while ((ch = getopt(argc, argv, "s:")) != -1) { > + while ((ch = getopt(argc, argv, "s:b:")) != -1) { > switch (ch) { > case 's': > if (parse_size(res, optarg, 0) != 0) > errx(1, "invalid size: %s", optarg); > break; > + case 'b': > + base = optarg; > + if (unveil(base, "r") == -1) > + err(1, "unveil"); > + break; > default: > ctl_usage(res->ctl); > /* NOTREACHED */ > } > } > + if (unveil(NULL, NULL)) > + err(1, "unveil"); > > - if (res->size == 0) { > - fprintf(stderr, "missing size argument\n"); > + if (base && type != VMDF_QCOW2) > + errx(1, "base images require qcow2 disk format"); > + if (res->size == 0 && !base) { > + fprintf(stderr, "could not create %s: missing size argument\n", > + disk); > ctl_usage(res->ctl); > } > > if (type == VMDF_QCOW2) { > format = "qcow2"; > - ret = create_qc2_imagefile(paths[0], res->size); > + ret = create_qc2_imagefile(disk, base, res->size); >