Re: Special casing fd's opened before pledge(2)

2017-01-23 Thread Theo de Raadt
> There has been developer pressure to permit an increasing number of
> ioctl's to pledged programs.  The problem is that providing a specific
> ioctl under a promise to one program, means it becomes supplied to all
> other programs that make that promise.  There is no discrete method
> to differentiate further, until now.
> 
> This proposal annotates file descriptors allocated before the first
> pledge(2) call in a process, with a marker.  Such file descriptors can
> be dup'd; the mark is retained.  They can fdpassed, which also retains
> the mark in the receiver (therefore, a non-pledged process can feed a
> pledged process).  Execve clears this flag.
> 
> That is the proposed semantic.  There are still a few glitches.
> 
> Once that mark exists, I feel more comfortable adding additional ioctl's
> which demand early-open fd's.  Specifically in privsep daemons.
> 
> 5 programs have been discovered in the tree which needed small changes
> (csh, ksh, less, tmux, dhclient).
> 
> less and tmux fixes are not commited, be cautious about tmux.

New version.

Index: sys/filedesc.h
===
RCS file: /cvs/src/sys/sys/filedesc.h,v
retrieving revision 1.30
diff -u -p -u -r1.30 filedesc.h
--- sys/filedesc.h  6 May 2015 08:52:17 -   1.30
+++ sys/filedesc.h  23 Jan 2017 16:00:14 -
@@ -105,6 +105,7 @@ struct filedesc0 {
  * Per-process open flags.
  */
 #defineUF_EXCLOSE  0x01/* auto-close on exec */
+#defineUF_PLEDGED  0x02/* opened after pledge(2) */
 
 /*
  * Flags on the file descriptor table.
@@ -121,7 +122,7 @@ struct filedesc0 {
  * Kernel global variables and routines.
  */
 void   filedesc_init(void);
-intdupfdopen(struct filedesc *, int, int, int);
+intdupfdopen(struct proc *p, struct filedesc *, int, int);
 intfdalloc(struct proc *p, int want, int *result);
 void   fdexpand(struct proc *);
 intfalloc(struct proc *p, struct file **resultfp, int *resultfd);
Index: sys/pledge.h
===
RCS file: /cvs/src/sys/sys/pledge.h,v
retrieving revision 1.30
diff -u -p -u -r1.30 pledge.h
--- sys/pledge.h23 Jan 2017 04:25:05 -  1.30
+++ sys/pledge.h23 Jan 2017 11:40:58 -
@@ -126,7 +126,7 @@ int pledge_adjtime(struct proc *p, const
 intpledge_sendit(struct proc *p, const void *to);
 intpledge_sockopt(struct proc *p, int set, int level, int optname);
 intpledge_socket(struct proc *p, int domain, int state);
-intpledge_ioctl(struct proc *p, long com, struct file *);
+intpledge_ioctl(struct proc *p, long com, struct file *, int, int);
 intpledge_ioctl_drm(struct proc *p, long com, dev_t device);
 intpledge_ioctl_vmm(struct proc *p, long com);
 intpledge_flock(struct proc *p);
Index: sys/unpcb.h
===
RCS file: /cvs/src/sys/sys/unpcb.h,v
retrieving revision 1.12
diff -u -p -u -r1.12 unpcb.h
--- sys/unpcb.h 28 Aug 2015 04:38:47 -  1.12
+++ sys/unpcb.h 22 Jan 2017 09:19:54 -
@@ -86,17 +86,22 @@ struct  unpcb {
 #definesotounpcb(so)   ((struct unpcb *)((so)->so_pcb))
 
 #ifdef _KERNEL
+struct fdpass {
+   struct file *fp;
+   int flags;
+};
+
 intunp_attach(struct socket *);
 intunp_bind(struct unpcb *, struct mbuf *, struct proc *);
 intunp_connect(struct socket *, struct mbuf *, struct proc *);
 intunp_connect2(struct socket *, struct socket *);
 void   unp_detach(struct unpcb *);
-void   unp_discard(struct file **, int);
+void   unp_discard(struct fdpass *, int);
 void   unp_disconnect(struct unpcb *);
 void   unp_drop(struct unpcb *, int);
 void   unp_gc(void *);
-void   unp_mark(struct file **, int);
-void   unp_scan(struct mbuf *, void (*)(struct file **, int));
+void   unp_mark(struct fdpass *, int);
+void   unp_scan(struct mbuf *, void (*)(struct fdpass *, int));
 void   unp_shutdown(struct unpcb *);
 intunp_externalize(struct mbuf *, socklen_t, int);
 intunp_internalize(struct mbuf *, struct proc *);
Index: kern/kern_descrip.c
===
RCS file: /cvs/src/sys/kern/kern_descrip.c,v
retrieving revision 1.136
diff -u -p -u -r1.136 kern_descrip.c
--- kern/kern_descrip.c 24 Sep 2016 18:39:17 -  1.136
+++ kern/kern_descrip.c 23 Jan 2017 16:01:59 -
@@ -611,7 +611,7 @@ finishdup(struct proc *p, struct file *f
FREF(oldfp);
 
fdp->fd_ofiles[new] = fp;
-   fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
+   fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & 
~(UF_EXCLOSE|UF_PLEDGED);
fp->f_count++;
FRELE(fp, p);
if (dup2 && oldfp == NULL)
@@ -632,6 +632,7 @@ fdremove(struct filedesc *fdp, int fd)
 {
fdpassertlocked(fdp);
fdp->fd_ofiles[fd] = NULL;
+   fdp->fd_ofileflags[fd] = 0;

Re: less

2017-01-23 Thread Sebastian Benoit
Theo de Raadt(dera...@openbsd.org) on 2017.01.22 22:01:22 -0700:
> This change causes less(1) to open /dev/tty slightly earlier,
> or fallback to stderr.
> 
> As a result the TIOCGWINSZ operation can be done against the most
> likely tty descriptor.
> 
> As a side effect, that tty descriptor will be opened before the
> first pledge call.
> 
> Anyone see a downside?

no, ok

> 
> Index: usr.bin/less/main.c
> ===
> RCS file: /cvs/src/usr.bin/less/main.c,v
> retrieving revision 1.35
> diff -u -p -u -r1.35 main.c
> --- usr.bin/less/main.c   17 Sep 2016 15:06:41 -  1.35
> +++ usr.bin/less/main.c   21 Jan 2017 06:10:53 -
> @@ -90,6 +90,8 @@ main(int argc, char *argv[])
>   if (s != NULL && *s != '\0')
>   secure = 1;
>  
> + open_getchr();
> +
>   if (secure) {
>   if (pledge("stdio rpath wpath tty", NULL) == -1) {
>   perror("pledge");
> @@ -225,7 +227,6 @@ main(int argc, char *argv[])
>   if (missing_cap && !know_dumb)
>   error("WARNING: terminal is not fully functional", NULL);
>   init_mark();
> - open_getchr();
>  
>   if (secure)
>   if (pledge("stdio rpath tty", NULL) == -1) {
> Index: usr.bin/less/screen.c
> ===
> RCS file: /cvs/src/usr.bin/less/screen.c,v
> retrieving revision 1.24
> diff -u -p -u -r1.24 screen.c
> --- usr.bin/less/screen.c 8 Jul 2016 15:23:44 -   1.24
> +++ usr.bin/less/screen.c 21 Jan 2017 12:06:22 -
> @@ -181,7 +181,7 @@ scrsize(void)
>  #define  DEF_SC_WIDTH80
>  #define  DEF_SC_HEIGHT   24
>  
> - if (ioctl(2, TIOCGWINSZ, ) == 0) {
> + if (ioctl(tty, TIOCGWINSZ, ) == 0) {
>   if (w.ws_row > 0)
>   sys_height = w.ws_row;
>   if (w.ws_col > 0)
> 



network booting efi systems with dhcpd(8)

2017-01-23 Thread Jonathan Matthew
For network boot clients, dhcpd(8) can supply a filename for the initial
boot file for the client, which is something like pxeboot (or pxelinux.0).
EFI and BIOS clients need different boot files, though, so the server
needs to know what mode the client is booting in, in order to supply the
right filename.  RFC 4578 defines DHCP client option 93 for this purpose.

The ISC dhcpd approach to using this looks something like this:

  option arch code 93 = unsigned integer 16;

  if option arch = 00:00 {
filename "bios/pxelinux.0";
  } elsif  option arch = 00:07 {
 filename "efi.x64/syslinux.efi";
  }

which seems a bit complicated, and also a lot of work to implement.  Instead
I propose adding 'efi-filename' (and 'efi32-filename', though I'm not sure
that's worth doing) next to the existing 'filename' statement and having
dhcpd itself interpret the option values to figure out which one should be
used.

I don't actually need this yet because we don't have many EFI-only client
machines, so we can just run everything in legacy mode, but some day this
may change.

Does this seem reasonable?

Index: conflex.c
===
RCS file: /cvs/src/usr.sbin/dhcpd/conflex.c,v
retrieving revision 1.16
diff -u -p -u -p -r1.16 conflex.c
--- conflex.c   6 Feb 2016 23:50:10 -   1.16
+++ conflex.c   23 Jan 2017 08:09:15 -
@@ -310,6 +310,8 @@ static const struct keywords {
{ "dynamic-bootp",  TOK_DYNAMIC_BOOTP },
{ "dynamic-bootp-lease-cutoff", TOK_DYNAMIC_BOOTP_LEASE_CUTOFF 
},
{ "dynamic-bootp-lease-length", TOK_DYNAMIC_BOOTP_LEASE_LENGTH 
},
+   { "efi-filename",   TOK_EFI64_FILENAME },
+   { "efi32-filename", TOK_EFI32_FILENAME },
{ "ends",   TOK_ENDS },
{ "ethernet",   TOK_ETHERNET },
{ "filename",   TOK_FILENAME },
Index: confpars.c
===
RCS file: /cvs/src/usr.sbin/dhcpd/confpars.c,v
retrieving revision 1.28
diff -u -p -u -p -r1.28 confpars.c
--- confpars.c  17 Aug 2016 00:55:33 -  1.28
+++ confpars.c  23 Jan 2017 08:09:15 -
@@ -380,6 +380,14 @@ parse_statement(FILE *cfile, struct grou
group->filename = parse_string(cfile);
break;
 
+   case TOK_EFI32_FILENAME:
+   group->efi32_filename = parse_string(cfile);
+   break;
+
+   case TOK_EFI64_FILENAME:
+   group->efi64_filename = parse_string(cfile);
+   break;
+
case TOK_SERVER_NAME:
group->server_name = parse_string(cfile);
break;
Index: dhcp.c
===
RCS file: /cvs/src/usr.sbin/dhcpd/dhcp.c,v
retrieving revision 1.52
diff -u -p -u -p -r1.52 dhcp.c
--- dhcp.c  24 Oct 2016 21:05:55 -  1.52
+++ dhcp.c  23 Jan 2017 08:09:15 -
@@ -699,6 +699,35 @@ nak_lease(struct packet *packet, struct 
outgoing.packet_length, from, , NULL);
 }
 
+static const char *
+group_filename(struct group *group, struct packet *packet)
+{
+   u_int16_t arch, *opt;
+   int i;
+
+   opt = (u_int16_t *)packet->options[DHO_DHCP_CLIENT_ARCHITECTURE].data;
+   for (i = 0; i < packet->options[DHO_DHCP_CLIENT_ARCHITECTURE].len;
+   i += 2) {
+   arch = betoh16(*opt++);
+   switch (arch) {
+   case DCA_EFI_IA32:
+   if (group->efi32_filename != NULL)
+   return group->efi32_filename;
+   break;
+   
+   case DCA_EFI_BC:
+   case DCA_EFI_X86_64:
+   if (group->efi64_filename != NULL)
+   return group->efi64_filename;
+   break;
+   default:
+   break;
+   }
+   }
+
+   return group->filename;
+}
+
 void
 ack_lease(struct packet *packet, struct lease *lease, unsigned int offer,
 time_t when)
@@ -707,6 +736,7 @@ ack_lease(struct packet *packet, struct 
struct lease_state *state;
time_t lease_time, offered_lease_time, max_lease_time, 
default_lease_time;
struct class *vendor_class, *user_class;
+   const char *filename;
int ulafdr, i;
 
/* If we're already acking this lease, don't do it again. */
@@ -808,21 +838,20 @@ ack_lease(struct packet *packet, struct 
 * Choose a filename; first from the host_decl, if any, then from
 * the user class, then from the vendor class.
 */
-   if (lease->host && lease->host->group->filename)
-   strlcpy(state->filename, lease->host->group->filename,
-   sizeof state->filename);
-   else if (user_class && 

Re: network booting efi systems with dhcpd(8)

2017-01-23 Thread Jiri B
> > The ISC dhcpd approach to using this looks something like this:
> > 
> >   option arch code 93 = unsigned integer 16;
> > 
> >   if option arch = 00:00 {
> > filename "bios/pxelinux.0";
> >   } elsif  option arch = 00:07 {
> >  filename "efi.x64/syslinux.efi";
> >   }
> 
> [...] IIUC OpenBSD dhcpd doesn't support conditionals...

Ugly workaround is to use rewrite via tftpd's socket
but one has to known IP address in advance.

 -r socket
 Issue filename rewrite requests to the specified UNIX domain
 socket.  tftpd will write lines in the format "IP OP filename",
 terminated by a newline, where IP is the client's IP address, and
 OP is one of "read" or "write".  tftpd expects replies in the
 format "filename" terminated by a newline.  All rewrite requests
 from the daemon must be answered (even if it is with the original
 filename) before the TFTP request will continue.  By default
 tftpd does not use filename rewriting.

j.



Re: network booting efi systems with dhcpd(8)

2017-01-23 Thread Jiri B
On Mon, Jan 23, 2017 at 08:39:17PM +1000, Jonathan Matthew wrote:
> For network boot clients, dhcpd(8) can supply a filename for the initial
> boot file for the client, which is something like pxeboot (or pxelinux.0).
> EFI and BIOS clients need different boot files, though, so the server
> needs to know what mode the client is booting in, in order to supply the
> right filename.  RFC 4578 defines DHCP client option 93 for this purpose.
> 
> The ISC dhcpd approach to using this looks something like this:
> 
>   option arch code 93 = unsigned integer 16;
> 
>   if option arch = 00:00 {
> filename "bios/pxelinux.0";
>   } elsif  option arch = 00:07 {
>  filename "efi.x64/syslinux.efi";
>   }
> 
> which seems a bit complicated, and also a lot of work to implement.  Instead
> I propose adding 'efi-filename' (and 'efi32-filename', though I'm not sure
> that's worth doing) next to the existing 'filename' statement and having
> dhcpd itself interpret the option values to figure out which one should be
> used.

'filename' or 'option bootfile-name' are used by autoinstall. I'm not
sure diverting dhcpd option for boot image from rest of world is good
way. IIUC OpenBSD dhcpd doesn't support conditionals...

j.




Re: network booting efi systems with dhcpd(8)

2017-01-23 Thread Jonathan Gray
On Mon, Jan 23, 2017 at 08:39:17PM +1000, Jonathan Matthew wrote:
> For network boot clients, dhcpd(8) can supply a filename for the initial
> boot file for the client, which is something like pxeboot (or pxelinux.0).
> EFI and BIOS clients need different boot files, though, so the server
> needs to know what mode the client is booting in, in order to supply the
> right filename.  RFC 4578 defines DHCP client option 93 for this purpose.
> 
> The ISC dhcpd approach to using this looks something like this:
> 
>   option arch code 93 = unsigned integer 16;
> 
>   if option arch = 00:00 {
> filename "bios/pxelinux.0";
>   } elsif  option arch = 00:07 {
>  filename "efi.x64/syslinux.efi";
>   }
> 
> which seems a bit complicated, and also a lot of work to implement.  Instead
> I propose adding 'efi-filename' (and 'efi32-filename', though I'm not sure
> that's worth doing) next to the existing 'filename' statement and having
> dhcpd itself interpret the option values to figure out which one should be
> used.
> 
> I don't actually need this yet because we don't have many EFI-only client
> machines, so we can just run everything in legacy mode, but some day this
> may change.
> 
> Does this seem reasonable?

How will values 10 (32 bit arm) and 11 (64 bit arm) be handled
in future if this went in?  Wouldn't it be better to map the arch numbers
to strings?

The uefi spec refers to
http://www.iana.org/assignments/dhcpv6-parameters/dhcpv6-parameters.xhtml#processor-architecture
Which also lists different numbers for PXE vs boot from HTTP.

> 
> Index: conflex.c
> ===
> RCS file: /cvs/src/usr.sbin/dhcpd/conflex.c,v
> retrieving revision 1.16
> diff -u -p -u -p -r1.16 conflex.c
> --- conflex.c 6 Feb 2016 23:50:10 -   1.16
> +++ conflex.c 23 Jan 2017 08:09:15 -
> @@ -310,6 +310,8 @@ static const struct keywords {
>   { "dynamic-bootp",  TOK_DYNAMIC_BOOTP },
>   { "dynamic-bootp-lease-cutoff", TOK_DYNAMIC_BOOTP_LEASE_CUTOFF 
> },
>   { "dynamic-bootp-lease-length", TOK_DYNAMIC_BOOTP_LEASE_LENGTH 
> },
> + { "efi-filename",   TOK_EFI64_FILENAME },
> + { "efi32-filename", TOK_EFI32_FILENAME },
>   { "ends",   TOK_ENDS },
>   { "ethernet",   TOK_ETHERNET },
>   { "filename",   TOK_FILENAME },
> Index: confpars.c
> ===
> RCS file: /cvs/src/usr.sbin/dhcpd/confpars.c,v
> retrieving revision 1.28
> diff -u -p -u -p -r1.28 confpars.c
> --- confpars.c17 Aug 2016 00:55:33 -  1.28
> +++ confpars.c23 Jan 2017 08:09:15 -
> @@ -380,6 +380,14 @@ parse_statement(FILE *cfile, struct grou
>   group->filename = parse_string(cfile);
>   break;
>  
> + case TOK_EFI32_FILENAME:
> + group->efi32_filename = parse_string(cfile);
> + break;
> +
> + case TOK_EFI64_FILENAME:
> + group->efi64_filename = parse_string(cfile);
> + break;
> +
>   case TOK_SERVER_NAME:
>   group->server_name = parse_string(cfile);
>   break;
> Index: dhcp.c
> ===
> RCS file: /cvs/src/usr.sbin/dhcpd/dhcp.c,v
> retrieving revision 1.52
> diff -u -p -u -p -r1.52 dhcp.c
> --- dhcp.c24 Oct 2016 21:05:55 -  1.52
> +++ dhcp.c23 Jan 2017 08:09:15 -
> @@ -699,6 +699,35 @@ nak_lease(struct packet *packet, struct 
>   outgoing.packet_length, from, , NULL);
>  }
>  
> +static const char *
> +group_filename(struct group *group, struct packet *packet)
> +{
> + u_int16_t arch, *opt;
> + int i;
> +
> + opt = (u_int16_t *)packet->options[DHO_DHCP_CLIENT_ARCHITECTURE].data;
> + for (i = 0; i < packet->options[DHO_DHCP_CLIENT_ARCHITECTURE].len;
> + i += 2) {
> + arch = betoh16(*opt++);
> + switch (arch) {
> + case DCA_EFI_IA32:
> + if (group->efi32_filename != NULL)
> + return group->efi32_filename;
> + break;
> + 
> + case DCA_EFI_BC:
> + case DCA_EFI_X86_64:
> + if (group->efi64_filename != NULL)
> + return group->efi64_filename;
> + break;
> + default:
> + break;
> + }
> + }
> +
> + return group->filename;
> +}
> +
>  void
>  ack_lease(struct packet *packet, struct lease *lease, unsigned int offer,
>  time_t when)
> @@ -707,6 +736,7 @@ ack_lease(struct packet *packet, struct 
>   struct lease_state *state;
>   time_t lease_time, offered_lease_time, max_lease_time, 
> default_lease_time;
>   struct class *vendor_class, *user_class;
> +  

Re: rtsock refactoring

2017-01-23 Thread Alexander Bluhm
On Sat, Jan 21, 2017 at 01:28:02AM +0100, Claudio Jeker wrote:
> On Fri, Jan 20, 2017 at 02:51:52AM +0100, Claudio Jeker wrote:
> > I sent this diff out some time ago and would really like to get this in.
> > This is one step on makeing rtsock.c less of a hornets nest.
> > This reduces the side effects in route_output and simplifies some other
> > bits as well. For example route_input is less variadic and simpler.
> > 
> 
> Here is just the route_input change. Which should be easier to OK.
> Changed a few things based on input from bluhm@

OK bluhm@

> 
> -- 
> :wq Claudio
> 
> Index: net/rtsock.c
> ===
> RCS file: /cvs/src/sys/net/rtsock.c,v
> retrieving revision 1.214
> diff -u -p -r1.214 rtsock.c
> --- net/rtsock.c  20 Jan 2017 08:10:54 -  1.214
> +++ net/rtsock.c  21 Jan 2017 00:10:16 -
> @@ -92,7 +92,6 @@
>  
>  struct sockaddr  route_dst = { 2, PF_ROUTE, };
>  struct sockaddr  route_src = { 2, PF_ROUTE, };
> -struct sockproto route_proto = { PF_ROUTE, };
>  
>  struct walkarg {
>   int w_op, w_arg, w_given, w_needed, w_tmemsize;
> @@ -100,7 +99,7 @@ struct walkarg {
>  };
>  
>  int  route_ctloutput(int, struct socket *, int, int, struct mbuf **);
> -void route_input(struct mbuf *m0, ...);
> +void route_input(struct mbuf *m0, sa_family_t);
>  int  route_arp_conflict(struct rtentry *, struct rt_addrinfo *);
>  int  route_cleargateway(struct rtentry *, void *, unsigned int);
>  
> @@ -332,7 +331,7 @@ rt_senddesync(void *data)
>  }
>  
>  void
> -route_input(struct mbuf *m0, ...)
> +route_input(struct mbuf *m0, sa_family_t sa_family)
>  {
>   struct rawcb *rp;
>   struct routecb *rop;
> @@ -340,15 +339,10 @@ route_input(struct mbuf *m0, ...)
>   struct mbuf *m = m0;
>   int s, sockets = 0;
>   struct socket *last = NULL;
> - va_list ap;
> - struct sockproto *proto;
>   struct sockaddr *sosrc, *sodst;
>   
> - va_start(ap, m0);
> - proto = va_arg(ap, struct sockproto *);
> - sosrc = va_arg(ap, struct sockaddr *);
> - sodst = va_arg(ap, struct sockaddr *);
> - va_end(ap);
> + sosrc = _src;
> + sodst = _dst;
>  
>   /* ensure that we can access the rtm_type via mtod() */
>   if (m->m_len < offsetof(struct rt_msghdr, rtm_type) + 1) {
> @@ -359,10 +353,16 @@ route_input(struct mbuf *m0, ...)
>   LIST_FOREACH(rp, , rcb_list) {
>   if (rp->rcb_socket->so_state & SS_CANTRCVMORE)
>   continue;
> - if (rp->rcb_proto.sp_family != proto->sp_family)
> + if (rp->rcb_proto.sp_family != PF_ROUTE)
>   continue;
> - if (rp->rcb_proto.sp_protocol && proto->sp_protocol &&
> - rp->rcb_proto.sp_protocol != proto->sp_protocol)
> + /*
> +  * If route socket is bound to an address family only send
> +  * messages that match the address family. Address family
> +  * agnostic messages are always send.
> +  */
> + if (rp->rcb_proto.sp_protocol != AF_UNSPEC &&
> + sa_family != AF_UNSPEC &&
> + rp->rcb_proto.sp_protocol != sa_family)
>   continue;
>   /*
>* We assume the lower level routines have
> @@ -953,8 +953,6 @@ flush:
>   rtm->rtm_flags |= RTF_DONE;
>   }
>   }
> - if (info.rti_info[RTAX_DST])
> - route_proto.sp_protocol = info.rti_info[RTAX_DST]->sa_family;
>   if (rt)
>   rtfree(rt);
>  
> @@ -970,9 +968,8 @@ fail:
>   }
>   /* There is another listener, so construct message */
>   rp = sotorawcb(so);
> - }
> - if (rp)
>   rp->rcb_proto.sp_family = 0; /* Avoid us */
> + }
>   if (rtm) {
>   if (m_copyback(m, 0, rtm->rtm_msglen, rtm, M_NOWAIT)) {
>   m_freem(m);
> @@ -982,9 +979,10 @@ fail:
>   free(rtm, M_RTABLE, 0);
>   }
>   if (m)
> - route_input(m, _proto, _src, _dst);
> + route_input(m, info.rti_info[RTAX_DST] ?
> + info.rti_info[RTAX_DST]->sa_family : AF_UNSPEC);
>   if (rp)
> - rp->rcb_proto.sp_family = PF_ROUTE;
> + rp->rcb_proto.sp_family = PF_ROUTE; /* Readd us */
>  
>   return (error);
>  }
> @@ -1056,7 +1054,6 @@ rt_setmetrics(u_long which, const struct
>  
>   out->rmx_expire = expire;
>   }
> - /* RTV_PRIORITY handled before */
>  }
>  
>  void
> @@ -1258,11 +1255,7 @@ rt_missmsg(int type, struct rt_addrinfo 
>   rtm->rtm_tableid = tableid;
>   rtm->rtm_addrs = rtinfo->rti_addrs;
>   rtm->rtm_index = ifidx;
> - if (sa == NULL)
> - route_proto.sp_protocol = 0;
> - else
> - route_proto.sp_protocol = sa->sa_family;
> - route_input(m, _proto, _src, 

Re: rtwn: fix iq calibration

2017-01-23 Thread Stefan Sperling
On Sun, Jan 22, 2017 at 11:07:00PM +0100, Jeremie Courreges-Anglas wrote:
> Stefan Sperling  writes:
> 
> > FreeBSD committed a couple of interesting things in r307529
> > https://svnweb.freebsd.org/base?view=revision=r307529
> > Unfortunately the commit is a giant patch bomb which makes
> > it hard to pick out individual fixes :-(
> >
> > This patch extracts what I believe "fix IQ calibration bug" refers to.
> > The current code is only ever setting bits but not clearing bits which
> > should be cleared.
> >
> > My diff looks very different because FreeBSD made a lot of style
> > changes in their driver (e.g. they have an rtwn_bb_setbits() function
> > which we don't have). We could think about porting these over but
> > for now I'd like to focus on fixing bugs instead of style.
> 
> Appears to work fine with
> 
> urtwn0 at uhub0 port 1 configuration 1 interface 0 "vendor 0x0bda ALFA 
> AWUS036NHR" rev 2.00/2.00 addr 2
> urtwn0: MAC/BB RTL8188RU, RF 6052 1T1R, address 00:c0:ca:xx:xx:xx

Thanks for testing.

I should have mentioned that this diff only affects PCI devices since
IQ calibration is disabled for USB devices. Running IQ calib has always
been causing USB devices to stop receiving packets for some reason.



Re: network booting efi systems with dhcpd(8)

2017-01-23 Thread Jonathan Matthew
On Mon, Jan 23, 2017 at 09:57:28PM +1100, Jonathan Gray wrote:
> On Mon, Jan 23, 2017 at 08:39:17PM +1000, Jonathan Matthew wrote:
> > For network boot clients, dhcpd(8) can supply a filename for the initial
> > boot file for the client, which is something like pxeboot (or pxelinux.0).
> > EFI and BIOS clients need different boot files, though, so the server
> > needs to know what mode the client is booting in, in order to supply the
> > right filename.  RFC 4578 defines DHCP client option 93 for this purpose.
> > 
> > The ISC dhcpd approach to using this looks something like this:
> > 
> >   option arch code 93 = unsigned integer 16;
> > 
> >   if option arch = 00:00 {
> > filename "bios/pxelinux.0";
> >   } elsif  option arch = 00:07 {
> >  filename "efi.x64/syslinux.efi";
> >   }
> > 
> > which seems a bit complicated, and also a lot of work to implement.  Instead
> > I propose adding 'efi-filename' (and 'efi32-filename', though I'm not sure
> > that's worth doing) next to the existing 'filename' statement and having
> > dhcpd itself interpret the option values to figure out which one should be
> > used.
> > 
> > I don't actually need this yet because we don't have many EFI-only client
> > machines, so we can just run everything in legacy mode, but some day this
> > may change.
> > 
> > Does this seem reasonable?
> 
> How will values 10 (32 bit arm) and 11 (64 bit arm) be handled
> in future if this went in?  Wouldn't it be better to map the arch numbers
> to strings?
> 
> The uefi spec refers to
> http://www.iana.org/assignments/dhcpv6-parameters/dhcpv6-parameters.xhtml#processor-architecture
> Which also lists different numbers for PXE vs boot from HTTP.

If we have more than a couple of architectures to deal with, we probably
need something smarter.  Adding arm-filename and arm64-filename in the
same way seems ugly.  I don't think our dhcpd has any existing mechanisms
that would support this kind of thing, so some actual thought may be
required.



Re: rtsock refactoring

2017-01-23 Thread Alexander Bluhm
On Mon, Jan 23, 2017 at 01:18:05AM +0100, Claudio Jeker wrote:
> Last bit for now. This is changing the reporting madness. It moves it in
> its own function which is called after the big switch statement.
> If you hit a bad error in the switch the code should eiter goto fail or
> flush. 
> The new function rt_report is actually constructing the rt message out of
> a rtentry. It does not magically update the message that was passed in
> from userland. I think this is much safer and works better.

OK bluhm@

> +rt_report(struct rtentry *rt, u_char type, int seq, int tableid)
...
> + ifp = if_get(rt->rt_ifidx);
> + if (ifp != NULL) {
> + info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
> + info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
> + if (ifp->if_flags & IFF_POINTOPOINT)
> + info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
> + }
> + if_put(ifp);

The if_put() could be in the if (ifp != NULL) block.

>  flush:
> + if (rt)
> + rtfree(rt);

rtfree() checks (rt != NULL) itself.



Re: split pledge "ioctl" into "bpf" and "tape"

2017-01-23 Thread Ted Unangst
Theo de Raadt wrote:
> So let's just split these cases out.  "ioctl"'s number gets reused for
> tape, and a new "bpf" promise is added..  That paves the way for a
> more complex diff coming in a few hours.

The mention of bpf made me worried that dhclient would be affected, but I
checked and it's not. So this shouldn't cause trouble crossing over.



Re: split pledge "ioctl" into "bpf" and "tape"

2017-01-23 Thread Theo de Raadt
>Theo de Raadt wrote:
>> So let's just split these cases out.  "ioctl"'s number gets reused for
>> tape, and a new "bpf" promise is added..  That paves the way for a
>> more complex diff coming in a few hours.
>
>The mention of bpf made me worried that dhclient would be affected, but I
>checked and it's not. So this shouldn't cause trouble crossing over.

the "bpf" pledge allows only one tiny minor operation.   that is not new.

the programs you are thinking of perform all their bpf setup before
pledge (or don't pledge at all).  after that they simply read/write.



fd-passing: convert internalized format to a struct

2017-01-23 Thread Theo de Raadt
File descriptor passing internalizes process fd's to an array of
struct file *.  This results in tricky pointer management.

How about passing it as an array of structs instead.  Only one field
inside for now, struct file *.  However soon I'll need to pass additional
fields along with the file, and I didn't want to use the low bits of the
pointer :)

Index: kern/uipc_usrreq.c
===
RCS file: /cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.109
diff -u -p -u -r1.109 uipc_usrreq.c
--- kern/uipc_usrreq.c  29 Dec 2016 12:12:43 -  1.109
+++ kern/uipc_usrreq.c  23 Jan 2017 23:02:15 -
@@ -63,8 +63,8 @@ LIST_HEAD(unp_head, unpcb) unp_head = LI
 struct unp_deferral {
SLIST_ENTRY(unp_deferral)   ud_link;
int ud_n;
-   /* followed by ud_n struct file * pointers */
-   struct file *ud_fp[];
+   /* followed by ud_n struct fdpass */
+   struct fdpass ud_fp[];
 };
 
 /* list of sets of files that were sent over sockets that are now closed */
@@ -664,12 +664,12 @@ unp_externalize(struct mbuf *rights, soc
struct proc *p = curproc;   /* XXX */
struct cmsghdr *cm = mtod(rights, struct cmsghdr *);
int i, *fdp = NULL;
-   struct file **rp;
+   struct fdpass *rp;
struct file *fp;
int nfds, error = 0;
 
nfds = (cm->cmsg_len - CMSG_ALIGN(sizeof(*cm))) /
-   sizeof(struct file *);
+   sizeof(struct fdpass);
if (controllen < CMSG_ALIGN(sizeof(struct cmsghdr)))
controllen = 0;
else
@@ -680,9 +680,10 @@ unp_externalize(struct mbuf *rights, soc
}
 
/* Make sure the recipient should be able to see the descriptors.. */
-   rp = (struct file **)CMSG_DATA(cm);
+   rp = (struct fdpass *)CMSG_DATA(cm);
for (i = 0; i < nfds; i++) {
-   fp = *rp++;
+   fp = rp->fp;
+   rp++;
error = pledge_recvfd(p, fp);
if (error)
break;
@@ -709,7 +710,7 @@ restart:
fdplock(p->p_fd);
if (error != 0) {
if (nfds > 0) {
-   rp = ((struct file **)CMSG_DATA(cm));
+   rp = ((struct fdpass *)CMSG_DATA(cm));
unp_discard(rp, nfds);
}
goto out;
@@ -719,7 +720,7 @@ restart:
 * First loop -- allocate file descriptor table slots for the
 * new descriptors.
 */
-   rp = ((struct file **)CMSG_DATA(cm));
+   rp = ((struct fdpass *)CMSG_DATA(cm));
for (i = 0; i < nfds; i++) {
if ((error = fdalloc(p, 0, [i])) != 0) {
/*
@@ -748,7 +749,8 @@ restart:
 * fdalloc() works properly.. We finalize it all
 * in the loop below.
 */
-   p->p_fd->fd_ofiles[fdp[i]] = *rp++;
+   p->p_fd->fd_ofiles[fdp[i]] = rp->fp;
+   rp++;
 
if (flags & MSG_CMSG_CLOEXEC)
p->p_fd->fd_ofileflags[fdp[i]] |= UF_EXCLOSE;
@@ -758,11 +760,12 @@ restart:
 * Now that adding them has succeeded, update all of the
 * descriptor passing state.
 */
-   rp = (struct file **)CMSG_DATA(cm);
+   rp = (struct fdpass *)CMSG_DATA(cm);
for (i = 0; i < nfds; i++) {
struct unpcb *unp;
 
-   fp = *rp++;
+   fp = rp->fp;
+   rp++;
if ((unp = fptounp(fp)) != NULL)
unp->unp_msgcount--;
unp_rights--;
@@ -787,7 +790,8 @@ unp_internalize(struct mbuf *control, st
 {
struct filedesc *fdp = p->p_fd;
struct cmsghdr *cm = mtod(control, struct cmsghdr *);
-   struct file **rp, *fp;
+   struct fdpass *rp;
+   struct file *fp;
struct unpcb *unp;
int i, error;
int nfds, *ip, fd, neededspace;
@@ -807,7 +811,7 @@ unp_internalize(struct mbuf *control, st
 
/* Make sure we have room for the struct file pointers */
 morespace:
-   neededspace = CMSG_SPACE(nfds * sizeof(struct file *)) -
+   neededspace = CMSG_SPACE(nfds * sizeof(struct fdpass)) -
control->m_len;
if (neededspace > M_TRAILINGSPACE(control)) {
char *tmp;
@@ -834,11 +838,11 @@ morespace:
}
 
/* adjust message & mbuf to note amount of space actually used. */
-   cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct file *));
-   control->m_len = CMSG_SPACE(nfds * sizeof(struct file *));
+   cm->cmsg_len = CMSG_LEN(nfds * sizeof(struct fdpass));
+   control->m_len = CMSG_SPACE(nfds * sizeof(struct fdpass));
 
ip = ((int *)CMSG_DATA(cm)) + nfds - 1;
-   rp = ((struct file **)CMSG_DATA(cm)) + nfds - 1;
+   rp = ((struct fdpass *)CMSG_DATA(cm)) + nfds - 1;
for (i = 0; i < nfds; i++) {
memcpy(, ip, 

Re: pfind(9) -> tfind(9)

2017-01-23 Thread Philip Guenther
On Tue, 24 Jan 2017, Martin Pieuchot wrote:
> We live in a thread world!  Let's face it.
>
> ok?

Heh, I lost track that I managed to get it down to such a small set.  ok 
on the rename, just some small manpage wankery:


> --- /dev/null 1 Jan 1970 00:00:00 -
> +++ share/man/man9/tfind.924 Jan 2017 00:07:27 -
...
> +.Sh SYNOPSIS
> +.In sys/cdefs.h
> +.In sys/proc.h

This should be
.In sys/types.h
.In sys/signal.h
.In sys/proc.h
(never )


> +.Ft "struct proc *"
> +.Fn tfind "pid_t tid"

Let's add prfind() too:
.Ft "struct process *"
.Fn prfind "pid_t pid"


> +.Ft "struct pgrp *"
> +.Fn pgfind "pid_t pgid"
> +.Sh DESCRIPTION
> +The
> +.Fn tfind

.Fn tfind ,
.Fn prfind ,


> +.Fn pgfind
> +functions retrieve thread and progress group structures from thread and

functions retrieve thread, process, and process group structures from
thread, process, and process group IDs, respectively.

> +.Pp
> +Both functions return

These functions return



Re: NET_LOCK() ordering issue

2017-01-23 Thread Alexander Bluhm
On Tue, Jan 24, 2017 at 01:48:49AM +0100, Alexander Bluhm wrote:
> On Tue, Jan 24, 2017 at 10:01:02AM +1000, Martin Pieuchot wrote:
> > Updated diff, thanks for your review.
> 
> > @@ -360,24 +358,20 @@ redo:
> > error = soaccept(so, nam);
> > if (!error && name != NULL)
> > error = copyaddrout(p, nam, name, namelen, anamelen);
> > -
> > +   if (!error) {
> > +   (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
> > +   FILE_SET_MATURE(fp, p);
> > +   *retval = tmpfd;
> > +   }
> > +out:
> 
> Perhaps a goto out would be nicer than two if (!error).

No, there is a error = copyaddrout() assignment.  Please disregard
my comment.

>   error = soaccept(so, nam);
>   if (error)
>   goto out;
>   if (name != NULL)
>   error = copyaddrout(p, nam, name, namelen, anamelen);
>   (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
>   FILE_SET_MATURE(fp, p);
>   *retval = tmpfd;
> out:
> 
> anyway OK bluhm@

Still OK bluhm@



Re: rtsock refactoring

2017-01-23 Thread Martin Pieuchot
On 23/01/17(Mon) 01:18, Claudio Jeker wrote:
> [...] 
> Last bit for now. This is changing the reporting madness. It moves it in
> its own function which is called after the big switch statement.
> If you hit a bad error in the switch the code should eiter goto fail or
> flush. 
> The new function rt_report is actually constructing the rt message out of
> a rtentry. It does not magically update the message that was passed in
> from userland. I think this is much safer and works better.

[...]

> + /*
> +  * From here on these vars need to be valid
> +  * rt, rtm, error, so, m, tableid, sa_family

Why not use KASSERT() rather than a comment?

ok mpi@  either way .



Re: fd-passing: convert internalized format to a struct

2017-01-23 Thread Martin Pieuchot
On 23/01/17(Mon) 16:05, Theo de Raadt wrote:
> File descriptor passing internalizes process fd's to an array of
> struct file *.  This results in tricky pointer management.
> 
> How about passing it as an array of structs instead.  Only one field
> inside for now, struct file *.  However soon I'll need to pass additional
> fields along with the file, and I didn't want to use the low bits of the
> pointer :)

Less magic is good.

If you're only going to use 'struct fdpass' inside kern/uipc_usrreq.c
I'd suggest to declare it there, to avoid namespace pollution.

One nit below, with that ok mpi@

> @@ -911,7 +915,7 @@ unp_gc(void *arg __unused)
>   unp_rights--;
>   (void) closef(fp, NULL);
>   }
> - free(defer, M_TEMP, sizeof(*defer) + sizeof(fp) * defer->ud_n);
> + free(defer, M_TEMP, sizeof(*defer) + sizeof(struct fdpass) * 
> defer->ud_n);
^
Line too long.



Re: rtsock refactoring

2017-01-23 Thread Claudio Jeker
On Mon, Jan 23, 2017 at 04:56:02PM +0100, Alexander Bluhm wrote:
> On Mon, Jan 23, 2017 at 01:18:05AM +0100, Claudio Jeker wrote:
> > Last bit for now. This is changing the reporting madness. It moves it in
> > its own function which is called after the big switch statement.
> > If you hit a bad error in the switch the code should eiter goto fail or
> > flush. 
> > The new function rt_report is actually constructing the rt message out of
> > a rtentry. It does not magically update the message that was passed in
> > from userland. I think this is much safer and works better.
> 
> OK bluhm@
> 
> > +rt_report(struct rtentry *rt, u_char type, int seq, int tableid)
> ...
> > +   ifp = if_get(rt->rt_ifidx);
> > +   if (ifp != NULL) {
> > +   info.rti_info[RTAX_IFP] = sdltosa(ifp->if_sadl);
> > +   info.rti_info[RTAX_IFA] = rt->rt_ifa->ifa_addr;
> > +   if (ifp->if_flags & IFF_POINTOPOINT)
> > +   info.rti_info[RTAX_BRD] = rt->rt_ifa->ifa_dstaddr;
> > +   }
> > +   if_put(ifp);
> 
> The if_put() could be in the if (ifp != NULL) block.
> 

While true, I like the symetry of if_get and if_put. Makes sure the if_put
is not lost in some other later code changes.

> >  flush:
> > +   if (rt)
> > +   rtfree(rt);
> 
> rtfree() checks (rt != NULL) itself.
> 

Thanks, will change that.

-- 
:wq Claudio



Re: Help for ddb(4)'s "tr /p"

2017-01-23 Thread Philip Guenther
On Tue, 24 Jan 2017, Martin Pieuchot wrote:
> Now that pfind(9) takes tid we need a way to show TID in ddb(4) ps
> output.  Otherwise it's hard to use "ps /p"

You mean "tr /p", right?


> Here's the difference:
> 
> Before:
>PID   PPID   PGRPUID  S   FLAGS  WAIT  COMMAND
> *50672  7  50672  0  7 0x3sysctl
>  7  1  7  0  30x10008b  pause ksh
>  90555  1  90555  0  30x100098  poll  cron
> 
> After:
>PID TID   PPID   PGRPUID  S   FLAGS  WAIT  COMMAND
> *52636  496486  51956  52636  0  7 0x3sysctl
>  51956  407679  1  51956  0  30x10008b  pause ksh
>  42726  321266  1  42726  0  30x100098  poll  cron
> 
> ok?

Hmm, ps/n (the default) is the only of the ps subcommands to not show the 
TID...so that it can show more columns of COMMAND.  You've added 8 columns 
without shrinking any of the existing ones, which will cause an unreadable 
mess when it wraps around.  Maybe drop PGRP from /n and we add a new one 
that's more like (userspace) ps -j ?


The outer two chunks here are clearly correct, regardless of how we
bikeshed ps:

> --- share/man/man4/ddb.4  1 Sep 2016 12:24:56 -   1.82
> +++ share/man/man4/ddb.4  23 Jan 2017 23:38:09 -
> @@ -542,7 +542,7 @@ The
>  .Cm /p
>  modifier interprets the
>  .Ar frameaddr
> -argument as the PID of a process and shows the stack trace of
> +argument as the TID of a process and shows the stack trace of
>  that process.
>  The
>  .Cm /p
...
> @@ -928,7 +928,7 @@ A synonym for the
>  .Ic show all callout
>  command.
>  .\" 
> -.It Ic ps Op Cm /anw
> +.It Ic ps Op Cm /anow
>  A synonym for
>  .Ic show all procs .
>  .\" 
> 
> 



refactor PF option parsing loops

2017-01-23 Thread Richard Procter
Hi, 

PF implements six distinct TCP option parsing loops. This patch converts 
these to one inline function in pfvar_priv.h, normalises their semantics, 
and strips ~100 lines. 

I've laid out the existing semantics below. The new loop implements the 
timestamp parser's semantics of "(s-b) (v-3) (t-a) (t-b)". As a result,

- MSS and WSCALE parsers are stricter about candidate option len.
- MSS normalization is more tolerant of malformed option lists.

These changes were immaterial to the live traffic I've examined (thanks to 
Richard Cziva for help in gathering the data).

I'd like test reports before committing. 

comments, oks? 

best, 
Richard. 

* postcondition on identity (return correct option) "validity"

- 'r[]' contains the returned option.

r[0] = TYPE /\

MIN_TYPELEN >= 2 /\ 

MAX_TYPELEN  = r[1]  = MIN_TYPELEN  (v-1)  (fixed-len opt)  or
MAX_TYPELEN >= r[1] >= MIN_TYPELEN  (v-2)  (weaker) or 
   r[1] >= MIN_TYPELEN  (v-3)  (weaker) or 
   r[1] >= 2(v-4)  (weaker) or 
   true (v-5)  (weakest)


* postcondition on (eoh - opt) (ensure buffer safety) "safety"

- 'eoh' points to one past the end of option header
- 'opt' points to returned option

(eoh - opt) >= r[1] (s-a) 
(eoh - opt) >= MIN_TYPELEN  (s-b)


* parsing (tolerance of malformed headers) "tolerance"

r[0] = EOL=>   continue with r[1] := 1   (t-b)
r[1] < 2  =>   continue with r[1] := 2   (t-a)   
r[1] < 2  =>   break (t-d) 
(eoh - opt) < r[1]=>   break (t-c) 


* Analysis of existing parsers
  safety validity 
tolerance
[TS:fixed:10]  pf_norm.c:pf_normalize_tcp_init():  (s-b) (v-3) 
(t-a) (t-b)
[TS:fixed:10]  pf_norm.c:pf_normalize_tcp_stateful():  (s-b) (v-3) 
(t-a) (t-b)

[SACK:var:2 + (8*n)]   pf.c:pf_modulate_sack():  (s-a) (s-b) (v-3) 
(t-a) (t-b) 

[MSS:fixed:4]  pf_norm.c:pf_normalize_mss(): (s-a) (s-b) (v-4)  
   (t-c) (t-d)
   
[MSS:fixed:4]  pf.c:pf_get_mss():  (s-b) (v-5) 
(t-a) (t-b)   
[WSCL:fixed:3] pf.c:pf_get_wscale():   (s-b) (v-5) 
(t-a) (t-b) 


--- 

Index: net/pf_norm.c
===
--- net.orig/pf_norm.c
+++ net/pf_norm.c
@@ -901,8 +901,9 @@ pf_normalize_tcp_init(struct pf_pdesc *p
 {
struct tcphdr   *th = >hdr.tcp;
u_int32_ttsval, tsecr;
-   u_int8_t hdr[60];
-   u_int8_t*opt;
+   int  olen;
+   u_int8_t opts[MAX_TCPOPTLEN], *opt = opts;
+
 
KASSERT(src->scrub == NULL);
 
@@ -935,44 +936,28 @@ pf_normalize_tcp_init(struct pf_pdesc *p
if ((th->th_flags & TH_SYN) == 0)
return (0);
 
-   if (th->th_off > (sizeof(struct tcphdr) >> 2) && src->scrub &&
-   pf_pull_hdr(pd->m, pd->off, hdr, th->th_off << 2, NULL, NULL,
-   pd->af)) {
-   /* Diddle with TCP options */
-   int hlen;
+   if (src->scrub == NULL)
+   return (0);
 
-   opt = hdr + sizeof(struct tcphdr);
-   hlen = (th->th_off << 2) - sizeof(struct tcphdr);
-   while (hlen >= TCPOLEN_TIMESTAMP) {
-   switch (*opt) {
-   case TCPOPT_EOL:/* FALLTHROUGH */
-   case TCPOPT_NOP:
-   opt++;
-   hlen--;
-   break;
-   case TCPOPT_TIMESTAMP:
-   if (opt[1] >= TCPOLEN_TIMESTAMP) {
-   src->scrub->pfss_flags |=
-   PFSS_TIMESTAMP;
-   src->scrub->pfss_ts_mod = arc4random();
-
-   /* note PFSS_PAWS not set yet */
-   memcpy(, [2],
-   sizeof(u_int32_t));
-   memcpy(, [6],
-   sizeof(u_int32_t));
-   src->scrub->pfss_tsval0 = ntohl(tsval);
-   src->scrub->pfss_tsval = ntohl(tsval);
-   src->scrub->pfss_tsecr = ntohl(tsecr);
-   getmicrouptime(>scrub->pfss_last);
-   }
-   /* FALLTHROUGH */
-   default:
-   hlen -= MAX(opt[1], 2);
-   opt += MAX(opt[1], 2);
-   

Re: refactor PF option parsing loops

2017-01-23 Thread Richard Procter

PS Find this patch broken down for easier review here:  

http://203.79.107.124/opts/

On Tue, 24 Jan 2017, Richard Procter wrote:

> Hi,
>
> PF implements six distinct TCP option parsing loops. This patch converts
> these to one inline function in pfvar_priv.h, normalises their semantics,
> and strips ~100 lines.




Re: NET_LOCK() ordering issue

2017-01-23 Thread Alexander Bluhm
On Sat, Jan 21, 2017 at 10:46:42AM +1000, Martin Pieuchot wrote:
> Here's another way to fix the problem, call falloc() before grabbing the
> NET_LOCK().
> 
> Comments?

There a two bugs in the "goto redo" block that is not part of the
diff.  You could do a m_freem(nam), then goto redo, get an error
and goto out.  This will result in a double m_freem(nam).

Also as you have moved the falloc() before the redo label, you must
not fdremove() and closef() before goto redo.

You you use only if (error) and if (!error) and not mix it with
(error != 0) and (error == 0)?  That makes checking the error paths
easier.

bluhm

> 
> Index: kern/uipc_syscalls.c
> ===
> RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
> retrieving revision 1.144
> diff -u -p -r1.144 uipc_syscalls.c
> --- kern/uipc_syscalls.c  29 Dec 2016 12:12:43 -  1.144
> +++ kern/uipc_syscalls.c  21 Jan 2017 00:38:18 -
> @@ -265,7 +265,7 @@ doaccept(struct proc *p, int sock, struc
>  {
>   struct filedesc *fdp = p->p_fd;
>   struct file *fp, *headfp;
> - struct mbuf *nam;
> + struct mbuf *nam = NULL;
>   socklen_t namelen;
>   int error, s, tmpfd;
>   struct socket *head, *so;
> @@ -277,19 +277,35 @@ doaccept(struct proc *p, int sock, struc
>   return (error);
>  
>   headfp = fp;
> + head = headfp->f_data;
> +
> + fdplock(fdp);
> + error = falloc(p, , );
> + if (error == 0 && (flags & SOCK_CLOEXEC))
> + fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
> + fdpunlock(fdp);
> + if (error != 0) {
> + /*
> +  * Probably ran out of file descriptors.  Wakeup
> +  * so some other process might have a chance at it.
> +  */
> + wakeup_one(>so_timeo);
> + FRELE(headfp, p);
> + return (error);
> + }
> +
>  redo:
>   NET_LOCK(s);
> - head = headfp->f_data;
>   if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
>   error = EINVAL;
> - goto bad;
> + goto out;
>   }
>   if ((head->so_state & SS_NBIO) && head->so_qlen == 0) {
>   if (head->so_state & SS_CANTRCVMORE)
>   error = ECONNABORTED;
>   else
>   error = EWOULDBLOCK;
> - goto bad;
> + goto out;
>   }
>   while (head->so_qlen == 0 && head->so_error == 0) {
>   if (head->so_state & SS_CANTRCVMORE) {
> @@ -298,34 +314,19 @@ redo:
>   }
>   error = tsleep(>so_timeo, PSOCK | PCATCH,
>   "netcon", 0);
> - if (error) {
> - goto bad;
> - }
> + if (error)
> + goto out;
>   }
>   if (head->so_error) {
>   error = head->so_error;
>   head->so_error = 0;
> - goto bad;
> + goto out;
>   }
>  
>   /* Figure out whether the new socket should be non-blocking. */
>   nflag = flags & SOCK_NONBLOCK_INHERIT ? (headfp->f_flag & FNONBLOCK)
>   : (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
>  
> - fdplock(fdp);
> - error = falloc(p, , );
> - if (error == 0 && (flags & SOCK_CLOEXEC))
> - fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
> - fdpunlock(fdp);
> - if (error != 0) {
> - /*
> -  * Probably ran out of file descriptors.  Wakeup
> -  * so some other process might have a chance at it.
> -  */
> - wakeup_one(>so_timeo);
> - goto bad;
> - }
> -
>   nam = m_get(M_WAIT, MT_SONAME);
>  
>   /*
> @@ -360,24 +361,20 @@ redo:
>   error = soaccept(so, nam);
>   if (!error && name != NULL)
>   error = copyaddrout(p, nam, name, namelen, anamelen);
> -
> - if (error) {
> - /* if an error occurred, free the file descriptor */
> - NET_UNLOCK(s);
> - m_freem(nam);
> + if (!error) {
> + (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
> + FILE_SET_MATURE(fp, p);
> + *retval = tmpfd;
> + }
> +out:
> + NET_UNLOCK(s);
> + m_freem(nam);
> + if (error != 0) {
>   fdplock(fdp);
>   fdremove(fdp, tmpfd);
>   closef(fp, p);
>   fdpunlock(fdp);
> - goto out;
>   }
> - (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
> - FILE_SET_MATURE(fp, p);
> - *retval = tmpfd;
> - m_freem(nam);
> -bad:
> - NET_UNLOCK(s);
> -out:
>   FRELE(headfp, p);
>   return (error);
>  }



Re: NET_LOCK() ordering issue

2017-01-23 Thread Martin Pieuchot
On 24/01/17(Tue) 00:51, Alexander Bluhm wrote:
> On Sat, Jan 21, 2017 at 10:46:42AM +1000, Martin Pieuchot wrote:
> > Here's another way to fix the problem, call falloc() before grabbing the
> > NET_LOCK().
> > 
> > Comments?
> 
> There a two bugs in the "goto redo" block that is not part of the
> diff.  You could do a m_freem(nam), then goto redo, get an error
> and goto out.  This will result in a double m_freem(nam).

Ok, I include this fix.

> Also as you have moved the falloc() before the redo label, you must
> not fdremove() and closef() before goto redo.

Nice catch.

> You you use only if (error) and if (!error) and not mix it with
> (error != 0) and (error == 0)?  That makes checking the error paths
> easier.

Fine.

Updated diff, thanks for your review.

Index: kern/uipc_syscalls.c
===
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.144
diff -u -p -r1.144 uipc_syscalls.c
--- kern/uipc_syscalls.c29 Dec 2016 12:12:43 -  1.144
+++ kern/uipc_syscalls.c23 Jan 2017 23:59:34 -
@@ -265,7 +265,7 @@ doaccept(struct proc *p, int sock, struc
 {
struct filedesc *fdp = p->p_fd;
struct file *fp, *headfp;
-   struct mbuf *nam;
+   struct mbuf *nam = NULL;
socklen_t namelen;
int error, s, tmpfd;
struct socket *head, *so;
@@ -277,19 +277,35 @@ doaccept(struct proc *p, int sock, struc
return (error);
 
headfp = fp;
+   head = headfp->f_data;
+
+   fdplock(fdp);
+   error = falloc(p, , );
+   if (!error && (flags & SOCK_CLOEXEC))
+   fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
+   fdpunlock(fdp);
+   if (error) {
+   /*
+* Probably ran out of file descriptors.  Wakeup
+* so some other process might have a chance at it.
+*/
+   wakeup_one(>so_timeo);
+   FRELE(headfp, p);
+   return (error);
+   }
+
 redo:
NET_LOCK(s);
-   head = headfp->f_data;
if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
-   goto bad;
+   goto out;
}
if ((head->so_state & SS_NBIO) && head->so_qlen == 0) {
if (head->so_state & SS_CANTRCVMORE)
error = ECONNABORTED;
else
error = EWOULDBLOCK;
-   goto bad;
+   goto out;
}
while (head->so_qlen == 0 && head->so_error == 0) {
if (head->so_state & SS_CANTRCVMORE) {
@@ -298,34 +314,19 @@ redo:
}
error = tsleep(>so_timeo, PSOCK | PCATCH,
"netcon", 0);
-   if (error) {
-   goto bad;
-   }
+   if (error)
+   goto out;
}
if (head->so_error) {
error = head->so_error;
head->so_error = 0;
-   goto bad;
+   goto out;
}
 
/* Figure out whether the new socket should be non-blocking. */
nflag = flags & SOCK_NONBLOCK_INHERIT ? (headfp->f_flag & FNONBLOCK)
: (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
 
-   fdplock(fdp);
-   error = falloc(p, , );
-   if (error == 0 && (flags & SOCK_CLOEXEC))
-   fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
-   fdpunlock(fdp);
-   if (error != 0) {
-   /*
-* Probably ran out of file descriptors.  Wakeup
-* so some other process might have a chance at it.
-*/
-   wakeup_one(>so_timeo);
-   goto bad;
-   }
-
nam = m_get(M_WAIT, MT_SONAME);
 
/*
@@ -336,10 +337,7 @@ redo:
if (head->so_qlen == 0) {
NET_UNLOCK(s);
m_freem(nam);
-   fdplock(fdp);
-   fdremove(fdp, tmpfd);
-   closef(fp, p);
-   fdpunlock(fdp);
+   nam = NULL;
goto redo;
}
 
@@ -360,24 +358,20 @@ redo:
error = soaccept(so, nam);
if (!error && name != NULL)
error = copyaddrout(p, nam, name, namelen, anamelen);
-
+   if (!error) {
+   (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
+   FILE_SET_MATURE(fp, p);
+   *retval = tmpfd;
+   }
+out:
+   NET_UNLOCK(s);
+   m_freem(nam);
if (error) {
-   /* if an error occurred, free the file descriptor */
-   NET_UNLOCK(s);
-   m_freem(nam);
fdplock(fdp);
fdremove(fdp, tmpfd);
closef(fp, p);
fdpunlock(fdp);
-   goto out;
}
-   (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
-   FILE_SET_MATURE(fp, p);
-   *retval = tmpfd;
-  

Re: NET_LOCK() ordering issue

2017-01-23 Thread Alexander Bluhm
On Tue, Jan 24, 2017 at 10:01:02AM +1000, Martin Pieuchot wrote:
> Updated diff, thanks for your review.

> @@ -360,24 +358,20 @@ redo:
>   error = soaccept(so, nam);
>   if (!error && name != NULL)
>   error = copyaddrout(p, nam, name, namelen, anamelen);
> -
> + if (!error) {
> + (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
> + FILE_SET_MATURE(fp, p);
> + *retval = tmpfd;
> + }
> +out:

Perhaps a goto out would be nicer than two if (!error).

error = soaccept(so, nam);
if (error)
goto out;
if (name != NULL)
error = copyaddrout(p, nam, name, namelen, anamelen);
(*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
FILE_SET_MATURE(fp, p);
*retval = tmpfd;
out:

anyway OK bluhm@



Re: global mbuf memory limit

2017-01-23 Thread Alexander Bluhm
On Wed, Dec 14, 2016 at 03:52:32PM +1000, David Gwynne wrote:
> > Wouldn't it make sense to use atomic operations to keep track of the
> > amount of memory that was allocated?
> 
> the mtx may be slow, but it is a slow path id like to keep simple.

Is allocate and free of mbuf clusters in the slow path?  I would
expect that it has to be done for every packet.  An atomic operation
would not solve the slowdown in the fast path, perhaps a multi-cpu
counter?

> > Long run I suppose we want to drop nmbclust and let users tune the
> > total amount of memory available for clusters and set the initial
> > amount to a percentage of physical memory?
> 
> i do want to move to specifying memory in terms of bytes instead of 2k 
> clusters.

That might be better.  But leave it as it is for now.  Percentage
of physical memory is bad as we have architectures that cannot do
DMA everywhere.

> how much should be available by default is a complicated issue.

Yes.  Maybe we have to reconsier the default.  We had it per cluster
size before, now the limit is for all clusters.

> >> void
> >> mbcpuinit()
> >> {
> >> +  int i;
> >> +
> >>mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
> >> +
> >> +  pool_cache_init();
> >> +  pool_cache_init();
> >> +
> >> +  for (i = 0; i < nitems(mclsizes); i++)
> >> +  pool_cache_init([i]);
> >> }

Is the pool cache related to the global mbuf limit?

Apart from the problem that I don't know wether the mutex kills
performance, the diff looks good.

bluhm



Re: rtsock refactoring

2017-01-23 Thread Claudio Jeker
On Tue, Jan 24, 2017 at 08:54:23AM +1000, Martin Pieuchot wrote:
> On 23/01/17(Mon) 01:18, Claudio Jeker wrote:
> > [...] 
> > Last bit for now. This is changing the reporting madness. It moves it in
> > its own function which is called after the big switch statement.
> > If you hit a bad error in the switch the code should eiter goto fail or
> > flush. 
> > The new function rt_report is actually constructing the rt message out of
> > a rtentry. It does not magically update the message that was passed in
> > from userland. I think this is much safer and works better.
> 
> [...]
> 
> > +   /*
> > +* From here on these vars need to be valid
> > +* rt, rtm, error, so, m, tableid, sa_family
> 
> Why not use KASSERT() rather than a comment?

I could not come up easily with proper KASSERT() statements.
Will ponder a bit more about this.

> 
> ok mpi@  either way .
> 

-- 
:wq Claudio



[WWW] Reverse chronological order for faq/current.html

2017-01-23 Thread Raf Czlonka
Hi all,

As faq/current.html[0] grows, each major change is being added at
the very bottom, chronologically. There already are several other
pages where this kind of ordering makes sense, i.e. innovations.html[1].

Given the "current" (unintentional pun) nature of changes on the
aforementioned page, it seem like reverse chronological order would
suit it better, as is the case with, i.e. events.html[2].

The proposed change has a clear benefit to those following -current
- i.e. not having to scroll all the way to the bottom of the page
(whichever way one does it) to get the information about the most
recent changes - since they would always be available at the very
top.

Obviously, doing it now, before the -release has been cut, doesn't
make any sense whatsoever, but the change could be implemented when
faq/current.html gets a reset... perhaps even with a style guide
as it is done in events.html?



What do people think about introducing such change?

Regards,

Raf

[0] https://www.openbsd.org/faq/current.html
[1] https://www.openbsd.org/innovations.html
[2] https://www.openbsd.org/events.html



Help for ddb(4)'s "ps /p"

2017-01-23 Thread Martin Pieuchot
Now that pfind(9) takes tid we need a way to show TID in ddb(4) ps
output.  Otherwise it's hard to use "ps /p"

Diff below does that and fix some documentation about "show all procs".

pfind(9) manual still needs a fix, and we can all decide afterward if
we should rename "/p" into "/t".

Here's the difference:

Before:
   PID   PPID   PGRPUID  S   FLAGS  WAIT  COMMAND
*50672  7  50672  0  7 0x3sysctl
 7  1  7  0  30x10008b  pause ksh
 90555  1  90555  0  30x100098  poll  cron

After:
   PID TID   PPID   PGRPUID  S   FLAGS  WAIT  COMMAND
*52636  496486  51956  52636  0  7 0x3sysctl
 51956  407679  1  51956  0  30x10008b  pause ksh
 42726  321266  1  42726  0  30x100098  poll  cron

ok?

Index: sys/kern/kern_proc.c
===
RCS file: /cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.72
diff -u -p -r1.72 kern_proc.c
--- sys/kern/kern_proc.c21 Jan 2017 05:42:03 -  1.72
+++ sys/kern/kern_proc.c23 Jan 2017 23:47:49 -
@@ -471,8 +471,8 @@ db_show_all_procs(db_expr_t addr, int ha
"COMMAND", "STRUCT PROC *", "UAREA *", "VMSPACE/VM_MAP");
break;
case 'n':
-   db_printf("   PID  %5s  %5s  %5s  S  %10s  %-12s  %-16s\n",
-   "PPID", "PGRP", "UID", "FLAGS", "WAIT", "COMMAND");
+   db_printf("   PID  %6s  %5s  %5s  %5s  S  %10s  %-12s  %-16s\n",
+   "TID", "PPID", "PGRP", "UID", "FLAGS", "WAIT", "COMMAND");
break;
case 'w':
db_printf("   TID  %-16s  %-8s  %18s  %s\n",
@@ -497,8 +497,14 @@ db_show_all_procs(db_expr_t addr, int ha
ci_schedstate.spc_idleproc == p)
continue;
}
-   db_printf("%c%5d  ", p == curproc ? '*' : ' ',
-   *mode == 'n' ? pr->ps_pid : p->p_tid);
+
+   if (*mode == 'n') {
+   db_printf("%c%5d  ", (p == curproc ?
+   '*' : ' '), pr->ps_pid);
+   } else {
+   db_printf("%c%6d  ", (p == curproc ?
+   '*' : ' '), p->p_tid);
+   }
 
switch (*mode) {
 
@@ -508,9 +514,9 @@ db_show_all_procs(db_expr_t addr, int ha
break;
 
case 'n':
-   db_printf("%5d  %5d  %5d  %d  %#10x  "
-   "%-12.12s  %-16s\n",
-   ppr ? ppr->ps_pid : -1,
+   db_printf("%6d  %5d  %5d  %5d  %d  "
+   "%#10x  %-12.12s  %-16s\n",
+   p->p_tid, ppr ? ppr->ps_pid : -1,
pr->ps_pgrp ? pr->ps_pgrp->pg_id : 
-1,
pr->ps_ucred->cr_ruid, p->p_stat,
p->p_flag | pr->ps_flags,
Index: share/man/man4/ddb.4
===
RCS file: /cvs/src/share/man/man4/ddb.4,v
retrieving revision 1.82
diff -u -p -r1.82 ddb.4
--- share/man/man4/ddb.41 Sep 2016 12:24:56 -   1.82
+++ share/man/man4/ddb.423 Jan 2017 23:38:09 -
@@ -542,7 +542,7 @@ The
 .Cm /p
 modifier interprets the
 .Ar frameaddr
-argument as the PID of a process and shows the stack trace of
+argument as the TID of a process and shows the stack trace of
 that process.
 The
 .Cm /p
@@ -832,7 +832,7 @@ Display information on all processes.
 (Default) Show process information in a
 .Xr ps 1 Ns \&-like
 format.
-Information printed includes thread ID, parent
+Information printed includes process ID, thread ID, parent
 process ID, process group, UID, process status, process flags, process
 command name, and process wait channel message.
 .It Cm /a
@@ -928,7 +928,7 @@ A synonym for the
 .Ic show all callout
 command.
 .\" 
-.It Ic ps Op Cm /anw
+.It Ic ps Op Cm /anow
 A synonym for
 .Ic show all procs .
 .\" 



pfind(9) -> tfind(9)

2017-01-23 Thread Martin Pieuchot
We live in a thread world!  Let's face it.

ok?

Index: sys/arch/amd64/amd64/db_trace.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v
retrieving revision 1.23
diff -u -p -r1.23 db_trace.c
--- sys/arch/amd64/amd64/db_trace.c 16 Sep 2016 19:13:16 -  1.23
+++ sys/arch/amd64/amd64/db_trace.c 24 Jan 2017 00:10:47 -
@@ -181,7 +181,7 @@ db_stack_trace_print(db_expr_t addr, boo
callpc = (db_addr_t)ddb_regs.tf_rip;
} else {
if (trace_proc) {
-   struct proc *p = pfind((pid_t)addr);
+   struct proc *p = tfind((pid_t)addr);
if (p == NULL) {
(*pr) ("not found\n");
return;
Index: sys/arch/arm/arm/db_trace.c
===
RCS file: /cvs/src/sys/arch/arm/arm/db_trace.c,v
retrieving revision 1.9
diff -u -p -r1.9 db_trace.c
--- sys/arch/arm/arm/db_trace.c 20 Sep 2016 08:35:25 -  1.9
+++ sys/arch/arm/arm/db_trace.c 24 Jan 2017 00:10:47 -
@@ -104,7 +104,7 @@ db_stack_trace_print(db_expr_t addr, int
struct proc *p;
struct user *u;
(*pr) ("trace: pid %d ", (int)addr);
-   p = pfind(addr);
+   p = tfind(addr);
if (p == NULL) {
(*pr)("not found\n");
return;
Index: sys/arch/i386/i386/db_trace.c
===
RCS file: /cvs/src/sys/arch/i386/i386/db_trace.c,v
retrieving revision 1.22
diff -u -p -r1.22 db_trace.c
--- sys/arch/i386/i386/db_trace.c   18 Sep 2016 13:38:01 -  1.22
+++ sys/arch/i386/i386/db_trace.c   24 Jan 2017 00:10:47 -
@@ -195,7 +195,7 @@ db_stack_trace_print(db_expr_t addr, boo
} else if (trace_thread) {
(*pr) ("%s: can't trace thread\n", __func__);
} else if (trace_proc) {
-   struct proc *p = pfind((pid_t)addr);
+   struct proc *p = tfind((pid_t)addr);
if (p == NULL) {
(*pr) ("not found\n");
return;
Index: sys/arch/powerpc/ddb/db_trace.c
===
RCS file: /cvs/src/sys/arch/powerpc/ddb/db_trace.c,v
retrieving revision 1.10
diff -u -p -r1.10 db_trace.c
--- sys/arch/powerpc/ddb/db_trace.c 10 Sep 2016 06:36:26 -  1.10
+++ sys/arch/powerpc/ddb/db_trace.c 24 Jan 2017 00:10:47 -
@@ -134,7 +134,7 @@ db_stack_trace_print(db_expr_t addr, int
lr = ddb_regs.srr0;
} else {
if (trace_proc) {
-   struct proc *p = pfind((pid_t)addr);
+   struct proc *p = tfind((pid_t)addr);
if (p == NULL) {
(*pr) ("not found\n");
return;
Index: sys/arch/sparc64/sparc64/db_trace.c
===
RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v
retrieving revision 1.12
diff -u -p -r1.12 db_trace.c
--- sys/arch/sparc64/sparc64/db_trace.c 11 Sep 2016 03:14:04 -  1.12
+++ sys/arch/sparc64/sparc64/db_trace.c 24 Jan 2017 00:10:47 -
@@ -78,7 +78,7 @@ db_stack_trace_print(db_expr_t addr, int
struct proc *p;
struct user *u;
(*pr)("trace: pid %d ", (int)addr);
-   p = pfind(addr);
+   p = tfind(addr);
if (p == NULL) {
(*pr)("not found\n");
return;
Index: sys/kern/kern_fork.c
===
RCS file: /cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.192
diff -u -p -r1.192 kern_fork.c
--- sys/kern/kern_fork.c7 Nov 2016 00:26:32 -   1.192
+++ sys/kern/kern_fork.c24 Jan 2017 00:04:58 -
@@ -559,7 +559,7 @@ alloctid(void)
do {
/* (0 .. TID_MASK+1] */
tid = 1 + (arc4random() & TID_MASK);
-   } while (pfind(tid) != NULL);
+   } while (tfind(tid) != NULL);
 
return (tid);
 }
Index: sys/kern/kern_proc.c
===
RCS file: /cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.72
diff -u -p -r1.72 kern_proc.c
--- sys/kern/kern_proc.c21 Jan 2017 05:42:03 -  1.72
+++ sys/kern/kern_proc.c24 Jan 2017 00:05:06 -
@@ -169,7 +169,7 @@ inferior(struct process *pr, struct proc
  * Locate a proc (thread) by number
  */
 struct proc *
-pfind(pid_t tid)
+tfind(pid_t tid)
 {
struct proc *p;
 
Index: sys/kern/kern_sig.c

let bfd survive link-down

2017-01-23 Thread Peter Hessler
When an interface loses link, we delete all directly connected and
cloned routes.  However, this means we also lose any BFD configuration
on those routes.

Surviving link-down is pretty much mandatory for BFD to work.

So instead, I bypass the route deletion, and clean up the link-local
gateway information.

$ arp -n | grep 203.0.113.9
203.0.113.9  (incomplete)  em1 expired   

$ netstat -rnf inet | grep 203.0.113.9
203.0.113.9link#2 HLcF   2   53 - 3 em1  

OK?

Index: sys/net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.347
diff -u -p -u -p -r1.347 route.c
--- sys/net/route.c 20 Jan 2017 08:10:54 -  1.347
+++ sys/net/route.c 24 Jan 2017 01:32:23 -
@@ -1760,13 +1760,28 @@ rt_if_linkstate_change(struct rtentry *r
 * new routes from a better source.
 */
if (ISSET(rt->rt_flags, RTF_CLONED|RTF_DYNAMIC) &&
-   !ISSET(rt->rt_flags, RTF_CACHED)) {
+   !ISSET(rt->rt_flags, RTF_CACHED)
+#ifdef BFD
+   && !ISSET(rt->rt_flags, RTF_BFD)
+#endif
+   ) {
int error;
 
if ((error = rtdeletemsg(rt, ifp, id)))
return (error);
return (EAGAIN);
}
+#ifdef BFD
+   if (ISSET(rt->rt_flags, RTF_BFD) &&
+   ISSET(rt->rt_flags, RTF_CLONED) &&
+   (rt->rt_gateway->sa_family == AF_LINK)) {
+   struct sockaddr_dl *sdl;
+   sdl = (struct sockaddr_dl *)rt->rt_gateway;
+   memset(sdl->sdl_data, 0, sdl->sdl_alen);
+   sdl->sdl_alen = 0;
+   rt->rt_expire = time_uptime;
+   }
+#endif
/* take route down */
rt->rt_flags &= ~RTF_UP;
rtable_mpath_reprio(id, rt_key(rt),


-- 
Show me a man who is a good loser and I'll show you a man who is
playing golf with his boss.



Re: [WWW] Reverse chronological order for faq/current.html

2017-01-23 Thread Theo de Raadt
> As faq/current.html[0] grows, each major change is being added at
> the very bottom, chronologically. There already are several other
> pages where this kind of ordering makes sense, i.e. innovations.html[1].
> 
> Given the "current" (unintentional pun) nature of changes on the
> aforementioned page, it seem like reverse chronological order would
> suit it better, as is the case with, i.e. events.html[2].

This page includes remedial actions a current-follower needs, which
are generally cut  A reader decides "Where was I last time",
then would intuitively move forward.

The proposal doesn't make sense to me.



Re: Help for ddb(4)'s "tr /p"

2017-01-23 Thread Martin Pieuchot
On 24/01/17(Tue) 10:08, Philip Guenther wrote:
> On Tue, 24 Jan 2017, Martin Pieuchot wrote:
> > Now that pfind(9) takes tid we need a way to show TID in ddb(4) ps
> > output.  Otherwise it's hard to use "ps /p"
> 
> You mean "tr /p", right?

I do :)

> Hmm, ps/n (the default) is the only of the ps subcommands to not show the 
> TID...so that it can show more columns of COMMAND.  You've added 8 columns 
> without shrinking any of the existing ones, which will cause an unreadable 
> mess when it wraps around.  Maybe drop PGRP from /n and we add a new one 
> that's more like (userspace) ps -j ?

Here's the updated output.

   PID TID   PPIDUID  S   FLAGS  WAIT  COMMAND
*97567  517976  11478  0  7 0x3sysctl
 11478  302289  1  0  30x10008b  pause ksh
 68659  470461  1  0  30x100098  poll  cron


I like the idea of "ps /j", I could also merge the information in the
current "ps /w" since EMUL is always 'native' now.  What do you think?

ok for the diff below?

Index: sys/kern/kern_proc.c
===
RCS file: /cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.73
diff -u -p -r1.73 kern_proc.c
--- sys/kern/kern_proc.c24 Jan 2017 00:58:55 -  1.73
+++ sys/kern/kern_proc.c24 Jan 2017 01:36:05 -
@@ -471,8 +471,8 @@ db_show_all_procs(db_expr_t addr, int ha
"COMMAND", "STRUCT PROC *", "UAREA *", "VMSPACE/VM_MAP");
break;
case 'n':
-   db_printf("   PID  %5s  %5s  %5s  S  %10s  %-12s  %-16s\n",
-   "PPID", "PGRP", "UID", "FLAGS", "WAIT", "COMMAND");
+   db_printf("   PID  %6s  %5s  %5s  S  %10s  %-12s  %-16s\n",
+   "TID", "PPID", "UID", "FLAGS", "WAIT", "COMMAND");
break;
case 'w':
db_printf("   TID  %-16s  %-8s  %18s  %s\n",
@@ -497,8 +497,14 @@ db_show_all_procs(db_expr_t addr, int ha
ci_schedstate.spc_idleproc == p)
continue;
}
-   db_printf("%c%5d  ", p == curproc ? '*' : ' ',
-   *mode == 'n' ? pr->ps_pid : p->p_tid);
+
+   if (*mode == 'n') {
+   db_printf("%c%5d  ", (p == curproc ?
+   '*' : ' '), pr->ps_pid);
+   } else {
+   db_printf("%c%6d  ", (p == curproc ?
+   '*' : ' '), p->p_tid);
+   }
 
switch (*mode) {
 
@@ -508,10 +514,9 @@ db_show_all_procs(db_expr_t addr, int ha
break;
 
case 'n':
-   db_printf("%5d  %5d  %5d  %d  %#10x  "
+   db_printf("%6d  %5d  %5d  %d  %#10x  "
"%-12.12s  %-16s\n",
-   ppr ? ppr->ps_pid : -1,
-   pr->ps_pgrp ? pr->ps_pgrp->pg_id : 
-1,
+   p->p_tid, ppr ? ppr->ps_pid : -1,
pr->ps_ucred->cr_ruid, p->p_stat,
p->p_flag | pr->ps_flags,
(p->p_wchan && p->p_wmesg) ?
Index: share/man/man4/ddb.4
===
RCS file: /cvs/src/share/man/man4/ddb.4,v
retrieving revision 1.83
diff -u -p -r1.83 ddb.4
--- share/man/man4/ddb.424 Jan 2017 01:01:33 -  1.83
+++ share/man/man4/ddb.424 Jan 2017 01:45:48 -
@@ -833,8 +833,8 @@ Display information on all processes.
 .Xr ps 1 Ns \&-like
 format.
 Information printed includes process ID, thread ID, parent
-process ID, process group, UID, process status, process flags, process
-command name, and process wait channel message.
+process ID, UID, process status, process flags, process
+wait channel message and process command name.
 .It Cm /a
 Shows the kernel virtual addresses of each process'
 proc structure, u-area, and vmspace structure.



Re: Help for ddb(4)'s "tr /p"

2017-01-23 Thread Philip Guenther
On Tue, 24 Jan 2017, Martin Pieuchot wrote:
> On 24/01/17(Tue) 10:08, Philip Guenther wrote:
...
> > Hmm, ps/n (the default) is the only of the ps subcommands to not show 
> > the TID...so that it can show more columns of COMMAND.  You've added 8 
> > columns without shrinking any of the existing ones, which will cause 
> > an unreadable mess when it wraps around.  Maybe drop PGRP from /n and 
> > we add a new one that's more like (userspace) ps -j ?
> 
> Here's the updated output.
> 
>PID TID   PPIDUID  S   FLAGS  WAIT  COMMAND
> *97567  517976  11478  0  7 0x3sysctl
>  11478  302289  1  0  30x10008b  pause ksh
>  68659  470461  1  0  30x100098  poll  cron
> 
> 
> I like the idea of "ps /j", I could also merge the information in the
> current "ps /w" since EMUL is always 'native' now.  What do you think?

Just s/EMUL/PGID/ and wait for someone to need something more?


> ok for the diff below?

Almost

> Index: sys/kern/kern_proc.c
> ===
> RCS file: /cvs/src/sys/kern/kern_proc.c,v
> retrieving revision 1.73
> diff -u -p -r1.73 kern_proc.c
> --- sys/kern/kern_proc.c  24 Jan 2017 00:58:55 -  1.73
> +++ sys/kern/kern_proc.c  24 Jan 2017 01:36:05 -
> @@ -471,8 +471,8 @@ db_show_all_procs(db_expr_t addr, int ha
>   "COMMAND", "STRUCT PROC *", "UAREA *", "VMSPACE/VM_MAP");
>   break;
>   case 'n':
> - db_printf("   PID  %5s  %5s  %5s  S  %10s  %-12s  %-16s\n",
> - "PPID", "PGRP", "UID", "FLAGS", "WAIT", "COMMAND");
> + db_printf("   PID  %6s  %5s  %5s  S  %10s  %-12s  %-16s\n",
> + "TID", "PPID", "UID", "FLAGS", "WAIT", "COMMAND");

TID is %6s instead of %5s, so should change %-16s to %-15s for COMMAND, 
both here and...

>   case 'n':
> - db_printf("%5d  %5d  %5d  %d  %#10x  "
> + db_printf("%6d  %5d  %5d  %d  %#10x  "
>   "%-12.12s  %-16s\n",

here.



document that RES_USE_EDNS0 and RES_USE_DNSSEC currently do nothing

2017-01-23 Thread Kirill Miazine

To reduce the risk that others spend hours doing the same excercise that
myself (https://marc.info/?l=openbsd-misc=148513378718363), here's
a patch that mentiones that RES_USE_EDNS0 and RES_USE_DNSSEC options
currently do nothing.

Index: lib/libc/net/resolver.3
===
RCS file: /cvs/src/lib/libc/net/resolver.3,v
retrieving revision 1.33
diff -u -p -r1.33 resolver.3
--- lib/libc/net/resolver.3 16 Dec 2015 18:12:42 -  1.33
+++ lib/libc/net/resolver.3 23 Jan 2017 22:53:58 -
@@ -187,11 +187,11 @@ This informs DNS servers of a client's r
allowing them to take advantage of a non-default receive buffer size,
and thus to send larger replies.
DNS query packets with the EDNS0 extension are not compatible with
-non-EDNS0 DNS servers.
+non-EDNS0 DNS servers.  At the moment this option does nothing.
.It Dv RES_USE_DNSSEC
Request that the resolver uses
Domain Name System Security Extensions (DNSSEC),
-as defined in RFCs 4033, 4034, and 4035.
+as defined in RFCs 4033, 4034, and 4035. At the moment this option does
nothing.
.El
.Pp
The



httpd TLS ticket support

2017-01-23 Thread Claudio Jeker
Since I just added ticket support to libtls here is a diff to enable it
in httpd.

Cheers
-- 
:wq Claudio

Index: config.c
===
RCS file: /cvs/src/usr.sbin/httpd/config.c,v
retrieving revision 1.50
diff -u -p -r1.50 config.c
--- config.c6 Nov 2016 10:49:38 -   1.50
+++ config.c22 Jan 2017 02:02:03 -
@@ -146,6 +146,7 @@ config_getcfg(struct httpd *env, struct 
memcpy(, imsg->data, sizeof(cf));
env->sc_opts = cf.cf_opts;
env->sc_flags = cf.cf_flags;
+   memcpy(env->sc_tls_sid, cf.cf_tls_sid, sizeof(env->sc_tls_sid));
 
what = ps->ps_what[privsep_process];
 
Index: httpd.c
===
RCS file: /cvs/src/usr.sbin/httpd/httpd.c,v
retrieving revision 1.63
diff -u -p -r1.63 httpd.c
--- httpd.c 9 Jan 2017 14:49:22 -   1.63
+++ httpd.c 23 Jan 2017 04:18:44 -
@@ -57,6 +57,7 @@ intparent_dispatch_server(int, struct
struct imsg *);
 int parent_dispatch_logger(int, struct privsep_proc *,
struct imsg *);
+voidparent_tls_ticket_rekey(int, short, void *);
 
 struct httpd   *httpd_env;
 
@@ -253,6 +254,11 @@ main(int argc, char *argv[])
exit(0);
}
 
+   /* rekey the TLS tickets before pushing the config */
+   parent_tls_ticket_rekey(0, 0, env);
+   /* initialize the TLS session id to a random key for all procs */
+   arc4random_buf(env->sc_tls_sid, sizeof(env->sc_tls_sid));
+
if (parent_configure(env) == -1)
fatalx("configuration failed");
 
@@ -307,6 +313,7 @@ parent_configure(struct httpd *env)
continue;
cf.cf_opts = env->sc_opts;
cf.cf_flags = env->sc_flags;
+   memcpy(cf.cf_tls_sid, env->sc_tls_sid, sizeof(cf.cf_tls_sid));
 
proc_compose(env->sc_ps, id, IMSG_CFG_DONE, , sizeof(cf));
}
@@ -450,6 +457,26 @@ parent_dispatch_logger(int fd, struct pr
}
 
return (0);
+}
+
+void
+parent_tls_ticket_rekey(int fd, short events, void *arg)
+{
+   static struct event  rekeyev;
+   struct httpd*env = arg;
+   struct timeval   tv;
+   struct httpd_tls_ticket  key;
+
+   key.tt_keyrev = arc4random();
+   arc4random_buf(key.tt_key, sizeof(key.tt_key));
+
+   proc_compose_imsg(env->sc_ps, PROC_SERVER, -1, IMSG_TLSTICKET_REKEY,
+   -1, -1, , sizeof(key));
+
+   evtimer_set(, parent_tls_ticket_rekey, env);
+   timerclear();
+   tv.tv_sec = SERVER_TLS_LIFE_TIME / 4;
+   evtimer_add(, );
 }
 
 /*
Index: httpd.conf.5
===
RCS file: /cvs/src/usr.sbin/httpd/httpd.conf.5,v
retrieving revision 1.76
diff -u -p -r1.76 httpd.conf.5
--- httpd.conf.514 Nov 2016 10:28:31 -  1.76
+++ httpd.conf.524 Jan 2017 01:52:34 -
@@ -556,6 +556,8 @@ will be used (secure protocols; TLSv1.2-
 Refer to the
 .Xr tls_config_parse_protocols 3
 function for other valid protocol string values.
+.It Oo Ic no Oc Ic tickets
+Enable or disable TLS session tickets.
 .El
 .El
 .Sh TYPES
Index: httpd.h
===
RCS file: /cvs/src/usr.sbin/httpd/httpd.h,v
retrieving revision 1.125
diff -u -p -r1.125 httpd.h
--- httpd.h 9 Jan 2017 14:49:22 -   1.125
+++ httpd.h 23 Jan 2017 04:17:52 -
@@ -73,6 +73,8 @@
 #define SERVER_MAX_PREFETCH256
 #define SERVER_MIN_PREFETCHED  32
 #define SERVER_HSTS_DEFAULT_AGE31536000
+//#define SERVER_TLS_LIFE_TIME (4 * 3600)
+#define SERVER_TLS_LIFE_TIME   (4 * 60)
 
 #define MEDIATYPE_NAMEMAX  128 /* file name extension */
 #define MEDIATYPE_TYPEMAX  64  /* length of type/subtype */
@@ -105,6 +107,7 @@ enum httpchunk {
 struct ctl_flags {
uint8_t  cf_opts;
uint32_t cf_flags;
+   uint8_t  cf_tls_sid[TLS_MAX_SESSION_ID_LENGTH];
 };
 
 enum key_type {
@@ -215,7 +218,8 @@ enum imsg_type {
IMSG_CFG_DONE,
IMSG_LOG_ACCESS,
IMSG_LOG_ERROR,
-   IMSG_LOG_OPEN
+   IMSG_LOG_OPEN,
+   IMSG_TLSTICKET_REKEY
 };
 
 enum privsep_procid {
@@ -452,6 +456,7 @@ struct server_config {
uint8_t *tls_ocsp_staple;
size_t   tls_ocsp_staple_len;
char*tls_ocsp_staple_file;
+   int  tls_ticket_enabled;
 
uint32_t flags;
int  strip;
@@ -504,6 +509,11 @@ struct server {
 };
 TAILQ_HEAD(serverlist, server);
 
+struct httpd_tls_ticket {
+   uint32_ttt_keyrev;
+   unsigned char   tt_key[TLS_TICKET_KEY_SIZE];
+};
+
 struct httpd {
uint8_t  sc_opts;
uint32_t sc_flags;
@@ -514,6 

Re: freestanding take 2, part 1

2017-01-23 Thread Philip Guenther
On Tue, Jan 24, 2017 at 3:33 PM, Mark Kettenis  wrote:
> Turns out our compilers support different sets of builtins.  So the
> diff below does the following:
>
> * Enables bzero, memcmp, memcpy and memset for all compilers.
> * Enables bcmp, bcopy and memmove for gcc4.
>
> This diff should be a no-op for kernels compiled without
> -ffreestanding.  I've checked a couple of architectures and so far
> have not found any evidence that this isn't the case.
>
> As soon as we start adding -ffreestanding to the kernel Makefiles,
> things will start to change.  That'll be part2.
>
> ok?

Built and rebooted to kernels with this and -ffreestanding (instead of
the list of -fno-builtin-foo options) on both loongson and macppc and
they're happy.  Idea makes sense.  ok guenther@



Re: global mbuf memory limit

2017-01-23 Thread David Gwynne
On Tue, Jan 24, 2017 at 02:55:50AM +0100, Alexander Bluhm wrote:
> On Wed, Dec 14, 2016 at 03:52:32PM +1000, David Gwynne wrote:
> > > Wouldn't it make sense to use atomic operations to keep track of the
> > > amount of memory that was allocated?
> > 
> > the mtx may be slow, but it is a slow path id like to keep simple.
> 
> Is allocate and free of mbuf clusters in the slow path?  I would
> expect that it has to be done for every packet.  An atomic operation
> would not solve the slowdown in the fast path, perhaps a multi-cpu
> counter?

no, this is only done at the level of pool page allocation. the
pool pages are what get cut up into mbufs and mbuf clusters.

the only difference between mbufs and clusters at the pool level
is their size, and therefore which pool they come from.

on my firewall the interesting numbers are:

$ vmstat -m | grep -e ^mbuf -e ^mcl -e ^Name
NameSize Requests FailInUse Pgreq Pgrel Npage Hiwat Minpg Maxpg Idle
mbufpl   256 1502227362815 357415 2547 22176623 22176415 208 953 181
mcl2k   2048 1446266146051 0709 104851993 104851840 153 1712 08   46
mcl4k   4096 7438867115  00 519649 519648   135 0 81
mcl8k   8192 3388545041  00 583882 583881   161 0 81
mcl9k   9216 15293011574 0   18 507792 507790   231 0 80

mbufs get used 1502227362815 / 22176623 or 67739 times per page
allocation. 2k clusters get used 1446266146051 / 104851993 or 13793
per page allocation.

> > > Long run I suppose we want to drop nmbclust and let users tune the
> > > total amount of memory available for clusters and set the initial
> > > amount to a percentage of physical memory?
> > 
> > i do want to move to specifying memory in terms of bytes instead of 2k 
> > clusters.
> 
> That might be better.  But leave it as it is for now.  Percentage
> of physical memory is bad as we have architectures that cannot do
> DMA everywhere.
> 
> > how much should be available by default is a complicated issue.
> 
> Yes.  Maybe we have to reconsier the default.  We had it per cluster
> size before, now the limit is for all clusters.

percentage of memory isnt a good idea. the memory available to the
stack should be sized by how much tcp you want to do, where how
much is determined by the bandwidth and latency you need to support.
that is regardless of whether you have 4G or 512G of RAM.

> > >> void
> > >> mbcpuinit()
> > >> {
> > >> +int i;
> > >> +
> > >>  mbstat = counters_alloc_ncpus(mbstat, MBSTAT_COUNT, M_DEVBUF);
> > >> +
> > >> +pool_cache_init();
> > >> +pool_cache_init();
> > >> +
> > >> +for (i = 0; i < nitems(mclsizes); i++)
> > >> +pool_cache_init([i]);
> > >> }
> 
> Is the pool cache related to the global mbuf limit?

no, but it is possible because of it.

the idea of the per cpu caches is to let cpus operate completely
independently most of the time. having an item limit on a pool means
the cpus need to coordinate on every pool put and get so they can
check and update that limit.

> Apart from the problem that I don't know wether the mutex kills
> performance, the diff looks good.

the tests ive done and simon mages has done show a slight benefit.
id expect to see that grow as we use pools more concurrently.



Re: clean up and modernize test calls in bsd.obj.mk

2017-01-23 Thread Robert Peichaer
On Tue, Jan 24, 2017 at 01:01:57PM +1000, Theo Buehler wrote:
> We're currently using several idioms for conditionally executing code in
> bsd.obj.mk. I'd like to unify them for the sake of readability and
> consistency.  This was done joint with rpe.

OK rpe@
 
> Index: share/mk/bsd.obj.mk
> ===
> RCS file: /cvs/src/share/mk/bsd.obj.mk,v
> retrieving revision 1.18
> diff -u -p -r1.18 bsd.obj.mk
> --- share/mk/bsd.obj.mk   24 Jan 2017 02:56:50 -  1.18
> +++ share/mk/bsd.obj.mk   24 Jan 2017 02:58:50 -
> @@ -33,20 +33,19 @@ obj! _SUBDIRUSE
>   SETOWNER=:; \
>   fi; \
>   [[ -z $$MKDIRS ]] && MKDIRS="mkdir -p"; \
> - if test $$here != $$subdir ; then \
> + if [[ $$here != $$subdir ]]; then \
>   dest=${BSDOBJDIR}/$$subdir ; \
>   echo "$$here/${__objdir} -> $$dest"; \
> - if test ! -L ${__objdir} -o \
> - X`readlink ${__objdir}` != X$$dest; \
> + if [[ ! -L ${__objdir} || `readlink ${__objdir}` != $$dest ]]; \
>   then \
> - if test -e ${__objdir}; then rm -rf ${__objdir}; fi; \
> + [[ -e ${__objdir} ]] && rm -rf ${__objdir}; \
>   ln -sf $$dest ${__objdir}; \
>   $$SETOWNER ${__objdir}; \
>   fi; \
> - if test -d ${BSDOBJDIR}; then \
> - test -d $$dest || $$MKDIRS $$dest; \
> + if [[ -d ${BSDOBJDIR} ]]; then \
> + [[ -d $$dest ]] || $$MKDIRS $$dest; \
>   else \
> - if test -e ${BSDOBJDIR}; then \
> + if [[ -e ${BSDOBJDIR} ]]; then \
>   echo "${BSDOBJDIR} is not a directory"; \
>   else \
>   echo "${BSDOBJDIR} does not exist"; \
> @@ -54,7 +53,7 @@ obj! _SUBDIRUSE
>   fi; \
>   else \
>   dest=$$here/${__objdir} ; \
> - if test ! -d ${__objdir} ; then \
> + if [[ ! -d ${__objdir} ]]; then \
>   echo "making $$dest" ; \
>   $$MKDIRS $$dest; \
>   $$SETOWNER $$dest; \



Re: let bfd survive link-down

2017-01-23 Thread Peter Hessler
On 2017 Jan 24 (Tue) at 02:38:54 +0100 (+0100), Peter Hessler wrote:
:When an interface loses link, we delete all directly connected and
:cloned routes.  However, this means we also lose any BFD configuration
:on those routes.
:
:Surviving link-down is pretty much mandatory for BFD to work.
:
:So instead, I bypass the route deletion, and clean up the link-local
:gateway information.
:
:$ arp -n | grep 203.0.113.9
:203.0.113.9  (incomplete)  em1 expired   
:
:$ netstat -rnf inet | grep 203.0.113.9
:203.0.113.9link#2 HLcF   2   53 - 3 em1  
:
:OK?
:

Even nicer, now I only expire the route, instead of poking at some of
the internals.


Index: sys/net/route.c
===
RCS file: /cvs/src/sys/net/route.c,v
retrieving revision 1.347
diff -u -p -u -p -r1.347 route.c
--- sys/net/route.c 20 Jan 2017 08:10:54 -  1.347
+++ sys/net/route.c 24 Jan 2017 03:55:01 -
@@ -1760,13 +1760,27 @@ rt_if_linkstate_change(struct rtentry *r
 * new routes from a better source.
 */
if (ISSET(rt->rt_flags, RTF_CLONED|RTF_DYNAMIC) &&
-   !ISSET(rt->rt_flags, RTF_CACHED)) {
+   !ISSET(rt->rt_flags, RTF_CACHED)
+#ifdef BFD
+   && !ISSET(rt->rt_flags, RTF_BFD)
+#endif
+   ) {
int error;
 
if ((error = rtdeletemsg(rt, ifp, id)))
return (error);
return (EAGAIN);
}
+#ifdef BFD
+   /*
+* in the BFD case, expire the route
+*/
+   if (ISSET(rt->rt_flags, RTF_BFD) &&
+   ISSET(rt->rt_flags, RTF_CLONED) &&
+   (rt->rt_gateway->sa_family == AF_LINK)) {
+   rt->rt_expire = time_uptime;
+   }
+#endif
/* take route down */
rt->rt_flags &= ~RTF_UP;
rtable_mpath_reprio(id, rt_key(rt),


-- 
Only presidents, editors, and people with tapeworms have the right to
use the editorial "we."



Re: let bfd survive link-down

2017-01-23 Thread Martin Pieuchot
On 24/01/17(Tue) 04:55, Peter Hessler wrote:
> On 2017 Jan 24 (Tue) at 02:38:54 +0100 (+0100), Peter Hessler wrote:
> :When an interface loses link, we delete all directly connected and
> :cloned routes.  However, this means we also lose any BFD configuration
> :on those routes.
> :
> :Surviving link-down is pretty much mandatory for BFD to work.
> :
> :So instead, I bypass the route deletion, and clean up the link-local
> :gateway information.
> :
> :$ arp -n | grep 203.0.113.9
> :203.0.113.9  (incomplete)  em1 expired   
> :
> :$ netstat -rnf inet | grep 203.0.113.9
> :203.0.113.9link#2 HLcF   2   53 - 3 em1  
> :
> :OK?
> :
> 
> Even nicer, now I only expire the route, instead of poking at some of
> the internals.

There's no need to expire it, bringing the route down is enough.
Expiration means you assume the route represents a L2 entry.  Plus
rt_expire should be only modified by ARP/NDP code.

Just change:

!ISSET(rt->rt_flags, RTF_CACHED)

into
!ISSET(rt->rt_flags, RTF_CACHED|RTF_BFD)


But this really show that BFD data structure don't need to be linked to
rtentries.

> Index: sys/net/route.c
> ===
> RCS file: /cvs/src/sys/net/route.c,v
> retrieving revision 1.347
> diff -u -p -u -p -r1.347 route.c
> --- sys/net/route.c   20 Jan 2017 08:10:54 -  1.347
> +++ sys/net/route.c   24 Jan 2017 03:55:01 -
> @@ -1760,13 +1760,27 @@ rt_if_linkstate_change(struct rtentry *r
>* new routes from a better source.
>*/
>   if (ISSET(rt->rt_flags, RTF_CLONED|RTF_DYNAMIC) &&
> - !ISSET(rt->rt_flags, RTF_CACHED)) {
> + !ISSET(rt->rt_flags, RTF_CACHED)
> +#ifdef BFD
> + && !ISSET(rt->rt_flags, RTF_BFD)
> +#endif
> + ) {
>   int error;
>  
>   if ((error = rtdeletemsg(rt, ifp, id)))
>   return (error);
>   return (EAGAIN);
>   }
> +#ifdef BFD
> + /*
> +  * in the BFD case, expire the route
> +  */
> + if (ISSET(rt->rt_flags, RTF_BFD) &&
> + ISSET(rt->rt_flags, RTF_CLONED) &&
> + (rt->rt_gateway->sa_family == AF_LINK)) {
> + rt->rt_expire = time_uptime;
> + }
> +#endif
>   /* take route down */
>   rt->rt_flags &= ~RTF_UP;
>   rtable_mpath_reprio(id, rt_key(rt),
> 
> 
> -- 
> Only presidents, editors, and people with tapeworms have the right to
> use the editorial "we."
> 



ld.so: don't use _dl_exit() for fatal errors

2017-01-23 Thread Philip Guenther

So right now, ld.so simply exits iun various error cases, like unknown 
relocation.  This isn't great, as it's a normal exit when a linking 
failure really should be an abnormal exit as from a fatal signal.  If 
"grep" has a linking failure I want it to *die* instead of meekly 
returning non-zero.

Diff below adds three functions to make fatal error handling simpler:
 - _dl_diedie()
   This makes the process kill itself with SIGKILL via thrkill(2).  It 
   might be nicer to do SIGABRT, but that can be caught so it would need
   to be prepared to use sigaction and suddenly it seems like overkill.

 - _dl_die(fmt, ...)
   This is a printf-like function that prints a prefix of
"ld.so: progname: "
   then the provided format, and finally it adds a newline and calls 
   _dl_diedie().  Makes reporting fatal errors trivial.
   
 - _dl_oom()
   Like _dl_die("out of memory"), for use in the common case of a
   failed malloc/etc


The rest of the diff then converts the existing "_dl_printf()+_dl_exit()" 
and "_dl_printf() + forced segfault" cases into calls to _dl_die() or 
_dl_oom() as appropriate.  Only places which already used _dl_exit() or 
tried to trigger a segfault have been changed to use these.

Tested on amd64, armv7, loongson, macppc, and sparc64 by building a 
version of libc with the 'vis' line removed from Symbols.list, then 
running 'vis' against that .so and verifying the output is something 
like:
vis:vis: undefined symbol 'vis'
ld.so: vis: lazy binding failed!
Killed

Test results for other archs?  oks?


Philip


Index: dl_printf.c
===
RCS file: /cvs/src/libexec/ld.so/dl_printf.c,v
retrieving revision 1.18
diff -u -p -r1.18 dl_printf.c
--- dl_printf.c 23 Jan 2017 13:00:09 -  1.18
+++ dl_printf.c 24 Jan 2017 05:16:04 -
@@ -56,6 +56,7 @@
 
 #include 
 #include 
+#include "resolve.h"   /* for __progname */
 #include "syscall.h"
 #include "util.h"
 
@@ -132,8 +133,10 @@ kdoprnt(int fd, const char *fmt, va_list
 
for (;;) {
while ((ch = *fmt++) != '%') {
-   if (ch == '\0')
+   if (ch == '\0') {
+   _dl_flushbuf();
return;
+   }
putcharfd(ch, fd);
}
lflag = 0;
@@ -221,7 +224,6 @@ reswitch:
putcharfd(ch, fd);
}
}
-   _dl_flushbuf();
 }
 
 static void
@@ -237,4 +239,31 @@ kprintn(int fd, unsigned long ul, int ba
do {
putcharfd(*--p, fd);
} while (p > buf);
+}
+
+static char ldso[] = "ld.so: ";
+
+__dead void
+_dl_die(const char *fmt, ...)
+{
+   va_list ap;
+
+   _dl_printf("%s%s: ", ldso, __progname);
+   va_start(ap, fmt);
+   kdoprnt(2, fmt, ap);
+   _dl_write(2, "\n", 1);
+   va_end(ap);
+
+   _dl_diedie();
+}
+
+__dead void
+_dl_oom(void)
+{
+   static const char oom[] = ": out of memory\n";
+
+   _dl_write(2, ldso, sizeof(ldso) - 1);
+   _dl_write(2, __progname, _dl_strlen(__progname));
+   _dl_write(2, oom, sizeof(oom) - 1);
+   _dl_diedie();
 }
Index: library.c
===
RCS file: /cvs/src/libexec/ld.so/library.c,v
retrieving revision 1.79
diff -u -p -r1.79 library.c
--- library.c   12 Aug 2016 20:39:01 -  1.79
+++ library.c   24 Jan 2017 05:16:04 -
@@ -236,7 +236,7 @@ _dl_tryload_shlib(const char *libname, i
res = NULL; /* silence gcc */
next_load = _dl_calloc(1, sizeof(struct load_list));
if (next_load == NULL)
-   _dl_exit(7);
+   _dl_oom();
next_load->next = load_list;
load_list = next_load;
next_load->start = start;
Index: library_mquery.c
===
RCS file: /cvs/src/libexec/ld.so/library_mquery.c,v
retrieving revision 1.56
diff -u -p -r1.56 library_mquery.c
--- library_mquery.c12 Aug 2016 20:39:01 -  1.56
+++ library_mquery.c24 Jan 2017 05:16:04 -
@@ -176,7 +176,7 @@ _dl_tryload_shlib(const char *libname, i
if (size != 0) {
ld = _dl_malloc(sizeof(struct load_list));
if (ld == NULL)
-   _dl_exit(7);
+   _dl_oom();
ld->start = NULL;
ld->size = size;
ld->moff = TRUNC_PG(phdp->p_vaddr);
@@ -191,7 +191,7 @@ _dl_tryload_shlib(const char *libname, i
/* This phdr has a zfod section */
ld = _dl_calloc(1, 

pool_debug

2017-01-23 Thread Martin Pieuchot
I'd like to force a yield() for every pool_get(9) using PR_WAITOK, just
like we do with malloc(9), in order to ensure that the NET_LOCK() is not
held across context switches. 

ok?

Index: kern/subr_pool.c
===
RCS file: /cvs/src/sys/kern/subr_pool.c,v
retrieving revision 1.204
diff -u -p -r1.204 subr_pool.c
--- kern/subr_pool.c21 Nov 2016 01:44:06 -  1.204
+++ kern/subr_pool.c24 Jan 2017 06:29:09 -
@@ -513,7 +513,7 @@ pool_get(struct pool *pp, int flags)
}
mtx_leave(>pr_mtx);
 
-   if (slowdown && ISSET(flags, PR_WAITOK))
+   if ((slowdown || pool_debug == 2) && ISSET(flags, PR_WAITOK))
yield();
 
if (v == NULL) {



Scheduler ping-pong with preempt()

2017-01-23 Thread Martin Pieuchot
Userland threads are preempt()'d when hogging a CPU or when processing
an AST.  Currently when such a thread is preempted the scheduler looks
for an idle CPU and puts it on its run queue.  That means the number of
involuntary context switch often result in a migration. 

This is not a problem per se and one could argue that if another CPU
is idle it makes sense to move.  However with the KERNEL_LOCK() moving
to another CPU won't necessarily allows the preempt()'d thread to run.
It's even worse, it increases contention.

If you add to this behavior the fact that sched_choosecpu() prefers idle
CPUs in a linear order, meaning CPU0 > CPU1 > .. > CPUN, you'll
understand that the set of idle CPUs will change every time preempt() is
called.

I believe this behavior affects kernel threads by side effect, since
the set of idle CPU changes every time a thread is preempted.  With this
diff the 'softnet' thread didn't move on a 2 CPUs machine during simple
benchmarks.  Without, it plays ping-pong between CPU.

The goal of this diff is to reduce the number of migrations.  You
can compare the value of 'sched_nomigrations' and 'sched_nmigrations'
with and without it.

As usual, I'd like to know what's the impact of this diff on your
favorite benchmark.  Please test and report back.

Index: kern/kern_sched.c
===
RCS file: /cvs/src/sys/kern/kern_sched.c,v
retrieving revision 1.44
diff -u -p -r1.44 kern_sched.c
--- kern/kern_sched.c   21 Jan 2017 05:42:03 -  1.44
+++ kern/kern_sched.c   24 Jan 2017 03:08:23 -
@@ -51,6 +51,8 @@ uint64_t sched_noidle;/* Times we didn
 uint64_t sched_stolen; /* Times we stole proc from other cpus */
 uint64_t sched_choose; /* Times we chose a cpu */
 uint64_t sched_wasidle;/* Times we came out of idle */
+uint64_t sched_nvcsw;  /* voluntary context switches */
+uint64_t sched_nivcsw; /* involuntary context switches */
 
 #ifdef MULTIPROCESSOR
 struct taskq *sbartq;
Index: kern/kern_synch.c
===
RCS file: /cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.136
diff -u -p -r1.136 kern_synch.c
--- kern/kern_synch.c   21 Jan 2017 05:42:03 -  1.136
+++ kern/kern_synch.c   24 Jan 2017 03:08:23 -
@@ -296,6 +296,7 @@ sleep_finish(struct sleep_state *sls, in
if (sls->sls_do_sleep && do_sleep) {
p->p_stat = SSLEEP;
p->p_ru.ru_nvcsw++;
+   sched_nvcsw++;
SCHED_ASSERT_LOCKED();
mi_switch();
} else if (!do_sleep) {
@@ -481,6 +482,7 @@ sys_sched_yield(struct proc *p, void *v,
p->p_stat = SRUN;
setrunqueue(p);
p->p_ru.ru_nvcsw++;
+   sched_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
 
Index: kern/sched_bsd.c
===
RCS file: /cvs/src/sys/kern/sched_bsd.c,v
retrieving revision 1.43
diff -u -p -r1.43 sched_bsd.c
--- kern/sched_bsd.c9 Mar 2016 13:38:50 -   1.43
+++ kern/sched_bsd.c24 Jan 2017 03:18:24 -
@@ -302,6 +302,7 @@ yield(void)
p->p_stat = SRUN;
setrunqueue(p);
p->p_ru.ru_nvcsw++;
+   sched_nvcsw++;
mi_switch();
SCHED_UNLOCK(s);
 }
@@ -327,9 +328,12 @@ preempt(struct proc *newp)
SCHED_LOCK(s);
p->p_priority = p->p_usrpri;
p->p_stat = SRUN;
+#if 0
p->p_cpu = sched_choosecpu(p);
+#endif
setrunqueue(p);
p->p_ru.ru_nivcsw++;
+   sched_nivcsw++;
mi_switch();
SCHED_UNLOCK(s);
 }
Index: sys/sched.h
===
RCS file: /cvs/src/sys/sys/sched.h,v
retrieving revision 1.41
diff -u -p -r1.41 sched.h
--- sys/sched.h 17 Mar 2016 13:18:47 -  1.41
+++ sys/sched.h 24 Jan 2017 02:10:41 -
@@ -134,6 +134,9 @@ struct schedstate_percpu {
 extern int schedhz;/* ideally: 16 */
 extern int rrticks_init;   /* ticks per roundrobin() */
 
+extern uint64_t sched_nvcsw;   /* voluntary context switches */
+extern uint64_t sched_nivcsw;  /* involuntary context switches */
+
 struct proc;
 void schedclock(struct proc *);
 struct cpu_info;



ypldap(8) tls

2017-01-23 Thread Jonathan Matthew
This adds TLS to ypldap(8), both 'ldaps' (TLS on port 636) and 'tls' (on port
389, after a 'starttls' extended operation) variants.  I've tried this against
one of our AD servers and against ldapd.


Index: Makefile
===
RCS file: /cvs/src/usr.sbin/ypldap/Makefile,v
retrieving revision 1.8
diff -u -p -u -p -r1.8 Makefile
--- Makefile9 Sep 2015 15:33:18 -   1.8
+++ Makefile24 Jan 2017 03:27:47 -
@@ -9,7 +9,7 @@ SRCS=   parse.y ypldap.c log.c  \
 MAN=   ypldap.8 ypldap.conf.5
 
 DPADD= ${LIBEVENT} ${LIBUTIL} ${LIBRPCSVC}
-LDADD= -levent -lutil -lrpcsvc
+LDADD= -ltls -levent -lutil -lrpcsvc
 CFLAGS+=   -I${.CURDIR}
 CFLAGS+=   -Wall
 CFLAGS+=   -Wstrict-prototypes -Wmissing-prototypes
Index: aldap.c
===
RCS file: /cvs/src/usr.sbin/ypldap/aldap.c,v
retrieving revision 1.34
diff -u -p -u -p -r1.34 aldap.c
--- aldap.c 22 Oct 2016 03:37:13 -  1.34
+++ aldap.c 24 Jan 2017 03:27:47 -
@@ -25,6 +25,8 @@
 #include 
 #include 
 
+#include 
+
 #include "aldap.h"
 
 #if 0
@@ -42,6 +44,9 @@ static int isu8cont(unsigned char);
 char   *parseval(char *, size_t);
 intaldap_create_page_control(struct ber_element *,
int, struct aldap_page_control *);
+intaldap_send(struct aldap *,
+   struct ber_element *);
+unsigned long  aldap_application(struct ber_element *);
 
 #ifdef DEBUG
 voidldap_debug_elements(struct ber_element *);
@@ -55,13 +60,21 @@ void ldap_debug_elements(struct 
ber_e
 #define LDAP_DEBUG(x, y)   do { } while (0)
 #endif
 
+unsigned long
+aldap_application(struct ber_element *elm)
+{
+   return BER_TYPE_OCTETSTRING;
+}
+
 int
 aldap_close(struct aldap *al)
 {
-   if (close(al->ber.fd) == -1)
-   return (-1);
+   if (al->fd != -1)
+   if (close(al->ber.fd) == -1)
+   return (-1);
 
ber_free(>ber);
+   evbuffer_free(al->buf);
free(al);
 
return (0);
@@ -74,16 +87,109 @@ aldap_init(int fd)
 
if ((a = calloc(1, sizeof(*a))) == NULL)
return NULL;
-   a->ber.fd = fd;
+   a->buf = evbuffer_new();
+   a->fd = fd;
+   a->ber.fd = -1;
+   ber_set_application(>ber, aldap_application);
 
return a;
 }
 
 int
+aldap_tls(struct aldap *ldap, struct tls_config *cfg, const char *name)
+{
+   ldap->tls = tls_client();
+   if (ldap->tls == NULL) {
+   ldap->err = ALDAP_ERR_OPERATION_FAILED;
+   return (-1);
+   }
+
+   if (tls_configure(ldap->tls, cfg)) {
+   ldap->err = ALDAP_ERR_TLS_ERROR;
+   return (-1);
+   }
+
+   if (tls_connect_socket(ldap->tls, ldap->fd, name)) {
+   ldap->err = ALDAP_ERR_TLS_ERROR;
+   return (-1);
+   }
+
+   if (tls_handshake(ldap->tls)) {
+   ldap->err = ALDAP_ERR_TLS_ERROR;
+   return (-1);
+   }
+
+   ldap->fd = -1;
+   return (0);
+}
+
+int
+aldap_send(struct aldap *ldap, struct ber_element *root)
+{
+   int error, wrote;
+   void *data;
+   size_t len, done;
+
+   len = ber_calc_len(root);
+   error = ber_write_elements(>ber, root);
+   ber_free_elements(root);
+   if (error == -1)
+   return -1;
+
+   ber_get_writebuf(>ber, );
+   done = 0;
+   while (len > 0) {
+   if (ldap->tls != NULL) {
+   wrote = tls_write(ldap->tls, ((char *)data) + done,
+   len);
+   if (wrote == TLS_WANT_POLLIN ||
+   wrote == TLS_WANT_POLLOUT)
+   continue;
+   } else
+   wrote = write(ldap->fd, ((char *)data) + done, len);
+
+   if (wrote == -1)
+   return -1;
+
+   len -= wrote;
+   done += wrote;
+   }
+
+   return 0;
+}
+
+int
+aldap_req_starttls(struct aldap *ldap)
+{
+   struct ber_element *root = NULL, *ber;
+
+   if ((root = ber_add_sequence(NULL)) == NULL)
+   goto fail;
+
+   ber = ber_printf_elements(root, "d{tst", ++ldap->msgid, BER_CLASS_APP,
+   (unsigned long) LDAP_REQ_EXTENDED, LDAP_STARTTLS_OID,
+   BER_CLASS_CONTEXT, (unsigned long) 0);
+   if (ber == NULL) {
+   ldap->err = ALDAP_ERR_OPERATION_FAILED;
+   goto fail;
+   }
+
+   if (aldap_send(ldap, root) == -1)
+   goto fail;
+
+   return (ldap->msgid);
+fail:
+   if (root != NULL)
+   ber_free_elements(root);
+
+   ldap->err = ALDAP_ERR_OPERATION_FAILED;
+ 

freestanding take 2, part 1

2017-01-23 Thread Mark Kettenis
Turns out our compilers support different sets of builtins.  So the
diff below does the following:

* Enables bzero, memcmp, memcpy and memset for all compilers.
* Enables bcmp, bcopy and memmove for gcc4.

This diff should be a no-op for kernels compiled without
-ffreestanding.  I've checked a couple of architectures and so far
have not found any evidence that this isn't the case.

As soon as we start adding -ffreestanding to the kernel Makefiles,
things will start to change.  That'll be part2.

ok?


Index: sys/systm.h
===
RCS file: /cvs/src/sys/sys/systm.h,v
retrieving revision 1.121
diff -u -p -r1.121 systm.h
--- sys/systm.h 29 Dec 2016 12:12:44 -  1.121
+++ sys/systm.h 24 Jan 2017 05:28:04 -
@@ -322,6 +322,16 @@ extern int (*mountroot)(void);
 
 #include 
 
+#define bzero(b, n)__builtin_bzero((b), (n))
+#define memcmp(b1, b2, n)  __builtin_memcmp((b1), (b2), (n))
+#define memcpy(d, s, n)__builtin_memcpy((d), (s), (n))
+#define memset(b, c, n)__builtin_memset((b), (c), (n))
+#if !defined(__clang__) && (defined(__GNUC__) && __GNUC__ >= 4)
+#define bcmp(b1, b2, n)__builtin_bcmp((b1), (b2), (n))
+#define bcopy(s, d, n) __builtin_bcopy((s), (d), (n))
+#define memmove(d, s, n)   __builtin_memmove((d), (s), (n))
+#endif
+
 #if defined(DDB) || defined(KGDB)
 /* debugger entry points */
 void   Debugger(void); /* in DDB only */
Index: lib/libkern/bcmp.c
===
RCS file: /cvs/src/sys/lib/libkern/bcmp.c,v
retrieving revision 1.10
diff -u -p -r1.10 bcmp.c
--- lib/libkern/bcmp.c  10 Jun 2014 04:16:57 -  1.10
+++ lib/libkern/bcmp.c  24 Jan 2017 05:28:04 -
@@ -31,6 +31,8 @@
 
 #include 
 
+#undef bcmp
+
 /*
  * bcmp -- vax cmpc3 instruction
  */
Index: lib/libkern/bcopy.c
===
RCS file: /cvs/src/sys/lib/libkern/bcopy.c,v
retrieving revision 1.1
diff -u -p -r1.1 bcopy.c
--- lib/libkern/bcopy.c 13 Jun 2013 04:33:26 -  1.1
+++ lib/libkern/bcopy.c 24 Jan 2017 05:28:04 -
@@ -35,6 +35,8 @@
 #include 
 #include 
 
+#undef bcopy
+
 /*
  * sizeof(word) MUST BE A POWER OF TWO
  * SO THAT wmask BELOW IS ALL ONES
Index: lib/libkern/bzero.c
===
RCS file: /cvs/src/sys/lib/libkern/bzero.c,v
retrieving revision 1.9
diff -u -p -r1.9 bzero.c
--- lib/libkern/bzero.c 10 Jun 2014 04:16:57 -  1.9
+++ lib/libkern/bzero.c 24 Jan 2017 05:28:04 -
@@ -31,6 +31,8 @@
 
 #include 
 
+#undef bzero
+
 /*
  * bzero -- vax movc5 instruction
  */
Index: lib/libkern/memcmp.c
===
RCS file: /cvs/src/sys/lib/libkern/memcmp.c,v
retrieving revision 1.6
diff -u -p -r1.6 memcmp.c
--- lib/libkern/memcmp.c10 Jun 2014 04:16:57 -  1.6
+++ lib/libkern/memcmp.c24 Jan 2017 05:28:04 -
@@ -34,6 +34,8 @@
 
 #include 
 
+#undef memcmp
+
 /*
  * Compare memory regions.
  */
Index: lib/libkern/memcpy.c
===
RCS file: /cvs/src/sys/lib/libkern/memcpy.c,v
retrieving revision 1.3
diff -u -p -r1.3 memcpy.c
--- lib/libkern/memcpy.c12 Jun 2013 16:44:22 -  1.3
+++ lib/libkern/memcpy.c24 Jan 2017 05:28:04 -
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+#undef memcpy
+
 /*
  * This is designed to be small, not fast.
  */
Index: lib/libkern/memmove.c
===
RCS file: /cvs/src/sys/lib/libkern/memmove.c,v
retrieving revision 1.1
diff -u -p -r1.1 memmove.c
--- lib/libkern/memmove.c   11 Jun 2013 18:04:41 -  1.1
+++ lib/libkern/memmove.c   24 Jan 2017 05:28:04 -
@@ -32,6 +32,8 @@
 #include 
 #include 
 
+#undef memmove
+
 /*
  * This is designed to be small, not fast.
  */
Index: lib/libkern/memset.c
===
RCS file: /cvs/src/sys/lib/libkern/memset.c,v
retrieving revision 1.7
diff -u -p -r1.7 memset.c
--- lib/libkern/memset.c10 Jun 2014 04:16:57 -  1.7
+++ lib/libkern/memset.c24 Jan 2017 05:28:04 -
@@ -39,6 +39,9 @@
 #include 
 #include 
 
+#undef bzero
+#undef memset
+
 #definewsize   sizeof(u_int)
 #definewmask   (wsize - 1)
 



Re: httpd TLS ticket support

2017-01-23 Thread Reyk Floeter

> Am 24.01.2017 um 02:54 schrieb Claudio Jeker :
> 
> Since I just added ticket support to libtls here is a diff to enable it
> in httpd.
> 

Thanks, comments below.

Reyk

> Cheers
> -- 
> :wq Claudio
> 
> Index: config.c
> ===
> RCS file: /cvs/src/usr.sbin/httpd/config.c,v
> retrieving revision 1.50
> diff -u -p -r1.50 config.c
> --- config.c6 Nov 2016 10:49:38 -1.50
> +++ config.c22 Jan 2017 02:02:03 -
> @@ -146,6 +146,7 @@ config_getcfg(struct httpd *env, struct 
>memcpy(, imsg->data, sizeof(cf));
>env->sc_opts = cf.cf_opts;
>env->sc_flags = cf.cf_flags;
> +memcpy(env->sc_tls_sid, cf.cf_tls_sid, sizeof(env->sc_tls_sid));
> 
>what = ps->ps_what[privsep_process];
> 
> Index: httpd.c
> ===
> RCS file: /cvs/src/usr.sbin/httpd/httpd.c,v
> retrieving revision 1.63
> diff -u -p -r1.63 httpd.c
> --- httpd.c9 Jan 2017 14:49:22 -1.63
> +++ httpd.c23 Jan 2017 04:18:44 -
> @@ -57,6 +57,7 @@ int parent_dispatch_server(int, struct
>struct imsg *);
> int parent_dispatch_logger(int, struct privsep_proc *,
>struct imsg *);
> +void parent_tls_ticket_rekey(int, short, void *);
> 
> struct httpd*httpd_env;
> 
> @@ -253,6 +254,11 @@ main(int argc, char *argv[])
>exit(0);
>}
> 
> +/* rekey the TLS tickets before pushing the config */
> +parent_tls_ticket_rekey(0, 0, env);

Why not doing this in parent_configure() -
this way it would rekey on config reload and look like the better place.

> +/* initialize the TLS session id to a random key for all procs */
> +arc4random_buf(env->sc_tls_sid, sizeof(env->sc_tls_sid));
> +
>if (parent_configure(env) == -1)
>fatalx("configuration failed");
> 
> @@ -307,6 +313,7 @@ parent_configure(struct httpd *env)
>continue;
>cf.cf_opts = env->sc_opts;
>cf.cf_flags = env->sc_flags;
> +memcpy(cf.cf_tls_sid, env->sc_tls_sid, sizeof(cf.cf_tls_sid));
> 
>proc_compose(env->sc_ps, id, IMSG_CFG_DONE, , sizeof(cf));
>}
> @@ -450,6 +457,26 @@ parent_dispatch_logger(int fd, struct pr
>}
> 
>return (0);
> +}
> +
> +void
> +parent_tls_ticket_rekey(int fd, short events, void *arg)
> +{
> +static struct event rekeyev;
> +struct httpd*env = arg;
> +struct timeval tv;
> +struct httpd_tls_ticket key;
> +
> +key.tt_keyrev = arc4random();
> +arc4random_buf(key.tt_key, sizeof(key.tt_key));
> +
> +proc_compose_imsg(env->sc_ps, PROC_SERVER, -1, IMSG_TLSTICKET_REKEY,
> +-1, -1, , sizeof(key));
> +
> +evtimer_set(, parent_tls_ticket_rekey, env);
> +timerclear();
> +tv.tv_sec = SERVER_TLS_LIFE_TIME / 4;
> +evtimer_add(, );
> }
> 
> /*
> Index: httpd.conf.5
> ===
> RCS file: /cvs/src/usr.sbin/httpd/httpd.conf.5,v
> retrieving revision 1.76
> diff -u -p -r1.76 httpd.conf.5
> --- httpd.conf.514 Nov 2016 10:28:31 -1.76
> +++ httpd.conf.524 Jan 2017 01:52:34 -
> @@ -556,6 +556,8 @@ will be used (secure protocols; TLSv1.2-
> Refer to the
> .Xr tls_config_parse_protocols 3
> function for other valid protocol string values.
> +.It Oo Ic no Oc Ic tickets
> +Enable or disable TLS session tickets.

What is the default? Should be mentioned here.

> .El
> .El
> .Sh TYPES
> Index: httpd.h
> ===
> RCS file: /cvs/src/usr.sbin/httpd/httpd.h,v
> retrieving revision 1.125
> diff -u -p -r1.125 httpd.h
> --- httpd.h9 Jan 2017 14:49:22 -1.125
> +++ httpd.h23 Jan 2017 04:17:52 -
> @@ -73,6 +73,8 @@
> #define SERVER_MAX_PREFETCH256
> #define SERVER_MIN_PREFETCHED32
> #define SERVER_HSTS_DEFAULT_AGE31536000
> +//#define SERVER_TLS_LIFE_TIME(4 * 3600)

This // comment seems to be a leftover.

> +#define SERVER_TLS_LIFE_TIME(4 * 60)
> 
> #define MEDIATYPE_NAMEMAX128/* file name extension */
> #define MEDIATYPE_TYPEMAX64/* length of type/subtype */
> @@ -105,6 +107,7 @@ enum httpchunk {
> struct ctl_flags {
>uint8_t cf_opts;
>uint32_t cf_flags;
> +uint8_t cf_tls_sid[TLS_MAX_SESSION_ID_LENGTH];
> };
> 
> enum key_type {
> @@ -215,7 +218,8 @@ enum imsg_type {
>IMSG_CFG_DONE,
>IMSG_LOG_ACCESS,
>IMSG_LOG_ERROR,
> -IMSG_LOG_OPEN
> +IMSG_LOG_OPEN,
> +IMSG_TLSTICKET_REKEY
> };
> 
> enum privsep_procid {
> @@ -452,6 +456,7 @@ struct server_config {
>uint8_t*tls_ocsp_staple;
>size_t tls_ocsp_staple_len;
>char*tls_ocsp_staple_file;
> +int tls_ticket_enabled;
> 
>uint32_t flags;
>int strip;
> @@ -504,6 +509,11 @@ struct server {
> };
> TAILQ_HEAD(serverlist, server);
> 
> 

Re: NET_LOCK() ordering issue

2017-01-23 Thread Philip Guenther
On Mon, Jan 23, 2017 at 4:01 PM, Martin Pieuchot  wrote:
> On 24/01/17(Tue) 00:51, Alexander Bluhm wrote:
>> On Sat, Jan 21, 2017 at 10:46:42AM +1000, Martin Pieuchot wrote:
...
> Updated diff, thanks for your review.

Very close...


> --- kern/uipc_syscalls.c29 Dec 2016 12:12:43 -  1.144
> +++ kern/uipc_syscalls.c23 Jan 2017 23:59:34 -
...
> @@ -277,19 +277,35 @@ doaccept(struct proc *p, int sock, struc
> return (error);
>
> headfp = fp;
> +   head = headfp->f_data;
> +
> +   fdplock(fdp);
> +   error = falloc(p, , );
> +   if (!error && (flags & SOCK_CLOEXEC))
> +   fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
> +   fdpunlock(fdp);
> +   if (error) {
> +   /*
> +* Probably ran out of file descriptors.  Wakeup
> +* so some other process might have a chance at it.
> +*/
> +   wakeup_one(>so_timeo);

This comment and wakeup_one() call should be left out.  They were
needed in the original location because the thread had consumed the
original wakeup from the new connection being completed at that point
and when it gave up (because falloc() failed) it needed to recreate
that for the next process.  Now, doing the falloc() before we don't
need or want that extra wakeup.

Otherwise, looks good.


Philip Guenther



Re: ypldap(8) tls

2017-01-23 Thread Theo de Raadt
 DPADD= ${LIBEVENT} ${LIBUTIL} ${LIBRPCSVC}
-LDADD= -levent -lutil -lrpcsvc
+LDADD= -ltls -levent -lutil -lrpcsvc

You will need to use:

DPADD=${LIBTLS} ${LIBSSL} ${LIBCRYPTO} ${LIBEVENT} ${LIBUTIL} 
${LIBRPCSVC}
LDADD=-ltls -lssl -lcrypto -levent -lutil -lrpcsvc



Re: Help for ddb(4)'s "tr /p"

2017-01-23 Thread Martin Pieuchot
On 24/01/17(Tue) 11:35, Philip Guenther wrote:
> On Tue, 24 Jan 2017, Martin Pieuchot wrote:
> > On 24/01/17(Tue) 10:08, Philip Guenther wrote:
> ...
> > > Hmm, ps/n (the default) is the only of the ps subcommands to not show 
> > > the TID...so that it can show more columns of COMMAND.  You've added 8 
> > > columns without shrinking any of the existing ones, which will cause 
> > > an unreadable mess when it wraps around.  Maybe drop PGRP from /n and 
> > > we add a new one that's more like (userspace) ps -j ?
> > 
> > Here's the updated output.
> > 
> >PID TID   PPIDUID  S   FLAGS  WAIT  COMMAND
> > *97567  517976  11478  0  7 0x3sysctl
> >  11478  302289  1  0  30x10008b  pause ksh
> >  68659  470461  1  0  30x100098  poll  cron
> > 
> > 
> > I like the idea of "ps /j", I could also merge the information in the
> > current "ps /w" since EMUL is always 'native' now.  What do you think?
> 
> Just s/EMUL/PGID/ and wait for someone to need something more?

Diff below does that and fix alignment issues with /a /o and /w.  The
problem was that TID needs 6 columns not 5.

With this ps /a no longer wraps on i386.

ddb{0}> ps /a
TID  COMMAND STRUCT PROC * UAREA *  VMSPACE/VM_MAP
* 70709  sysctl 0xd94e9418  0xf551c000  0xd94e7144
 154620  ksh0xd94e9c28  0xf5544000  0xd94e7324
 278101  cron   0xd94e9820  0xf557c000  0xd94e7004

ok?

Index: sys/kern/kern_proc.c
===
RCS file: /cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.74
diff -u -p -r1.74 kern_proc.c
--- sys/kern/kern_proc.c24 Jan 2017 04:50:48 -  1.74
+++ sys/kern/kern_proc.c24 Jan 2017 05:03:38 -
@@ -467,7 +467,7 @@ db_show_all_procs(db_expr_t addr, int ha
switch (*mode) {
 
case 'a':
-   db_printf("   TID  %-10s  %18s  %18s  %18s\n",
+   db_printf("TID  %-9s  %18s  %18s  %18s\n",
"COMMAND", "STRUCT PROC *", "UAREA *", "VMSPACE/VM_MAP");
break;
case 'n':
@@ -475,12 +475,12 @@ db_show_all_procs(db_expr_t addr, int ha
"TID", "PPID", "UID", "FLAGS", "WAIT", "COMMAND");
break;
case 'w':
-   db_printf("   TID  %-16s  %-8s  %18s  %s\n",
-   "COMMAND", "EMUL", "WAIT-CHANNEL", "WAIT-MSG");
+   db_printf("TID  %-15s  %-5s  %18s  %s\n",
+   "COMMAND", "PGRP", "WAIT-CHANNEL", "WAIT-MSG");
break;
case 'o':
skipzomb = 1;
-   db_printf("   TID  %5s  %5s  %10s %10s  %3s  %-31s\n",
+   db_printf("TID  %5s  %5s  %10s %10s  %3s  %-30s\n",
"PID", "UID", "PRFLAGS", "PFLAGS", "CPU", "COMMAND");
break;
}
@@ -509,7 +509,7 @@ db_show_all_procs(db_expr_t addr, int ha
switch (*mode) {
 
case 'a':
-   db_printf("%-10.10s  %18p  %18p  
%18p\n",
+   db_printf("%-9.9s  %18p  %18p  %18p\n",
pr->ps_comm, p, p->p_addr, 
p->p_vmspace);
break;
 
@@ -524,9 +524,11 @@ db_show_all_procs(db_expr_t addr, int ha
break;
 
case 'w':
-   db_printf("%-16s  %-8s  %18p  %s\n", 
pr->ps_comm,
-   pr->ps_emul->e_name, p->p_wchan,
-   (p->p_wchan && p->p_wmesg) ? 
+   db_printf("%-15s  %-5d  %18p  %s\n",
+   pr->ps_comm, (pr->ps_pgrp ?
+   pr->ps_pgrp->pg_id : -1),
+   p->p_wchan,
+   (p->p_wchan && p->p_wmesg) ?
p->p_wmesg : "");
break;
 
Index: share/man/man4/ddb.4
===
RCS file: /cvs/src/share/man/man4/ddb.4,v
retrieving revision 1.84
diff -u -p -r1.84 ddb.4
--- share/man/man4/ddb.424 Jan 2017 04:50:48 -  1.84
+++ share/man/man4/ddb.424 Jan 2017 05:03:38 -
@@ -849,7 +849,7 @@ Shows non-idle threads that were on CPU 
 Information printed includes thread ID, process ID, UID, process flags,
 thread flags, current CPU, and command name.
 .It Cm /w
-Shows each thread's ID, command, system call emulation,
+Shows each thread's ID, command, process group,
 wait channel address, and wait channel message.
 .El
 .\" 



Re: pool_debug

2017-01-23 Thread David Gwynne
On Tue, Jan 24, 2017 at 04:46:01PM +1000, Martin Pieuchot wrote:
> I'd like to force a yield() for every pool_get(9) using PR_WAITOK, just
> like we do with malloc(9), in order to ensure that the NET_LOCK() is not
> held across context switches. 
> 
> ok?

ok

> 
> Index: kern/subr_pool.c
> ===
> RCS file: /cvs/src/sys/kern/subr_pool.c,v
> retrieving revision 1.204
> diff -u -p -r1.204 subr_pool.c
> --- kern/subr_pool.c  21 Nov 2016 01:44:06 -  1.204
> +++ kern/subr_pool.c  24 Jan 2017 06:29:09 -
> @@ -513,7 +513,7 @@ pool_get(struct pool *pp, int flags)
>   }
>   mtx_leave(>pr_mtx);
>  
> - if (slowdown && ISSET(flags, PR_WAITOK))
> + if ((slowdown || pool_debug == 2) && ISSET(flags, PR_WAITOK))
>   yield();
>  
>   if (v == NULL) {
> 



Re: NET_LOCK() ordering issue

2017-01-23 Thread Martin Pieuchot
On 23/01/17(Mon) 18:45, Philip Guenther wrote:
> On Mon, Jan 23, 2017 at 4:01 PM, Martin Pieuchot  wrote:
> [...]
> > @@ -277,19 +277,35 @@ doaccept(struct proc *p, int sock, struc
> > return (error);
> >
> > headfp = fp;
> > +   head = headfp->f_data;
> > +
> > +   fdplock(fdp);
> > +   error = falloc(p, , );
> > +   if (!error && (flags & SOCK_CLOEXEC))
> > +   fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
> > +   fdpunlock(fdp);
> > +   if (error) {
> > +   /*
> > +* Probably ran out of file descriptors.  Wakeup
> > +* so some other process might have a chance at it.
> > +*/
> > +   wakeup_one(>so_timeo);
> 
> This comment and wakeup_one() call should be left out.  They were
> needed in the original location because the thread had consumed the
> original wakeup from the new connection being completed at that point
> and when it gave up (because falloc() failed) it needed to recreate
> that for the next process.  Now, doing the falloc() before we don't
> need or want that extra wakeup.
> 
> Otherwise, looks good.

Thanks, I'm going to commit the version below then.

Index: kern/uipc_syscalls.c
===
RCS file: /cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.144
diff -u -p -r1.144 uipc_syscalls.c
--- kern/uipc_syscalls.c29 Dec 2016 12:12:43 -  1.144
+++ kern/uipc_syscalls.c24 Jan 2017 03:39:56 -
@@ -265,7 +265,7 @@ doaccept(struct proc *p, int sock, struc
 {
struct filedesc *fdp = p->p_fd;
struct file *fp, *headfp;
-   struct mbuf *nam;
+   struct mbuf *nam = NULL;
socklen_t namelen;
int error, s, tmpfd;
struct socket *head, *so;
@@ -277,19 +277,30 @@ doaccept(struct proc *p, int sock, struc
return (error);
 
headfp = fp;
+
+   fdplock(fdp);
+   error = falloc(p, , );
+   if (!error && (flags & SOCK_CLOEXEC))
+   fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
+   fdpunlock(fdp);
+   if (error) {
+   FRELE(headfp, p);
+   return (error);
+   }
+
 redo:
NET_LOCK(s);
head = headfp->f_data;
if (isdnssocket(head) || (head->so_options & SO_ACCEPTCONN) == 0) {
error = EINVAL;
-   goto bad;
+   goto out;
}
if ((head->so_state & SS_NBIO) && head->so_qlen == 0) {
if (head->so_state & SS_CANTRCVMORE)
error = ECONNABORTED;
else
error = EWOULDBLOCK;
-   goto bad;
+   goto out;
}
while (head->so_qlen == 0 && head->so_error == 0) {
if (head->so_state & SS_CANTRCVMORE) {
@@ -298,34 +309,19 @@ redo:
}
error = tsleep(>so_timeo, PSOCK | PCATCH,
"netcon", 0);
-   if (error) {
-   goto bad;
-   }
+   if (error)
+   goto out;
}
if (head->so_error) {
error = head->so_error;
head->so_error = 0;
-   goto bad;
+   goto out;
}
 
/* Figure out whether the new socket should be non-blocking. */
nflag = flags & SOCK_NONBLOCK_INHERIT ? (headfp->f_flag & FNONBLOCK)
: (flags & SOCK_NONBLOCK ? FNONBLOCK : 0);
 
-   fdplock(fdp);
-   error = falloc(p, , );
-   if (error == 0 && (flags & SOCK_CLOEXEC))
-   fdp->fd_ofileflags[tmpfd] |= UF_EXCLOSE;
-   fdpunlock(fdp);
-   if (error != 0) {
-   /*
-* Probably ran out of file descriptors.  Wakeup
-* so some other process might have a chance at it.
-*/
-   wakeup_one(>so_timeo);
-   goto bad;
-   }
-
nam = m_get(M_WAIT, MT_SONAME);
 
/*
@@ -336,10 +332,7 @@ redo:
if (head->so_qlen == 0) {
NET_UNLOCK(s);
m_freem(nam);
-   fdplock(fdp);
-   fdremove(fdp, tmpfd);
-   closef(fp, p);
-   fdpunlock(fdp);
+   nam = NULL;
goto redo;
}
 
@@ -360,24 +353,20 @@ redo:
error = soaccept(so, nam);
if (!error && name != NULL)
error = copyaddrout(p, nam, name, namelen, anamelen);
-
+   if (!error) {
+   (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t), p);
+   FILE_SET_MATURE(fp, p);
+   *retval = tmpfd;
+   }
+out:
+   NET_UNLOCK(s);
+   m_freem(nam);
if (error) {
-   /* if an error occurred, free the file descriptor */
-   NET_UNLOCK(s);
-   m_freem(nam);
fdplock(fdp);
fdremove(fdp, tmpfd);
 

Re: Help for ddb(4)'s "tr /p"

2017-01-23 Thread Philip Guenther
On Tue, 24 Jan 2017, Martin Pieuchot wrote:
> Diff below does that and fix alignment issues with /a /o and /w.  The
> problem was that TID needs 6 columns not 5.
> 
> With this ps /a no longer wraps on i386.
> 
> ddb{0}> ps /a
> TID  COMMAND STRUCT PROC * UAREA *  VMSPACE/VM_MAP
> * 70709  sysctl 0xd94e9418  0xf551c000  0xd94e7144
>  154620  ksh0xd94e9c28  0xf5544000  0xd94e7324
>  278101  cron   0xd94e9820  0xf557c000  0xd94e7004
> 
> ok?

Looks good.  ok guenther@



clean up and modernize test calls in bsd.obj.mk

2017-01-23 Thread Theo Buehler
We're currently using several idioms for conditionally executing code in
bsd.obj.mk. I'd like to unify them for the sake of readability and
consistency.  This was done joint with rpe.

Index: share/mk/bsd.obj.mk
===
RCS file: /cvs/src/share/mk/bsd.obj.mk,v
retrieving revision 1.18
diff -u -p -r1.18 bsd.obj.mk
--- share/mk/bsd.obj.mk 24 Jan 2017 02:56:50 -  1.18
+++ share/mk/bsd.obj.mk 24 Jan 2017 02:58:50 -
@@ -33,20 +33,19 @@ obj! _SUBDIRUSE
SETOWNER=:; \
fi; \
[[ -z $$MKDIRS ]] && MKDIRS="mkdir -p"; \
-   if test $$here != $$subdir ; then \
+   if [[ $$here != $$subdir ]]; then \
dest=${BSDOBJDIR}/$$subdir ; \
echo "$$here/${__objdir} -> $$dest"; \
-   if test ! -L ${__objdir} -o \
-   X`readlink ${__objdir}` != X$$dest; \
+   if [[ ! -L ${__objdir} || `readlink ${__objdir}` != $$dest ]]; \
then \
-   if test -e ${__objdir}; then rm -rf ${__objdir}; fi; \
+   [[ -e ${__objdir} ]] && rm -rf ${__objdir}; \
ln -sf $$dest ${__objdir}; \
$$SETOWNER ${__objdir}; \
fi; \
-   if test -d ${BSDOBJDIR}; then \
-   test -d $$dest || $$MKDIRS $$dest; \
+   if [[ -d ${BSDOBJDIR} ]]; then \
+   [[ -d $$dest ]] || $$MKDIRS $$dest; \
else \
-   if test -e ${BSDOBJDIR}; then \
+   if [[ -e ${BSDOBJDIR} ]]; then \
echo "${BSDOBJDIR} is not a directory"; \
else \
echo "${BSDOBJDIR} does not exist"; \
@@ -54,7 +53,7 @@ obj! _SUBDIRUSE
fi; \
else \
dest=$$here/${__objdir} ; \
-   if test ! -d ${__objdir} ; then \
+   if [[ ! -d ${__objdir} ]]; then \
echo "making $$dest" ; \
$$MKDIRS $$dest; \
$$SETOWNER $$dest; \



Re: clean up and modernize test calls in bsd.obj.mk

2017-01-23 Thread Philip Guenther
fOn Tue, 24 Jan 2017, Theo Buehler wrote:
> We're currently using several idioms for conditionally executing code in 
> bsd.obj.mk. I'd like to unify them for the sake of readability and 
> consistency.  This was done joint with rpe.

Looks good.  The only caveat that comes to mind is that [[...]]'s = and != 
operators treat the right-hand side as a *pattern*, but that looks okay 
for the two uses of != being converted here.


Philip