ospf6d: use ROUTE_FLAGFILTER

2020-09-01 Thread Jonathan Matthew
Like ospfd, ospf6d can use ROUTE_FLAGFILTER to opt out of receiving messages
relating to L2 and broadcast routes on its routing socket.  We've been running
this for a week or so with no problems.

ok?

Index: kroute.c
===
RCS file: /cvs/src/usr.sbin/ospf6d/kroute.c,v
retrieving revision 1.64
diff -u -p -u -p -r1.64 kroute.c
--- kroute.c17 May 2020 18:29:25 -  1.64
+++ kroute.c18 Aug 2020 11:56:09 -
@@ -102,6 +102,7 @@ kr_init(int fs, u_int rdomain, int redis
int opt = 0, rcvbuf, default_rcvbuf;
socklen_t   optlen;
int filter_prio = fib_prio;
+   int filter_flags = RTF_LLINFO | RTF_BROADCAST;
 
kr_state.fib_sync = fs;
kr_state.rdomain = rdomain;
@@ -127,6 +128,12 @@ kr_init(int fs, u_int rdomain, int redis
if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_PRIOFILTER, _prio,
sizeof(filter_prio)) == -1) {
log_warn("%s: setsockopt AF_ROUTE ROUTE_PRIOFILTER", __func__);
+   /* not fatal */
+   }
+
+   if (setsockopt(kr_state.fd, AF_ROUTE, ROUTE_FLAGFILTER, _flags,
+   sizeof(filter_flags)) == -1) {
+   log_warn("%s: setsockopt AF_ROUTE ROUTE_FLAGFILTER", __func__);
/* not fatal */
}
 



Re: [PATCH] Add common PCIE capability list

2020-09-01 Thread Jonathan Gray
On Tue, Sep 01, 2020 at 11:44:03PM -0500, Jordan Hargrave wrote:
> This patch adds a common function for scanning PCIE Express Capability list
> The PCIE Capability list starts at 0x100 in extended PCI configuration space.

This seems to only handle extended capabilities?
Something like pcie_get_ext_capability() would be a better name.

It is 'PCI Express' not 'PCIExpress'

'ofs & 3' test doesn't make sense when PCI_PCIE_ECAP_NEXT() always
masks those bits.

> 
> ---
>  sys/dev/pci/pci.c| 28 
>  sys/dev/pci/pcivar.h |  2 ++
>  2 files changed, 30 insertions(+)
> 
> diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
> index bf75f875e..8f9a5ef7a 100644
> --- a/sys/dev/pci/pci.c
> +++ b/sys/dev/pci/pci.c
> @@ -677,6 +677,34 @@ pci_get_ht_capability(pci_chipset_tag_t pc, pcitag_t 
> tag, int capid,
>   return (0);
>  }
>  
> +int
> +pcie_get_capability(pci_chipset_tag_t pc, pcitag_t tag, int capid,
> +int *offset, pcireg_t *value)
> +{
> + pcireg_t reg;
> + unsigned int ofs;
> +
> + /* Make sure we support PCIExpress device */
> + if (pci_get_capability(pc, tag, PCI_CAP_PCIEXPRESS, NULL, NULL) == 0)
> + return (0);
> + /* Scan PCIExpress capabilities */
> + ofs = PCI_PCIE_ECAP;
> + while (ofs != 0) {
> + if ((ofs & 3) || (ofs < PCI_PCIE_ECAP))
> + return (0);
> + reg = pci_conf_read(pc, tag, ofs);
> + if (PCI_PCIE_ECAP_ID(reg) == capid) {
> + if (offset)
> + *offset = ofs;
> + if (value)
> + *value = reg;
> + return (1);
> + }
> + ofs = PCI_PCIE_ECAP_NEXT(reg);
> + }
> + return (0);
> +}
> +
>  uint16_t
>  pci_requester_id(pci_chipset_tag_t pc, pcitag_t tag)
>  {
> diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h
> index bdfe0404f..0376ba992 100644
> --- a/sys/dev/pci/pcivar.h
> +++ b/sys/dev/pci/pcivar.h
> @@ -233,6 +233,8 @@ int   pci_io_find(pci_chipset_tag_t, pcitag_t, int, 
> bus_addr_t *,
>  int  pci_mem_find(pci_chipset_tag_t, pcitag_t, int, bus_addr_t *,
>   bus_size_t *, int *);
>  
> +int  pcie_get_capability(pci_chipset_tag_t, pcitag_t, int,
> + int *, pcireg_t *);
>  int  pci_get_capability(pci_chipset_tag_t, pcitag_t, int,
>   int *, pcireg_t *);
>  int  pci_get_ht_capability(pci_chipset_tag_t, pcitag_t, int,
> -- 
> 2.26.2
> 
> 



Re: [PATCH] Add IOMMU support for Intel VT-d and AMD-Vi

2020-09-01 Thread Jordan Hargrave
Oh good catch thanks. Weird, it does compile!


From: Daniel Dickman 
Sent: Tuesday, September 1, 2020 11:23 PM
To: Jordan Hargrave 
Cc: tech@openbsd.org 
Subject: Re: [PATCH] Add IOMMU support for Intel VT-d and AMD-Vi

> [PATCH] Add IOMMU support for Intel VT-d and AMD Vi
>
> This hooks each pci device and overrides bus_dmamap_xxx to issue
> remap of DMA requests to virtual DMA space.  It protects devices
> from issuing I/O requests to memory in the system that is outside
> the requested DMA space.

Hi Jordan, thanks for working on this. I would like to see iommu
support...

> + uint64_tefr;
> + uint8_t reserved[8];
> +} __packd;

...that being said, is the above a typo?


[PATCH] Add common PCIE capability list

2020-09-01 Thread Jordan Hargrave
This patch adds a common function for scanning PCIE Express Capability list
The PCIE Capability list starts at 0x100 in extended PCI configuration space.

---
 sys/dev/pci/pci.c| 28 
 sys/dev/pci/pcivar.h |  2 ++
 2 files changed, 30 insertions(+)

diff --git a/sys/dev/pci/pci.c b/sys/dev/pci/pci.c
index bf75f875e..8f9a5ef7a 100644
--- a/sys/dev/pci/pci.c
+++ b/sys/dev/pci/pci.c
@@ -677,6 +677,34 @@ pci_get_ht_capability(pci_chipset_tag_t pc, pcitag_t tag, 
int capid,
return (0);
 }
 
+int
+pcie_get_capability(pci_chipset_tag_t pc, pcitag_t tag, int capid,
+int *offset, pcireg_t *value)
+{
+   pcireg_t reg;
+   unsigned int ofs;
+
+   /* Make sure we support PCIExpress device */
+   if (pci_get_capability(pc, tag, PCI_CAP_PCIEXPRESS, NULL, NULL) == 0)
+   return (0);
+   /* Scan PCIExpress capabilities */
+   ofs = PCI_PCIE_ECAP;
+   while (ofs != 0) {
+   if ((ofs & 3) || (ofs < PCI_PCIE_ECAP))
+   return (0);
+   reg = pci_conf_read(pc, tag, ofs);
+   if (PCI_PCIE_ECAP_ID(reg) == capid) {
+   if (offset)
+   *offset = ofs;
+   if (value)
+   *value = reg;
+   return (1);
+   }
+   ofs = PCI_PCIE_ECAP_NEXT(reg);
+   }
+   return (0);
+}
+
 uint16_t
 pci_requester_id(pci_chipset_tag_t pc, pcitag_t tag)
 {
diff --git a/sys/dev/pci/pcivar.h b/sys/dev/pci/pcivar.h
index bdfe0404f..0376ba992 100644
--- a/sys/dev/pci/pcivar.h
+++ b/sys/dev/pci/pcivar.h
@@ -233,6 +233,8 @@ int pci_io_find(pci_chipset_tag_t, pcitag_t, int, 
bus_addr_t *,
 intpci_mem_find(pci_chipset_tag_t, pcitag_t, int, bus_addr_t *,
bus_size_t *, int *);
 
+intpcie_get_capability(pci_chipset_tag_t, pcitag_t, int,
+   int *, pcireg_t *);
 intpci_get_capability(pci_chipset_tag_t, pcitag_t, int,
int *, pcireg_t *);
 intpci_get_ht_capability(pci_chipset_tag_t, pcitag_t, int,
-- 
2.26.2



Re: [PATCH] Add IOMMU support for Intel VT-d and AMD-Vi

2020-09-01 Thread Daniel Dickman
> [PATCH] Add IOMMU support for Intel VT-d and AMD Vi
>
> This hooks each pci device and overrides bus_dmamap_xxx to issue
> remap of DMA requests to virtual DMA space.  It protects devices
> from issuing I/O requests to memory in the system that is outside
> the requested DMA space.

Hi Jordan, thanks for working on this. I would like to see iommu 
support...

> + uint64_tefr;
> + uint8_t reserved[8];
> +} __packd;

...that being said, is the above a typo?



Re: [macppc] clang-10: issue with ppc dag to dag pattern instruction selection

2020-09-01 Thread George Koehler
Moving from bugs to tech.

cwen reported that base-clang crashed on macppc in graphics/babl and
emulators/mednafen [1].  I observed that clang crashed on powerpc64 in
mednafen.  I now propose to backport a commit in llvm 11.x git [2] to
prevent these crashes.  This change affects other arches.

[1] https://marc.info/?l=openbsd-bugs=159775378014683=2
[2] https://github.com/llvm/llvm-project/commit/d4ce862

FreeBSD on powerpc64 found the problems earlier:
https://bugs.llvm.org/show_bug.cgi?id=45237 (babl)
https://bugs.llvm.org/show_bug.cgi?id=45274 (mednafen)

The problem is with clang -fno-unsafe-math-optimizations, one of the
new floating-point options in clang-10.  The diff disables the new
options on all targets except X86 (and SystemZ), so it also affects
arches other than PowerPC.  The disabled options cause a warning.

With the diff in clang:
 - mednafen stops using -fno-unsafe-math-optimizations, because the
   new warning fails the configure test.  I can build and package
   mednafen on macppc (where a single *m68k*.cpp takes over 2 hours),
   but when I try to play Super Mario World (snes), mednafen crashes
   with a SIGSEGV.  The build on powerpc64 stops crashing clang, but
   fails at an altivec error.
 - babl uses -fno-unsafe-math-optimizations and ignores the new
   warning.  I can build and package babl, and all 27 tests pass on
   both macppc and powerpc64.
 - I have built src and xenocara on powerpc64.

The diff is in llvm 11.x.  To backport the diff, I manually applied
some parts (where the context had changed), and I renamed
RoundingMode::NearestTiesToEven back to FPR_RoundToNearest.

Apply the diff in /usr/src/gnu/llvm.  If DiagnosticFrontendKinds.inc
exists under /usr/obj/gnu/usr.bin/clang/, you must delete the .inc,
because there is a missing dependency in our Makefiles.

Is this OK?  Should we do something else?  --George

Index: clang/docs/ClangCommandLineReference.rst
===
RCS file: /cvs/src/gnu/llvm/clang/docs/ClangCommandLineReference.rst,v
retrieving revision 1.1.1.2
diff -u -p -r1.1.1.2 ClangCommandLineReference.rst
--- clang/docs/ClangCommandLineReference.rst9 Aug 2020 15:51:07 -   
1.1.1.2
+++ clang/docs/ClangCommandLineReference.rst30 Aug 2020 19:37:51 -
@@ -818,6 +818,10 @@ Enables the experimental global instruct
 
 Enables an experimental new pass manager in LLVM.
 
+.. option:: -fexperimental-strict-floating-point
+
+Enables the use of non-default rounding modes and non-default exception 
handling on targets that are not currently ready.
+
 .. option:: -ffine-grained-bitfield-accesses, 
-fno-fine-grained-bitfield-accesses
 
 Use separate accesses for consecutive bitfield runs with legal widths and 
alignments.
Index: clang/include/clang/Basic/CodeGenOptions.def
===
RCS file: /cvs/src/gnu/llvm/clang/include/clang/Basic/CodeGenOptions.def,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 CodeGenOptions.def
--- clang/include/clang/Basic/CodeGenOptions.def3 Aug 2020 14:31:32 
-   1.1.1.1
+++ clang/include/clang/Basic/CodeGenOptions.def30 Aug 2020 19:37:51 
-
@@ -58,6 +58,8 @@ CODEGENOPT(DisableLLVMPasses , 1, 0) ///
  ///< frontend.
 CODEGENOPT(DisableLifetimeMarkers, 1, 0) ///< Don't emit any lifetime markers
 CODEGENOPT(DisableO0ImplyOptNone , 1, 0) ///< Don't annonate function with 
optnone at O0
+CODEGENOPT(ExperimentalStrictFloatingPoint, 1, 0) ///< Enables the new, 
experimental
+  ///< strict floating point.
 CODEGENOPT(ExperimentalNewPassManager, 1, 0) ///< Enables the new, experimental
  ///< pass manager.
 CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new
Index: clang/include/clang/Basic/DiagnosticFrontendKinds.td
===
RCS file: 
/cvs/src/gnu/llvm/clang/include/clang/Basic/DiagnosticFrontendKinds.td,v
retrieving revision 1.1.1.1
diff -u -p -r1.1.1.1 DiagnosticFrontendKinds.td
--- clang/include/clang/Basic/DiagnosticFrontendKinds.td3 Aug 2020 
14:31:32 -   1.1.1.1
+++ clang/include/clang/Basic/DiagnosticFrontendKinds.td30 Aug 2020 
19:37:51 -
@@ -37,6 +37,12 @@ def note_fe_backend_plugin: Note<"%0">, 
 def warn_fe_override_module : Warning<
 "overriding the module target triple with %0">,
 InGroup>;
+def warn_fe_backend_unsupported_fp_rounding : Warning<
+"overriding currently unsupported rounding mode on this target">,
+InGroup;
+def warn_fe_backend_unsupported_fp_exceptions : Warning<
+"overriding currently unsupported use of floating point exceptions "
+"on this target">, InGroup;
 
 def remark_fe_backend_optimization_remark : Remark<"%0">, BackendInfo,
 InGroup;
Index: clang/include/clang/Basic/DiagnosticGroups.td

[PATCH] Add IOMMU support for Intel VT-d and AMD-Vi

2020-09-01 Thread Jordan Hargrave
[PATCH] Add IOMMU support for Intel VT-d and AMD Vi

This hooks each pci device and overrides bus_dmamap_xxx to issue
remap of DMA requests to virtual DMA space.  It protects devices
from issuing I/O requests to memory in the system that is outside
the requested DMA space.
---
 sys/arch/amd64/conf/GENERIC  |1 +
 sys/arch/amd64/conf/RAMDISK  |1 +
 sys/arch/amd64/conf/RAMDISK_CD   |1 +
 sys/arch/amd64/include/pci_machdep.h |3 +-
 sys/arch/amd64/pci/pci_machdep.c |   15 +-
 sys/dev/acpi/acpi.c  |5 +
 sys/dev/acpi/acpidmar.c  | 2988 ++
 sys/dev/acpi/acpidmar.h  |  534 +
 sys/dev/acpi/acpireg.h   |   21 +-
 sys/dev/acpi/amd_iommu.h |  358 +++
 sys/dev/acpi/files.acpi  |5 +
 sys/dev/pci/pci.c|   28 +
 sys/dev/pci/pcivar.h |2 +
 13 files changed, 3959 insertions(+), 3 deletions(-)
 create mode 100644 sys/dev/acpi/acpidmar.c
 create mode 100644 sys/dev/acpi/acpidmar.h
 create mode 100644 sys/dev/acpi/amd_iommu.h

diff --git a/sys/arch/amd64/conf/GENERIC b/sys/arch/amd64/conf/GENERIC
index 2c49f91a1..1eda12bc9 100644
--- a/sys/arch/amd64/conf/GENERIC
+++ b/sys/arch/amd64/conf/GENERIC
@@ -45,6 +45,7 @@ acpibtn*  at acpi?
 acpicpu*   at acpi?
 acpicmos*  at acpi?
 acpidock*  at acpi?
+acpidmar0  at acpi?
 acpiec*at acpi?
 acpipci*   at acpi?
 acpiprt*   at acpi?
diff --git a/sys/arch/amd64/conf/RAMDISK b/sys/arch/amd64/conf/RAMDISK
index 10148add1..7ab48f32e 100644
--- a/sys/arch/amd64/conf/RAMDISK
+++ b/sys/arch/amd64/conf/RAMDISK
@@ -34,6 +34,7 @@ acpipci*  at acpi?
 acpiprt*   at acpi?
 acpimadt0  at acpi?
 #acpitz*   at acpi?
+acpidmar*  at acpi? disable
 
 mpbios0at bios0
 
diff --git a/sys/arch/amd64/conf/RAMDISK_CD b/sys/arch/amd64/conf/RAMDISK_CD
index 91022751e..82a24e210 100644
--- a/sys/arch/amd64/conf/RAMDISK_CD
+++ b/sys/arch/amd64/conf/RAMDISK_CD
@@ -48,6 +48,7 @@ sdhc* at acpi?
 acpihve*   at acpi?
 chvgpio*at acpi?
 glkgpio*   at acpi?
+acpidmar*  at acpi? disable
 
 mpbios0at bios0
 
diff --git a/sys/arch/amd64/include/pci_machdep.h 
b/sys/arch/amd64/include/pci_machdep.h
index bc295cc22..c725bdc73 100644
--- a/sys/arch/amd64/include/pci_machdep.h
+++ b/sys/arch/amd64/include/pci_machdep.h
@@ -91,7 +91,8 @@ void  *pci_intr_establish_cpu(pci_chipset_tag_t, 
pci_intr_handle_t,
int, struct cpu_info *,
int (*)(void *), void *, const char *);
 void   pci_intr_disestablish(pci_chipset_tag_t, void *);
-#definepci_probe_device_hook(c, a) (0)
+intpci_probe_device_hook(pci_chipset_tag_t,
+   struct pci_attach_args *);
 
 void   pci_dev_postattach(struct device *, struct pci_attach_args *);
 
diff --git a/sys/arch/amd64/pci/pci_machdep.c b/sys/arch/amd64/pci/pci_machdep.c
index cf4e835de..b700946a4 100644
--- a/sys/arch/amd64/pci/pci_machdep.c
+++ b/sys/arch/amd64/pci/pci_machdep.c
@@ -89,6 +89,11 @@
 #include 
 #endif
 
+#include "acpi.h"
+#if NACPI > 0
+#include 
+#endif
+
 /*
  * Memory Mapped Configuration space access.
  *
@@ -797,7 +802,15 @@ pci_init_extents(void)
}
 }
 
-#include "acpi.h"
+int
+pci_probe_device_hook(pci_chipset_tag_t pc, struct pci_attach_args *pa)
+{
+#if NACPI > 0
+   acpidmar_pci_hook(pc, pa);
+#endif
+   return 0;
+}
+
 #if NACPI > 0
 void acpi_pci_match(struct device *, struct pci_attach_args *);
 pcireg_t acpi_pci_min_powerstate(pci_chipset_tag_t, pcitag_t);
diff --git a/sys/dev/acpi/acpi.c b/sys/dev/acpi/acpi.c
index a6239198e..ea11483ad 100644
--- a/sys/dev/acpi/acpi.c
+++ b/sys/dev/acpi/acpi.c
@@ -49,6 +49,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include 
@@ -2448,6 +2449,8 @@ acpi_sleep_pm(struct acpi_softc *sc, int state)
sc->sc_fadt->pm2_cnt_blk && sc->sc_fadt->pm2_cnt_len)
acpi_write_pmreg(sc, ACPIREG_PM2_CNT, 0, ACPI_PM2_ARB_DIS);
 
+   acpidmar_sw(DVACT_SUSPEND);
+
/* Write SLP_TYPx values */
rega = acpi_read_pmreg(sc, ACPIREG_PM1A_CNT, 0);
regb = acpi_read_pmreg(sc, ACPIREG_PM1B_CNT, 0);
@@ -2483,6 +2486,8 @@ acpi_resume_pm(struct acpi_softc *sc, int fromstate)
 {
uint16_t rega, regb, en;
 
+   acpidmar_sw(DVACT_RESUME);
+
/* Write SLP_TYPx values */
rega = acpi_read_pmreg(sc, ACPIREG_PM1A_CNT, 0);
regb = acpi_read_pmreg(sc, ACPIREG_PM1B_CNT, 0);
diff --git a/sys/dev/acpi/acpidmar.c b/sys/dev/acpi/acpidmar.c
new file mode 100644
index 0..48506e1b1
--- /dev/null
+++ b/sys/dev/acpi/acpidmar.c
@@ -0,0 +1,2988 @@
+/*
+ * Copyright (c) 2015 Jordan Hargrave 
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright 

Re: m_defrag(9) leak

2020-09-01 Thread Theo Buehler
On Mon, Aug 31, 2020 at 08:50:09AM +0200, Theo Buehler wrote:
> On Tue, Aug 25, 2020 at 03:28:03PM +0200, Claudio Jeker wrote:
> > On Tue, Aug 25, 2020 at 08:38:06PM +1000, Matt Dunwoodie wrote:
> > > On Tue, 25 Aug 2020 08:54:10 +0200
> > > Claudio Jeker  wrote:
> > > 
> > > > On Tue, Aug 25, 2020 at 08:42:47AM +0200, Martin Pieuchot wrote:
> > > > > Maxime Villard mentioned a leak due to a missing m_freem() in wg(4):
> > > > >   https://marc.info/?l=netbsd-tech-net=159827988018641=2
> > > > > 
> > > > > It seems to be that such leak is present in other uses of
> > > > > m_defrag() in the tree.  I won't take the time to go though all of
> > > > > them, but an audit would be welcome.  
> > > > 
> > > > Why does wg(4) needs m_defrag? and why is it used in this way? It
> > > > does not even check if m_defrag() is actually needed.
> > > >
> > > > m_defrag() should be used by HW drivers where DMA constraints require
> > > > the mbuf to be flattened. Software dirvers should use m_pullup /
> > > > m_pulldown etc instead.
> > > 
> > > wg(4) uses m_defrag to bring all mbuf fragments into one, so it can
> > > read handshake values, and decrypt data directly from the mbuf. In
> > > both cases it needs the full length of the packet.
> > 
> > For the handshake headers m_pullup() or m_pulldown() is normally the way
> > to go.  Doing a massive pullup to decrypt data is a poor design. You work
> > against the network stack here and this comes at a reasonably high price
> > (considering you need to allocate a new mbuf, mbuf cluster and copy all
> > data over).
> >  
> > > However you're right, it doesn't check if it needs defrag and yes
> > > m_pullup is preferable. I have a new patch for m_pullup and includes a
> > > comment on why it's done. Thoughts?
> > > 
> > > There is one other case of "m_pullup(m, m->m_pkthdr.len)" in sppp_input.
> > 
> > sppp(4) is not without its own flaws. That code was added because of
> > alignment issues. It may actually no longer be needed but I have not the
> > time right now to really think this through.
> > 
> > > (To avoid any confusion with the previous patch, there is no need to
> > > m_free(m) as m_pullup will do that for us.)
> > 
> > > diff --git if_wg.c if_wg.c
> > > index e5a1071ccf1..242bd105e72 100644
> > > --- if_wg.c
> > > +++ if_wg.c
> > > @@ -2022,7 +2022,13 @@ wg_input(void *_sc, struct mbuf *m, struct ip *ip, 
> > > struct ip6_hdr *ip6,
> > >   /* m has a IP/IPv6 header of hlen length, we don't need it anymore. */
> > >   m_adj(m, hlen);
> > >  
> > > - if (m_defrag(m, M_NOWAIT) != 0)
> > > + /*
> > > +  * Ensure mbuf is contiguous over full length of packet. This is done
> > > +  * so we can directly read the handshake values in wg_handshake, and so
> > > +  * we can decrypt a transport packet by passing a single buffer to
> > > +  * noise_remote_decrypt in wg_decap.
> > > +  */
> > > + if ((m = m_pullup(m, m->m_pkthdr.len)) == NULL)
> > >   return NULL;
> > >  
> > >   if ((m->m_pkthdr.len == sizeof(struct wg_pkt_initiation) &&
> > 
> > I'm fine with this or the m_defrag fix to go in now to fix the mbuf leak.
> 
> Could someone who understands this please commit either version of the
> diff?

I committed this diff now. Thanks



wg: count peers per interface not globally

2020-09-01 Thread Klemens Nanni
The driver increases a static peer counter across all wg interfaces when
creating peers, the peer number is only used in debug output, though.

Output from console around recreating an interface  (2 and 4 are the same):

wg1: Receiving handshake response from peer 2
wg1: Receiving keepalive packet from peer 2
wg1: Sending keepalive packet to peer 2
# ifconfig wg0 destroy
wg1: Peer 1 destroyed
wg1: Peer 2 destroyed
wg1: Destroyed interface
# sh /etc/netstart wg0
wg1: Sending handshake initiation to peer 4
wg1: Receiving handshake response from peer 4
wg1: Receiving keepalive packet from peer 4
wg1: Sending keepalive packet to peer 4

I'm looking into an issue caused by having multiple `wgpeer's defined
on a single wg(4) interface and debug log is helpful, but having to
count along is not helpful, so I'd like to replace the global counter
with an interface specific one that's already in place.

There's another completely unused global counter which I remove as well
in the diff below.

Recreating as above now shows this:

# ifconfig wg1 destroy
wg1: Peer 0 destroyed
wg1: Peer 1 destroyed
wg1: Destroyed interface
# sh /etc/netstart wg0
wg1: Peer 0 created
wg1: Sending handshake initiation to peer 0
wg1: Receiving handshake response from peer 0
wg1: Peer 1 created
wg1: Receiving keepalive packet from peer 0

Feedback? OK?


Index: if_wg.c
===
RCS file: /cvs/src/sys/net/if_wg.c,v
retrieving revision 1.13
diff -u -p -r1.13 if_wg.c
--- if_wg.c 27 Aug 2020 21:27:17 -  1.13
+++ if_wg.c 1 Sep 2020 18:35:41 -
@@ -370,8 +370,6 @@ int wg_clone_create(struct if_clone *, i
 intwg_clone_destroy(struct ifnet *);
 void   wgattach(int);
 
-uint64_t   peer_counter = 0;
-uint64_t   keypair_counter = 0;
 struct poolwg_aip_pool;
 struct poolwg_peer_pool;
 struct poolwg_ratelimit_pool;
@@ -398,7 +396,6 @@ wg_peer_create(struct wg_softc *sc, uint
if ((peer = pool_get(_peer_pool, PR_NOWAIT)) == NULL)
return NULL;
 
-   peer->p_id = peer_counter++;
peer->p_sc = sc;
 
noise_remote_init(>p_remote, public, >sc_local);
@@ -442,7 +439,7 @@ wg_peer_create(struct wg_softc *sc, uint
rw_enter_write(>sc_peer_lock);
LIST_INSERT_HEAD(>sc_peer[idx], peer, p_pubkey_entry);
TAILQ_INSERT_TAIL(>sc_peer_seq, peer, p_seq_entry);
-   sc->sc_peer_num++;
+   peer->p_id = sc->sc_peer_num++;
rw_exit_write(>sc_peer_lock);
 
DPRINTF(sc, "Peer %llu created\n", peer->p_id);



Re: amd64: calibrate lapic timer frequency in constant time

2020-09-01 Thread Scott Cheloha
On Tue, Sep 01, 2020 at 06:14:05PM +0200, Mark Kettenis wrote:
> > Date: Tue, 1 Sep 2020 11:05:26 -0500
> > From: Scott Cheloha 
> > 
> > Hi,
> > 
> > At boot, if we don't know the lapic frequency offhand we compute it by
> > waiting for a known clock (the i8254) with a known frequency to cycle
> > a few times.
> > 
> > Currently we cycle hz times.  This doesn't make sense.  There is
> > little to no benefit to waiting additional cycles if your kernel is
> > compiled with a larger HZ.  Mostly it just makes the calibration take
> > longer.
> > 
> > Consider the common HZ=1000 case.  What is the benefit of looping an
> > additional 900 times?  The point of diminishing returns is well under
> > 1000 loops.
> > 
> > 20-50 loops is probably sufficient to limit our error, but I don't
> > want to break anything so let's use 100, like we do on default
> > kernels.
> > 
> > ok?
> 
> Sorry, but this makes no sense to me.  The current code waits for 1
> second regarless of what HZ is.
> 
> And I expect that the accuracy of the measurement depends on the total
> number elapsed time, so I expect a less acurate results if you only
> wait 100 cycles at HZ=1000 (which is 0.1 second).

Whoops.

Yes, you're right, I'm having a slow moment.  Nevermind that last
patch.

But this gives me an idea.

Given that we're waiting the same amount of time (1 second) no matter
the value of hz, and we're using the same clock to do the wait... why
are we fussing with the inner workings of reading the i8254?  We have
a function for this: i8254_delay().

If we just use that and tell it to wait a million microseconds we can
throw out a bunch of code.

Index: lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.55
diff -u -p -r1.55 lapic.c
--- lapic.c 3 Aug 2019 14:57:51 -   1.55
+++ lapic.c 1 Sep 2020 16:40:39 -
@@ -451,24 +451,6 @@ lapic_initclocks(void)
i8254_inittimecounter_simple();
 }
 
-
-extern int gettick(void);  /* XXX put in header file */
-extern u_long rtclock_tval; /* XXX put in header file */
-
-static __inline void
-wait_next_cycle(void)
-{
-   unsigned int tick, tlast;
-
-   tlast = (1 << 16);  /* i8254 counter has 16 bits at most */
-   for (;;) {
-   tick = gettick();
-   if (tick > tlast)
-   return;
-   tlast = tick;
-   }
-}
-
 /*
  * Calibrate the local apic count-down timer (which is running at
  * bus-clock speed) vs. the i8254 counter/timer (which is running at
@@ -484,7 +466,7 @@ void
 lapic_calibrate_timer(struct cpu_info *ci)
 {
unsigned int startapic, endapic;
-   u_int64_t dtick, dapic, tmp;
+   u_int64_t tmp;
u_long s;
int i;
 
@@ -504,29 +486,13 @@ lapic_calibrate_timer(struct cpu_info *c
 
s = intr_disable();
 
-   /* wait for current cycle to finish */
-   wait_next_cycle();
-
startapic = lapic_gettick();
-
-   /* wait the next hz cycles */
-   for (i = 0; i < hz; i++)
-   wait_next_cycle();
-
+   i8254_delay(100);
endapic = lapic_gettick();
 
intr_restore(s);
 
-   dtick = hz * rtclock_tval;
-   dapic = startapic-endapic;
-
-   /*
-* there are TIMER_FREQ ticks per second.
-* in dtick ticks, there are dapic bus clocks.
-*/
-   tmp = (TIMER_FREQ * dapic) / dtick;
-
-   lapic_per_second = tmp;
+   lapic_per_second = startapic - endapic;
 
 skip_calibration:
printf("%s: apic clock running at %dMHz\n",



Re: amd64: calibrate lapic timer frequency in constant time

2020-09-01 Thread Mark Kettenis
> Date: Tue, 1 Sep 2020 11:05:26 -0500
> From: Scott Cheloha 
> 
> Hi,
> 
> At boot, if we don't know the lapic frequency offhand we compute it by
> waiting for a known clock (the i8254) with a known frequency to cycle
> a few times.
> 
> Currently we cycle hz times.  This doesn't make sense.  There is
> little to no benefit to waiting additional cycles if your kernel is
> compiled with a larger HZ.  Mostly it just makes the calibration take
> longer.
> 
> Consider the common HZ=1000 case.  What is the benefit of looping an
> additional 900 times?  The point of diminishing returns is well under
> 1000 loops.
> 
> 20-50 loops is probably sufficient to limit our error, but I don't
> want to break anything so let's use 100, like we do on default
> kernels.
> 
> ok?

Sorry, but this makes no sense to me.  The current code waits for 1
second regarless of what HZ is.

And I expect that the accuracy of the measurement depends on the total
number elapsed time, so I expect a less acurate results if you only
wait 100 cycles at HZ=1000 (which is 0.1 second).

> Index: lapic.c
> ===
> RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
> retrieving revision 1.55
> diff -u -p -r1.55 lapic.c
> --- lapic.c   3 Aug 2019 14:57:51 -   1.55
> +++ lapic.c   1 Sep 2020 15:58:41 -
> @@ -509,15 +509,15 @@ lapic_calibrate_timer(struct cpu_info *c
>  
>   startapic = lapic_gettick();
>  
> - /* wait the next hz cycles */
> - for (i = 0; i < hz; i++)
> + /* wait a few cycles */
> + for (i = 0; i < 100; i++)
>   wait_next_cycle();
>  
>   endapic = lapic_gettick();
>  
>   intr_restore(s);
>  
> - dtick = hz * rtclock_tval;
> + dtick = 100 * rtclock_tval;
>   dapic = startapic-endapic;
>  
>   /*
> 
> 



amd64: calibrate lapic timer frequency in constant time

2020-09-01 Thread Scott Cheloha
Hi,

At boot, if we don't know the lapic frequency offhand we compute it by
waiting for a known clock (the i8254) with a known frequency to cycle
a few times.

Currently we cycle hz times.  This doesn't make sense.  There is
little to no benefit to waiting additional cycles if your kernel is
compiled with a larger HZ.  Mostly it just makes the calibration take
longer.

Consider the common HZ=1000 case.  What is the benefit of looping an
additional 900 times?  The point of diminishing returns is well under
1000 loops.

20-50 loops is probably sufficient to limit our error, but I don't
want to break anything so let's use 100, like we do on default
kernels.

ok?

Index: lapic.c
===
RCS file: /cvs/src/sys/arch/amd64/amd64/lapic.c,v
retrieving revision 1.55
diff -u -p -r1.55 lapic.c
--- lapic.c 3 Aug 2019 14:57:51 -   1.55
+++ lapic.c 1 Sep 2020 15:58:41 -
@@ -509,15 +509,15 @@ lapic_calibrate_timer(struct cpu_info *c
 
startapic = lapic_gettick();
 
-   /* wait the next hz cycles */
-   for (i = 0; i < hz; i++)
+   /* wait a few cycles */
+   for (i = 0; i < 100; i++)
wait_next_cycle();
 
endapic = lapic_gettick();
 
intr_restore(s);
 
-   dtick = hz * rtclock_tval;
+   dtick = 100 * rtclock_tval;
dapic = startapic-endapic;
 
/*



Re: getitimer(2), setitimer(2): merge critical sections

2020-09-01 Thread Scott Cheloha
On Mon, Aug 17, 2020 at 05:55:34PM -0500, Scott Cheloha wrote:
> 
> [...]

Two week bump.

In summary:

- Merge the critical sections so that "timer swap" with setitimer(2)
  is atomic.

- To do this, move error-free operations into a common kernel
  subroutine, setitimer().  Now we have one critical section.

- Leave error-prone operations in sys_getitimer() and sys_setitimer().

In order to make the "timer swap" atomic we leave the timer installed
if the copyout(9) fails.  This isn't great, but it is unavoidable
without permitting copyout(9) from within a mutex.  FreeBSD and Linux
went this direction, too.  I would rather leave the timer running and
have an atomic swap than race the hardclock(9) or the realitexpire()
timeout.

ok?

Index: kern_time.c
===
RCS file: /cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.140
diff -u -p -r1.140 kern_time.c
--- kern_time.c 12 Aug 2020 15:31:27 -  1.140
+++ kern_time.c 1 Sep 2020 15:19:07 -
@@ -491,7 +491,7 @@ out:
 struct mutex itimer_mtx = MUTEX_INITIALIZER(IPL_CLOCK);
 
 /*
- * Get value of an interval timer.  The process virtual and
+ * Get and/or set value of an interval timer.  The process virtual and
  * profiling virtual time timers are kept internally in the
  * way they are specified externally: in time until they expire.
  *
@@ -509,6 +509,63 @@ struct mutex itimer_mtx = MUTEX_INITIALI
  * real time timers .it_interval.  Rather, we compute the next time in
  * absolute time the timer should go off.
  */
+void
+setitimer(int which, struct itimerval *itv, struct itimerval *olditv)
+{
+   struct itimerspec its, oldits;
+   struct itimerspec *itimer;
+   struct process *pr;
+   int timo;
+
+   KASSERT(which >= ITIMER_REAL && which <= ITIMER_PROF);
+
+   pr = curproc->p_p;
+   itimer = >ps_timer[which];
+
+   if (itv != NULL) {
+   TIMEVAL_TO_TIMESPEC(>it_value, _value);
+   TIMEVAL_TO_TIMESPEC(>it_interval, _interval);
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_enter(_mtx);
+
+   if (olditv != NULL)
+   oldits = *itimer;
+   if (itv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec cts;
+   getnanouptime();
+   if (timespecisset(_value)) {
+   timo = tstohz(_value);
+   timeout_add(>ps_realit_to, timo);
+   timespecadd(_value, , _value);
+   } else
+   timeout_del(>ps_realit_to);
+   }
+   *itimer = its;
+   }
+
+   if (which != ITIMER_REAL)
+   mtx_leave(_mtx);
+
+   if (olditv != NULL) {
+   if (which == ITIMER_REAL) {
+   struct timespec now;
+   getnanouptime();
+   if (timespecisset(_value)) {
+   if (timespeccmp(_value, , <))
+   timespecclear(_value);
+   else
+   timespecsub(_value, ,
+   _value);
+   }
+   }
+   TIMESPEC_TO_TIMEVAL(>it_value, _value);
+   TIMESPEC_TO_TIMEVAL(>it_interval, _interval);
+   }
+}
+
 int
 sys_getitimer(struct proc *p, void *v, register_t *retval)
 {
@@ -516,44 +573,16 @@ sys_getitimer(struct proc *p, void *v, r
syscallarg(int) which;
syscallarg(struct itimerval *) itv;
} */ *uap = v;
-   struct itimerspec its;
struct itimerval aitv;
-   struct itimerspec *itimer;
int which;
 
which = SCARG(uap, which);
 
if (which < ITIMER_REAL || which > ITIMER_PROF)
return (EINVAL);
-   itimer = >p_p->ps_timer[which];
memset(, 0, sizeof(aitv));
 
-   if (which != ITIMER_REAL)
-   mtx_enter(_mtx);
-   its = *itimer;
-   if (which != ITIMER_REAL)
-   mtx_leave(_mtx);
-
-   if (which == ITIMER_REAL) {
-   struct timespec now;
-
-   getnanouptime();
-   /*
-* Convert from absolute to relative time in .it_value
-* part of real time timer.  If time for real time timer
-* has passed return 0, else return difference between
-* current time and time for the timer to go off.
-*/
-   if (timespecisset(_value)) {
-   if (timespeccmp(_value, , <))
-   timespecclear(_value);
-   else
-   timespecsub(_value, ,
-   _value);
-   }
-   }
-   TIMESPEC_TO_TIMEVAL(_value, _value);
-   

Re: shrinking and growing reallocs: a theoretical? bad case for performance

2020-09-01 Thread Stuart Henderson
On 2020/08/31 08:39, Otto Moerbeek wrote:
> A question from Theo made me think about realloc and come up with a
> particular bad case for performance. I do not know if it happens in
> practice, but it was easy to create a test program to hit the case.

Not very scientific testing (a single attempt at building one port), but
this seems to help quite a lot when compiling programs written in rust.
I encourage others to test the diff :-)